├── Gemfile
├── .travis.yml
├── lib
    ├── elasticrawl
    │   ├── version.rb
    │   ├── error.rb
    │   ├── crawl_segment.rb
    │   ├── job_step.rb
    │   ├── job.rb
    │   ├── combine_job.rb
    │   ├── parse_job.rb
    │   ├── crawl.rb
    │   ├── cluster.rb
    │   └── config.rb
    └── elasticrawl.rb
├── db
    └── migrate
    │   ├── 201412311554_add_file_count_to_crawl_segments.rb
    │   ├── 201401051536_create_crawls.rb
    │   ├── 201401141606_create_job_steps.rb
    │   ├── 201401101723_create_jobs.rb
    │   └── 201401051855_create_crawl_segments.rb
├── spec
    ├── fixtures
    │   ├── aws.yml
    │   ├── warc.paths
    │   ├── jobs.yml
    │   └── cluster.yml
    ├── unit
    │   ├── job_spec.rb
    │   ├── config_spec.rb
    │   ├── crawl_segment_spec.rb
    │   ├── cluster_spec.rb
    │   ├── job_step_spec.rb
    │   ├── combine_job_spec.rb
    │   ├── crawl_spec.rb
    │   └── parse_job_spec.rb
    └── spec_helper.rb
├── Rakefile
├── Cheffile
├── .gitignore
├── templates
    ├── aws.yml
    ├── jobs.yml
    └── cluster.yml
├── CHANGELOG.md
├── Cheffile.lock
├── LICENSE
├── elasticrawl.gemspec
├── Vagrantfile
├── bin
    └── elasticrawl
└── README.md


/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | 
3 | gemspec
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 |   - 2.2.6
4 |   - 2.3.3
5 |   - 2.4.0
6 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/version.rb:
--------------------------------------------------------------------------------
1 | module Elasticrawl
2 |   VERSION = '1.1.8'
3 | end
4 | 


--------------------------------------------------------------------------------
/db/migrate/201412311554_add_file_count_to_crawl_segments.rb:
--------------------------------------------------------------------------------
1 | class AddFileCountToCrawlSegments < ActiveRecord::Migration
2 |   def change
3 |     add_column(:crawl_segments, :file_count, :integer)
4 |   end
5 | end
6 | 


--------------------------------------------------------------------------------
/spec/fixtures/aws.yml:
--------------------------------------------------------------------------------
1 | # Configures the AWS credentials used when accessing the AWS EMR and S3 APIs.
2 | # This file is populated by the elasticrawl init command.
3 | access_key_id: 'ACCESS_KEY_ID'
4 | secret_access_key: 'SECRET_ACCESS_KEY'
5 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'bundler/gem_tasks'
 2 | require 'rspec/core/rake_task'
 3 | 
 4 | namespace :spec do
 5 |   RSpec::Core::RakeTask.new(:unit) do |t|
 6 |     t.pattern = 'spec/unit/*_spec.rb'
 7 |   end
 8 | end
 9 | 
10 | desc 'Run unit specs'
11 | task :default => 'spec:unit'
12 | 


--------------------------------------------------------------------------------
/db/migrate/201401051536_create_crawls.rb:
--------------------------------------------------------------------------------
 1 | class CreateCrawls < ActiveRecord::Migration
 2 |   def change
 3 |     create_table :crawls do |t|
 4 |       t.string :crawl_name
 5 |       t.timestamps(:null => false)
 6 |     end
 7 | 
 8 |     add_index(:crawls, :crawl_name, :unique => true)
 9 |   end
10 | end
11 | 


--------------------------------------------------------------------------------
/Cheffile:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | #^syntax detection
 3 | 
 4 | site "https://supermarket.getchef.com/api/v1"
 5 | 
 6 | cookbook "apt", "2.9.2"
 7 | cookbook "build-essential", "2.2.4"
 8 | cookbook "git", "4.3.5"
 9 | cookbook "ruby_rbenv", "1.0.1"
10 | cookbook "ruby_build", "0.8.0"
11 | cookbook "vim", "2.0.0"
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.gem
 2 | *.rbc
 3 | .bundle
 4 | .config
 5 | .yardoc
 6 | Gemfile.lock
 7 | InstalledFiles
 8 | _yardoc
 9 | coverage
10 | doc/
11 | lib/bundler/man
12 | pkg
13 | rdoc
14 | spec/reports
15 | test/tmp
16 | test/version_tmp
17 | tmp
18 | 
19 | .vagrant
20 | cookbooks
21 | spec/fixtures/elasticrawl.sqlite3
22 | 


--------------------------------------------------------------------------------
/db/migrate/201401141606_create_job_steps.rb:
--------------------------------------------------------------------------------
 1 | class CreateJobSteps < ActiveRecord::Migration
 2 |   def change
 3 |     create_table :job_steps do |t|
 4 |       t.references :job
 5 |       t.references :crawl_segment
 6 |       t.text :input_paths
 7 |       t.text :output_path
 8 |       t.timestamps(:null => false)
 9 |     end
10 |   end
11 | end
12 | 


--------------------------------------------------------------------------------
/templates/aws.yml:
--------------------------------------------------------------------------------
1 | # Configures the AWS access credentials used when calling the AWS 
2 | # Elastic MapReduce and S3 APIs.  This file is populated by the init command.
3 | # 
4 | # Instead of configuring this file you can set the environment variables
5 | # AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
6 | access_key_id: 'ACCESS_KEY_ID'
7 | secret_access_key: 'SECRET_ACCESS_KEY'
8 | 


--------------------------------------------------------------------------------
/db/migrate/201401101723_create_jobs.rb:
--------------------------------------------------------------------------------
 1 | class CreateJobs < ActiveRecord::Migration
 2 |   def change
 3 |     create_table :jobs do |t|
 4 |       t.string :type
 5 |       t.string :job_name
 6 |       t.string :job_desc
 7 |       t.integer :max_files
 8 |       t.string :job_flow_id
 9 |       t.timestamps(:null => false)
10 |     end
11 | 
12 |     add_index(:jobs, :job_name, :unique => true)
13 |   end
14 | end
15 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | ## v1.1.6 / 2016-06-26
 2 | * Change CommonCrawl bucket to s3://commoncrawl
 3 | 
 4 | ## v1.1.3 / 2015-02-04
 5 | * Upgrade Traveling Ruby to 20150204-2.1.5
 6 | 
 7 | ## v1.1.2 / 2015-01-27
 8 | * Improve error handling for S3 API calls
 9 | 
10 | ## v1.1.1 / 2015-01-27
11 | * Use Traveling Ruby to build deploy packages
12 | 
13 | ## v1.1.0 / 2015-01-03
14 | * Show file counts for each segment
15 | 
16 | ## v1.0.0 / 2014-02-04
17 | * Initial release
18 | 


--------------------------------------------------------------------------------
/spec/unit/job_spec.rb:
--------------------------------------------------------------------------------
 1 | require 'spec_helper'
 2 | 
 3 | describe Elasticrawl::Job, type: :model do
 4 |   it { should have_many(:job_steps) }
 5 |   it { should have_db_column(:type).of_type(:string) }
 6 |   it { should have_db_column(:job_name).of_type(:string) }
 7 |   it { should have_db_column(:job_desc).of_type(:string) }
 8 |   it { should have_db_column(:max_files).of_type(:integer) }
 9 |   it { should have_db_column(:job_flow_id).of_type(:string) }
10 | end
11 | 


--------------------------------------------------------------------------------
/db/migrate/201401051855_create_crawl_segments.rb:
--------------------------------------------------------------------------------
 1 | class CreateCrawlSegments < ActiveRecord::Migration
 2 |   def change
 3 |     create_table :crawl_segments do |t|
 4 |       t.references :crawl
 5 |       t.string :segment_name
 6 |       t.string :segment_s3_uri
 7 |       t.datetime :parse_time
 8 |       t.timestamps(:null => false)
 9 |     end
10 | 
11 |     add_index(:crawl_segments, :segment_name, :unique => true)
12 |     add_index(:crawl_segments, :segment_s3_uri, :unique => true)
13 |   end
14 | end
15 | 


--------------------------------------------------------------------------------
/spec/unit/config_spec.rb:
--------------------------------------------------------------------------------
 1 | require 'spec_helper'
 2 | 
 3 | describe Elasticrawl::Config do
 4 |   describe '#load_config' do
 5 |     subject { Elasticrawl::Config.new }
 6 | 
 7 |     it 'should return a hash of config data' do
 8 |       config_data = subject.load_config('jobs')
 9 |       expect(config_data).to be_a Hash
10 |     end
11 | 
12 |     it 'should load yaml config file' do
13 |       config_data = subject.load_config('jobs')
14 |       expect(config_data['s3_bucket_name']).to eq 'elasticrawl'
15 |     end
16 |   end
17 | end
18 | 


--------------------------------------------------------------------------------
/Cheffile.lock:
--------------------------------------------------------------------------------
 1 | SITE
 2 |   remote: https://supermarket.getchef.com/api/v1
 3 |   specs:
 4 |     apt (2.9.2)
 5 |     build-essential (2.2.4)
 6 |     chef_handler (1.2.0)
 7 |     dmg (2.3.0)
 8 |     git (4.3.5)
 9 |       build-essential (>= 0.0.0)
10 |       dmg (>= 0.0.0)
11 |       windows (>= 0.0.0)
12 |       yum-epel (>= 0.0.0)
13 |     ruby_build (0.8.0)
14 |     ruby_rbenv (1.0.1)
15 |       ruby_build (>= 0.0.0)
16 |     vim (2.0.0)
17 |     windows (1.39.0)
18 |       chef_handler (>= 0.0.0)
19 |     yum (3.8.2)
20 |     yum-epel (0.6.5)
21 |       yum (~> 3.2)
22 | 
23 | DEPENDENCIES
24 |   apt (= 2.9.2)
25 |   build-essential (= 2.2.4)
26 |   git (= 4.3.5)
27 |   ruby_build (= 0.8.0)
28 |   ruby_rbenv (= 1.0.1)
29 |   vim (= 2.0.0)
30 | 
31 | 


--------------------------------------------------------------------------------
/spec/fixtures/warc.paths:
--------------------------------------------------------------------------------
1 | crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00001-ip-10-235-23-156.ec2.internal.warc.gz
2 | crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
3 | crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00003-ip-10-235-23-156.ec2.internal.warc.gz
4 | crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
5 | crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
6 | crawl-data/CC-MAIN-2014-49/segments/1416400372542.20/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
7 | 


--------------------------------------------------------------------------------
/lib/elasticrawl.rb:
--------------------------------------------------------------------------------
 1 | require 'aws-sdk'
 2 | require 'active_record'
 3 | require 'active_support'
 4 | require 'elasticity'
 5 | require 'highline/import'
 6 | require 'thor'
 7 | 
 8 | module Elasticrawl
 9 |   # S3 locations
10 |   COMMON_CRAWL_BUCKET = 'commoncrawl'
11 |   COMMON_CRAWL_PATH = 'crawl-data'
12 |   SEGMENTS_PATH = 'segments'
13 |   WARC_PATHS = 'warc.paths.gz'
14 |   MAX_SEGMENTS = 256
15 | 
16 |   require 'elasticrawl/version'
17 | 
18 |   require 'elasticrawl/config'
19 |   require 'elasticrawl/error'
20 | 
21 |   require 'elasticrawl/cluster'
22 |   require 'elasticrawl/crawl'
23 |   require 'elasticrawl/crawl_segment'
24 |   require 'elasticrawl/job'
25 |   require 'elasticrawl/combine_job'
26 |   require 'elasticrawl/parse_job'
27 |   require 'elasticrawl/job_step'
28 | end
29 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/error.rb:
--------------------------------------------------------------------------------
 1 | module Elasticrawl
 2 |   # Base error class extends standard error.
 3 |   class Error < StandardError
 4 |     attr_reader :http_response
 5 | 
 6 |     def initialize(response = nil)
 7 |       @http_response = response
 8 |     end
 9 |   end
10 | 
11 |   # AWS access credentials are invalid.
12 |   class AWSCredentialsInvalidError < Error; end
13 | 
14 |   # Config directory does not exist.
15 |   class ConfigDirMissingError < Error; end
16 | 
17 |   # Database error accessing sqlite database.
18 |   class DatabaseAccessError < Error; end
19 | 
20 |   # Error accessing AWS Elastic MapReduce API.
21 |   class ElasticMapReduceAccessError < Error; end
22 | 
23 |   # Error accessing config directory.
24 |   class FileAccessError < Error; end
25 | 
26 |   # Error accessing AWS S3 API.
27 |   class S3AccessError < Error; end
28 | end
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014 Ross Fairbanks
 2 | 
 3 | MIT License
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
 1 | require 'elasticrawl'
 2 | require 'rspec'
 3 | require 'database_cleaner'
 4 | require 'shoulda-matchers'
 5 | 
 6 | RSpec.configure do |config|
 7 |   # Run each test in a transaction and rollback data on completion.
 8 |   DatabaseCleaner.strategy = :transaction
 9 | 
10 |   # Use Shoulda matchers for schema tests.
11 |   config.include(Shoulda::Matchers::ActiveRecord, type: :model)
12 | 
13 |   config.before(:each) do
14 |     # Stub S3 call to get WARC file paths
15 |     warc_paths = IO.read(File.join(File.dirname(__FILE__), 'fixtures', 'warc.paths'))
16 |     allow_any_instance_of(Elasticrawl::Crawl).to receive(:warc_paths).and_return(warc_paths)
17 | 
18 |     # Load config from spec/fixtures/ rather than ~/.elasticrawl/
19 |     config_dir = File.join(File.dirname(__FILE__), 'fixtures')
20 |     allow_any_instance_of(Elasticrawl::Config).to receive(:config_dir).and_return(config_dir)
21 | 
22 |     # Load sqlite database. For testing this is stored at db/elasticrawl.sqlite3
23 |     config = Elasticrawl::Config.new
24 |     config.load_database
25 | 
26 |     DatabaseCleaner.start
27 |   end
28 | 
29 |   config.after(:each) do
30 |     DatabaseCleaner.clean
31 |   end
32 | end
33 | 


--------------------------------------------------------------------------------
/spec/unit/crawl_segment_spec.rb:
--------------------------------------------------------------------------------
 1 | require 'spec_helper'
 2 | 
 3 | describe Elasticrawl::CrawlSegment, type: :model do
 4 |   it { should belong_to(:crawl) }
 5 |   it { should have_many(:job_steps) }
 6 |   it { should have_db_column(:segment_name).of_type(:string) }
 7 |   it { should have_db_column(:segment_s3_uri).of_type(:string) }
 8 |   it { should have_db_column(:parse_time).of_type(:datetime) }
 9 |   it { should have_db_column(:file_count).of_type(:integer) }
10 | 
11 |   describe '.create_segment' do
12 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
13 |     let(:segment_name) { '1416400372202.67' }
14 |     let(:file_count) { 3 }
15 |     let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
16 |     subject { Elasticrawl::CrawlSegment.create_segment(crawl,
17 |                                                        segment_name,
18 |                                                        file_count) }
19 |     it 'should have a segment name' do
20 |       expect(subject.segment_name).to eq segment_name
21 |     end
22 | 
23 |     it 'should have an s3 uri' do
24 |       expect(subject.segment_s3_uri).to eq \
25 |         "s3://commoncrawl/crawl-data/#{crawl.crawl_name}/segments/#{segment_name}/"
26 |     end
27 | 
28 |     it 'should have a file count' do
29 |       expect(subject.file_count).to eq file_count
30 |     end
31 | 
32 |     it 'should have a segment description' do
33 |       expect(subject.segment_desc).to eq segment_desc
34 |     end
35 |   end
36 | end
37 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/crawl_segment.rb:
--------------------------------------------------------------------------------
 1 | module Elasticrawl
 2 |   # Represents a segment of a web crawl released by the Common Crawl Foundation.
 3 |   # Each segment contains archive, metadata and text files.
 4 |   class CrawlSegment < ActiveRecord::Base
 5 |     belongs_to :crawl
 6 |     has_many :job_steps
 7 | 
 8 |     # Description shows name and number of files in the segment.
 9 |     def segment_desc
10 |       "Segment: #{segment_name} Files: #{file_count}"
11 |     end
12 | 
13 |     # Creates a crawl segment based on its S3 path if it does not exist.
14 |     def self.create_segment(crawl, segment_name, file_count)
15 |       s3_uri = build_s3_uri(crawl.crawl_name, segment_name)
16 | 
17 |       segment = CrawlSegment.where(:crawl_id => crawl.id,
18 |                                   :segment_name => segment_name,
19 |                                   :segment_s3_uri => s3_uri,
20 |                                   :file_count => file_count).first_or_create
21 |     end
22 | 
23 | private
24 |     # Generates the S3 location where this segment is stored.
25 |     def self.build_s3_uri(crawl_name, segment_name)
26 |       s3_path = ['',
27 |                  Elasticrawl::COMMON_CRAWL_PATH,
28 |                  crawl_name,
29 |                  Elasticrawl::SEGMENTS_PATH,
30 |                  segment_name,
31 |                  '']
32 | 
33 |       URI::Generic.build(:scheme => 's3',
34 |                          :host => Elasticrawl::COMMON_CRAWL_BUCKET,
35 |                          :path => s3_path.join('/')).to_s
36 |     end
37 |   end
38 | end
39 | 


--------------------------------------------------------------------------------
/spec/fixtures/jobs.yml:
--------------------------------------------------------------------------------
 1 | # Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
 2 | # corpus. 
 3 | 
 4 | # An S3 bucket is created by the init command and is used to store data and logs.
 5 | s3_bucket_name: 'elasticrawl'
 6 | 
 7 | # A parse step is created per Common Crawl segment.  A combine step takes the
 8 | # results from multiple segments to create a single set of output files.
 9 | 
10 | # The parse input filter is used to specify the Common Crawl file type.
11 | 
12 | # WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
13 | # WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
14 | # WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
15 | 
16 | # The EMR config is an XML file that sets Hadoop properties.  If a config file
17 | # is specified then a bootstrap action is run on each node to apply it.
18 | steps:
19 |   # Parse step for the Example Elasticrawl JAR.  This does a word count
20 |   # against the text extractions of the corpus.
21 |   parse:
22 |     jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
23 |     class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
24 |     input_filter: 'wet/*.warc.wet.gz'
25 |     emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
26 |   # Combine step for the Example Elasticrawl JAR.
27 |   combine:
28 |     jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
29 |     class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
30 |     input_filter: 'part-*'
31 |     emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
32 | 


--------------------------------------------------------------------------------
/templates/jobs.yml:
--------------------------------------------------------------------------------
 1 | # Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
 2 | # corpus. 
 3 | 
 4 | # An S3 bucket is created by the init command and is used to store data and logs.
 5 | s3_bucket_name: 'BUCKET_NAME'
 6 | 
 7 | # A parse step is created per Common Crawl segment.  A combine step takes the
 8 | # results from multiple segments to create a single set of output files.
 9 | 
10 | # The parse input filter is used to specify the Common Crawl file type.
11 | 
12 | # WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
13 | # WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
14 | # WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
15 | 
16 | # The EMR config is an XML file that sets Hadoop properties.  If a config file
17 | # is specified then a bootstrap action is run on each node to apply it.
18 | steps:
19 |   # Parse step for the Example Elasticrawl JAR.  This does a word count
20 |   # against the text extractions of the corpus.
21 |   parse:
22 |     jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
23 |     class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
24 |     input_filter: 'wet/*.warc.wet.gz'
25 |     emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
26 |   # Combine step for the Example Elasticrawl JAR.
27 |   combine:
28 |     jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
29 |     class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
30 |     input_filter: 'part-*'
31 |     emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
32 | 


--------------------------------------------------------------------------------
/spec/unit/cluster_spec.rb:
--------------------------------------------------------------------------------
 1 | require 'spec_helper'
 2 | 
 3 | describe Elasticrawl::Cluster do
 4 |   describe '#create_job_flow' do
 5 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
 6 |     let(:job) { Elasticrawl::ParseJob.new }
 7 |     let(:cluster) { Elasticrawl::Cluster.new }
 8 |     subject { cluster.create_job_flow(job) }
 9 | 
10 |     before do
11 |       job.set_segments(crawl.crawl_segments)
12 |     end
13 | 
14 |     it 'should be an Elasticity::JobFlow' do
15 |       expect(subject).to be_a Elasticity::JobFlow
16 |     end
17 | 
18 |     it 'should have a job flow name' do
19 |       expect(subject.name).to eq "Job: #{job.job_name} #{job.job_desc}"
20 |     end
21 | 
22 |     it 'should have a log uri' do
23 |       expect(subject.log_uri).to eq job.log_uri
24 |     end
25 | 
26 |     it 'should have an ec2 key name' do
27 |       expect(subject.ec2_key_name).to eq 'elasticrawl'
28 |     end
29 | 
30 |     it 'should have a placement az name' do
31 |       expect(subject.placement).to eq 'us-east-1c'
32 |     end
33 | 
34 |     it 'should have an ami version' do
35 |       expect(subject.ami_version).to eq 'latest'
36 |     end
37 |   end
38 | 
39 |   describe '#cluster_desc' do
40 |     let(:cluster_desc) {
41 |       cluster_desc = <<-HERE
42 | Cluster configuration
43 | Master: 1 m1.medium  (Spot: 0.12)
44 | Core:   2 m1.medium  (Spot: 0.12)
45 | Task:   --
46 |       HERE
47 |     }
48 |     subject { Elasticrawl::Cluster.new } 
49 | 
50 |     it 'should describe configured instance groups' do
51 |       expect(subject.cluster_desc).to eq cluster_desc
52 |     end
53 |   end
54 | end
55 | 


--------------------------------------------------------------------------------
/elasticrawl.gemspec:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | lib = File.expand_path('../lib', __FILE__)
 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 4 | require 'elasticrawl/version'
 5 | 
 6 | Gem::Specification.new do |spec|
 7 |   spec.name          = 'elasticrawl'
 8 |   spec.version       = Elasticrawl::VERSION
 9 |   spec.authors       = ['Ross Fairbanks']
10 |   spec.email         = ['ross@rossfairbanks.com']
11 |   spec.summary       = %q{Launch AWS Elastic MapReduce jobs that process Common Crawl data.}
12 |   spec.description   = %q{Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process Common Crawl data.}
13 |   spec.homepage      = 'https://github.com/rossf7/elasticrawl'
14 |   spec.license       = 'MIT'
15 | 
16 |   spec.files         = `git ls-files`.split($/)
17 |   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18 |   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
19 |   spec.require_paths = ['lib']
20 | 
21 |   spec.add_dependency 'activerecord', '~> 4.2.5'
22 |   spec.add_dependency 'activesupport', '~> 4.2.5'
23 |   spec.add_dependency 'aws-sdk', '~> 1.66.0'
24 |   spec.add_dependency 'elasticity', '~> 6.0.5'
25 |   spec.add_dependency 'highline', '~> 1.7.8'
26 |   spec.add_dependency 'sqlite3', '~> 1.3.11'
27 |   spec.add_dependency 'thor', '~> 0.19.1'
28 | 
29 |   spec.add_development_dependency 'rake', '~> 10.4.2'
30 |   spec.add_development_dependency 'bundler', '~> 1.14.4'
31 |   spec.add_development_dependency 'rspec', '~> 3.4.0'
32 |   spec.add_development_dependency 'database_cleaner', '~> 1.5.1'
33 |   spec.add_development_dependency 'shoulda-matchers', '~> 3.0.1'
34 | end
35 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/job_step.rb:
--------------------------------------------------------------------------------
 1 | module Elasticrawl
 2 |   # Represents an Elastic MapReduce job flow step.  For a parse job this will
 3 |   # process a single Common Crawl segment.  For a combine job a single step
 4 |   # will aggregate the results of multiple parse jobs.
 5 |   class JobStep < ActiveRecord::Base
 6 |     belongs_to :job
 7 |     belongs_to :crawl_segment
 8 | 
 9 |     # Returns a custom jar step that is configured with the jar location,
10 |     # class name and input and output paths.
11 |     #
12 |     # For parse jobs optionally specifies the maximum # of Common Crawl
13 |     # data files to process before the job exits.
14 |     def job_flow_step(job_config)
15 |       jar = job_config['jar']
16 |       max_files = self.job.max_files
17 | 
18 |       step_args = []
19 |       step_args[0] = job_config['class']
20 |       step_args[1] = self.input_paths
21 |       step_args[2] = self.output_path
22 |       # All arguments must be strings.
23 |       step_args[3] = max_files.to_s if max_files.present?
24 | 
25 |       step = Elasticity::CustomJarStep.new(jar)
26 |       step.name = set_step_name
27 |       step.arguments = step_args
28 | 
29 |       step
30 |     end
31 | 
32 |   private
33 |     # Sets the Elastic MapReduce job flow step name based on the type of job it
34 |     # belongs to.
35 |     def set_step_name
36 |       case self.job.type
37 |         when 'Elasticrawl::ParseJob'
38 |           if self.crawl_segment.present?
39 |             max_files = self.job.max_files || 'all'
40 |             "#{self.crawl_segment.segment_desc} Parsing: #{max_files}"
41 |           end
42 |         when 'Elasticrawl::CombineJob'
43 |           paths = self.input_paths.split(',')
44 |           "Combining #{paths.count} jobs"
45 |       end
46 |     end
47 |   end
48 | end
49 | 


--------------------------------------------------------------------------------
/spec/fixtures/cluster.yml:
--------------------------------------------------------------------------------
 1 | # Configures the Elastic MapReduce cluster that is launched to run parse and
 2 | # combine jobs. The list of EC2 instance types can be found at
 3 | # http://aws.amazon.com/ec2/instance-types/#instance-details
 4 | 
 5 | # Using spot instances is recommended to reduce costs. However if the spot
 6 | # price rises above your bid price the cluster may be terminated. Elasticrawl
 7 | # tries to reduce the effect of this by parsing each Commmon Crawl segment
 8 | # in a separate job flow step.
 9 | 
10 | # The master node manages the cluster.
11 | master_instance_group:
12 |   instance_type: m1.medium
13 |   use_spot_instances: true
14 |   bid_price: 0.120
15 | 
16 | # Core nodes run map and reduce tasks and store data using HDFS.
17 | core_instance_group:
18 |   instance_type: m1.medium
19 |   instance_count: 2
20 |   use_spot_instances: true
21 |   bid_price: 0.120
22 | 
23 | # Task nodes are optional and only run map and reduce tasks.
24 | task_instance_group:
25 |   instance_type: m1.small
26 |   instance_count: 0
27 |   use_spot_instances: true
28 |   bid_price: 0.080
29 | 
30 | # Array of bootstrap scripts that will be applied when the cluster nodes are
31 | # initialized. The example installs the Ganglia distributed monitoring system.
32 | bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
33 | 
34 | # Specifying an EC2 key pair allows SSH access to the master node. This also
35 | # allows accessing the Hadoop Web UI over an SSH tunnel.
36 | ec2_key_name: 'elasticrawl'
37 | 
38 | # Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
39 | # recommended since the Common Crawl corpus is stored there. Otherwise inter
40 | # region data transfer charges will apply.
41 | placement: 'us-east-1c'
42 | 
43 | # The AMI version to use when launching instances.
44 | emr_ami_version: 'latest'
45 | 


--------------------------------------------------------------------------------
/templates/cluster.yml:
--------------------------------------------------------------------------------
 1 | # Configures the Elastic MapReduce cluster that is launched to run parse and
 2 | # combine jobs. The list of EC2 instance types can be found at
 3 | # http://aws.amazon.com/ec2/instance-types/#instance-details
 4 | 
 5 | # Using spot instances is recommended to reduce costs. However if the spot
 6 | # price rises above your bid price the cluster may be terminated. Elasticrawl
 7 | # tries to reduce the effect of this by parsing each Commmon Crawl segment
 8 | # in a separate job flow step.
 9 | 
10 | # The master node manages the cluster.
11 | master_instance_group:
12 |   instance_type: m1.medium
13 |   use_spot_instances: true
14 |   bid_price: 0.120
15 | 
16 | # Core nodes run map and reduce tasks and store data using HDFS.
17 | core_instance_group:
18 |   instance_type: m1.medium
19 |   instance_count: 2
20 |   use_spot_instances: true
21 |   bid_price: 0.120
22 | 
23 | # Task nodes are optional and only run map and reduce tasks.
24 | task_instance_group:
25 |   instance_type: m1.small
26 |   instance_count: 0
27 |   use_spot_instances: true
28 |   bid_price: 0.080
29 | 
30 | # Array of bootstrap scripts that will be applied when the cluster nodes are
31 | # initialized. The example installs the Ganglia distributed monitoring system.
32 | bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
33 | 
34 | # Specifying an EC2 key pair allows SSH access to the master node. This also
35 | # allows accessing the Hadoop Web UI over an SSH tunnel.
36 | ec2_key_name: # 'key-pair-name'
37 | 
38 | # Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
39 | # recommended since the Common Crawl corpus is stored there. Otherwise inter
40 | # region data transfer charges will apply.
41 | placement: 'us-east-1a'
42 | 
43 | # The AMI version to use when launching instances.
44 | emr_ami_version: 'latest'
45 | 
46 | # Default instance profile
47 | job_flow_role: 'EMR_EC2_DefaultRole'
48 | 
49 | # Default service role
50 | service_role: 'EMR_DefaultRole'
51 | 
52 | # Subnet ID. Required for new Amazon accounts launching more powerful instance types.
53 | ec2_subnet_id: 'subnet-name'
54 | 


--------------------------------------------------------------------------------
/spec/unit/job_step_spec.rb:
--------------------------------------------------------------------------------
 1 | require 'spec_helper'
 2 | 
 3 | describe Elasticrawl::JobStep, type: :model do
 4 |   it { should belong_to(:job) }
 5 |   it { should belong_to(:crawl_segment) }
 6 |   it { should have_db_column(:input_paths).of_type(:text) }
 7 |   it { should have_db_column(:output_path).of_type(:text) }
 8 | 
 9 |   describe '#job_flow_step' do
10 |     let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620',
11 |                                               :max_files => 3) }
12 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
13 |     let(:crawl_segment) { crawl.crawl_segments[0] }
14 |     let(:input_paths) {
15 |       's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1368696381249/wet/*.warc.wet.gz'
16 |     }
17 |     let(:output_path) {
18 |       's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
19 |     }
20 |     let(:config) {
21 |       { 'jar' => 's3://elasticrawl/jar/elasticrawl-example-0.0.1.jar',
22 |         'class' => 'com.rossfairbanks.commoncrawl.elasticrawl.ParserDriver'
23 |       }
24 |     }
25 | 
26 |     let(:job_step) { Elasticrawl::JobStep.create(:job => job,
27 |                                           :crawl_segment => crawl_segment,
28 |                                           :input_paths => input_paths,
29 |                                           :output_path => output_path) }
30 |     subject { job_step.job_flow_step(config) } 
31 | 
32 |     it 'should be a CustomJarStep' do
33 |       expect(subject).to be_a Elasticity::CustomJarStep
34 |     end
35 | 
36 |     it 'should have a jar location' do
37 |       expect(subject.jar).to eq config['jar']
38 |     end
39 | 
40 |     it 'should have 4 jar args' do
41 |       expect(subject.arguments.count).to eq 4
42 |     end
43 | 
44 |     it 'should have a class argument' do
45 |       expect(subject.arguments[0]).to eq config['class']
46 |     end
47 | 
48 |     it 'should have an input path arg' do
49 |       expect(subject.arguments[1]).to eq input_paths
50 |     end
51 | 
52 |     it 'should have an output path arg' do
53 |       expect(subject.arguments[2]).to eq output_path
54 |     end
55 | 
56 |     it 'should have a max files arg' do
57 |       expect(subject.arguments[3]).to eq '3'
58 |     end
59 |   end
60 | end
61 | 


--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
 1 | # -*- mode: ruby -*-
 2 | # vi: set ft=ruby :
 3 | 
 4 | Vagrant.configure("2") do |config|
 5 |   # All Vagrant configuration is done here. The most common configuration
 6 |   # options are documented and commented below. For a complete reference,
 7 |   # please see the online documentation at vagrantup.com.
 8 | 
 9 |   # Increase RAM to 1 GB
10 |   config.vm.provider "virtualbox" do |vbox|
11 |     vbox.customize ["modifyvm", :id, "--memory", 1024]
12 |   end
13 | 
14 |   # Elasticrawl launches Hadoop jobs for the CommonCrawl dataset using the AWS EMR service.
15 |   config.vm.define :elasticrawl do |elasticrawl|
16 |     elasticrawl.vm.box = "elasticrawl"
17 | 
18 |     # Ubuntu Server 14.04 LTS
19 |     elasticrawl.vm.box = "ubuntu/trusty64"
20 | 
21 |     # Network config
22 |     elasticrawl.vm.network :public_network
23 | 
24 |     # Synced folder for creating deploy packages
25 |     elasticrawl.vm.synced_folder "../traveling-elasticrawl/", "/traveling-elasticrawl/"
26 | 
27 |     # Provision using Chef Solo
28 |     elasticrawl.vm.provision "chef_solo" do |chef|
29 |       chef.cookbooks_path = "cookbooks"
30 |       chef.add_recipe "apt"
31 |       chef.add_recipe "build-essential"
32 |       chef.add_recipe "ruby_build"
33 |       chef.add_recipe "ruby_rbenv::user"
34 |       chef.add_recipe "git"
35 |       chef.add_recipe "vim"
36 | 
37 |       chef.json = {
38 |         "rbenv" => {
39 |           "user_installs" => [
40 |             {
41 |               "user" => "vagrant",
42 |               "rubies" => ["2.0.0-p648", "2.1.8", "2.2.4", "2.3.0"],
43 |               "global" => "2.2.4",
44 |               "gems" => {
45 |                 "2.0.0-p648" => [
46 |                   { "name" => "bundler",
47 |                     "version" => "1.11.2" }
48 |                 ],
49 |                 "2.1.8" => [
50 |                   { "name" => "bundler",
51 |                     "version" => "1.11.2" }
52 |                 ],
53 |                 "2.2.4" => [
54 |                   { "name" => "bundler",
55 |                     "version" => "1.11.2" }
56 |                 ],
57 |                 "2.3.0" => [
58 |                   { "name" => "bundler",
59 |                     "version" => "1.11.2" }
60 |                 ]
61 |               }
62 |             }
63 |           ]
64 |         }
65 |       }
66 | 
67 |     end
68 |   end
69 | end
70 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/job.rb:
--------------------------------------------------------------------------------
 1 | module Elasticrawl
 2 |   # The base job class that is extended by ParseJob and CombineJob.
 3 |   class Job < ActiveRecord::Base
 4 |     has_many :job_steps
 5 | 
 6 |     # Displays a confirmation message showing the configuration of the
 7 |     # Elastic MapReduce job flow and cluster.
 8 |     def confirm_message
 9 |       cluster = Cluster.new
10 | 
11 |       case self.type
12 |       when 'Elasticrawl::ParseJob'
13 |         message = segment_list
14 |       else
15 |         message = []
16 |       end
17 | 
18 |       message.push('Job configuration')
19 |       message.push(self.job_desc)
20 |       message.push('')
21 |       message.push(cluster.cluster_desc)
22 | 
23 |       message.join("\n")
24 |     end
25 | 
26 |     # Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
27 |     # launched successfully.
28 |     def result_message
29 |       "\nJob: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
30 |     end
31 | 
32 |     # Displays the history of the current job. Called by the status command.
33 |     def history
34 |       launch_time = "Launched: #{self.created_at.strftime('%Y-%m-%d %H:%M:%S')}"
35 |       "#{self.job_name} #{launch_time} #{self.job_desc}"
36 |     end
37 | 
38 |   protected
39 |     # Calls the Elastic MapReduce API to create a Job Flow. Returns the Job Flow ID.
40 |     def run_job_flow(emr_config)
41 |       cluster = Cluster.new
42 |       job_flow = cluster.create_job_flow(self, emr_config)
43 | 
44 |       job_steps.each do |step|
45 |         job_flow.add_step(step.job_flow_step(job_config))
46 |       end
47 | 
48 |       begin
49 |         job_flow.run
50 | 
51 |       rescue StandardError => e
52 |         raise ElasticMapReduceAccessError, e.message
53 |       end
54 |     end
55 | 
56 |     # Returns an S3 location for storing either data or logs.
57 |     def build_s3_uri(s3_path)
58 |       URI::Generic.build(:scheme => 's3',
59 |                          :host => bucket_name,
60 |                          :path => s3_path).to_s
61 |     end
62 | 
63 |     # Returns the S3 bucket name configured by the user using the init command.
64 |     def bucket_name
65 |       config = Config.new
66 |       config.load_config('jobs')['s3_bucket_name']
67 |     end
68 | 
69 |     # Sets the job name which is the current Unix timestamp in milliseconds.
70 |     # This is the same naming format used for Common Crawl segment names.
71 |     def set_job_name
72 |       (Time.now.to_f * 1000).to_i.to_s
73 |     end
74 |   end
75 | end
76 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/combine_job.rb:
--------------------------------------------------------------------------------
 1 | module Elasticrawl
 2 |   # Represents an Elastic MapReduce job flow that combines the results of
 3 |   # multiple Elasticrawl Parse jobs.  Parse jobs write their results per
 4 |   # segment. Combine jobs aggregate parse results into a single set of files.
 5 |   #
 6 |   # Inherits from Job which is the ActiveRecord model class.
 7 |   class CombineJob < Job
 8 |     # Takes in an array of parse jobs that are to be combined. Creates a single
 9 |     # job step whose input paths are the outputs of the parse jobs.
10 |     def set_input_jobs(input_jobs)
11 |       segment_count = 0
12 |       input_paths = []
13 | 
14 |       input_jobs.each do |job_name|
15 |         input_job = Job.where(:job_name => job_name,
16 |                               :type => 'Elasticrawl::ParseJob').first_or_initialize
17 |         step_count = input_job.job_steps.count
18 | 
19 |         if step_count > 0
20 |           segment_count += step_count
21 |           input_paths << set_input_path(input_job)
22 |         end
23 |       end
24 | 
25 |       self.job_name = set_job_name
26 |       self.job_desc = set_job_desc(segment_count)
27 |       job_steps.push(create_job_step(input_paths.join(',')))
28 |     end
29 | 
30 |     # Runs the job by calling the Elastic MapReduce API.
31 |     def run
32 |       emr_config = job_config['emr_config']
33 |       job_flow_id = run_job_flow(emr_config)
34 | 
35 |       if job_flow_id.present?
36 |         self.job_flow_id = job_flow_id
37 |         self.save
38 |         self.result_message
39 |       end
40 |     end
41 | 
42 |     # Returns the S3 location for storing Elastic MapReduce job logs.
43 |     def log_uri
44 |       s3_path = "/logs/2-combine/#{self.job_name}/"
45 |       build_s3_uri(s3_path)
46 |     end
47 | 
48 |   private
49 |     # Returns a single job step.  The input paths are a CSV list of parse
50 |     # job outputs.
51 |     def create_job_step(input_paths)
52 |       JobStep.create(:job => self,
53 |                      :input_paths => input_paths,
54 |                      :output_path => set_output_path)
55 |     end
56 | 
57 |     # Returns the S3 location for reading a parse job. A wildcard is
58 |     # used for the segment names. The input filter depends on the output
59 |     # file type of the parse job and what type of compression is used.
60 |     def set_input_path(input_job)
61 |       job_name = input_job.job_name
62 |       input_filter = job_config['input_filter']
63 | 
64 |       s3_path = "/data/1-parse/#{job_name}/segments/*/#{input_filter}"
65 |       build_s3_uri(s3_path)
66 |     end
67 | 
68 |     # Returns the S3 location for storing the combine job results.
69 |     def set_output_path
70 |       s3_path = "/data/2-combine/#{self.job_name}/"
71 |       build_s3_uri(s3_path)
72 |     end
73 | 
74 |     # Sets the job description which forms part of the Elastic MapReduce
75 |     # job flow name.
76 |     def set_job_desc(segment_count)
77 |       "Combining: #{segment_count} segments"
78 |     end
79 | 
80 |     # Returns the combine job configuration from ~/.elasticrawl.jobs.yml.
81 |     def job_config
82 |       config = Config.new
83 |       config.load_config('jobs')['steps']['combine']
84 |     end
85 |   end
86 | end
87 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/parse_job.rb:
--------------------------------------------------------------------------------
 1 | module Elasticrawl
 2 |   # Represents an Elastic MapReduce job flow that parses segments of
 3 |   # Common Crawl data. A job step is created per segment.
 4 |   #
 5 |   # Inherits from Job which is the ActiveRecord model class.
 6 |   class ParseJob < Job
 7 |     # Populates the job from the list of segments to be parsed.
 8 |     def set_segments(crawl_segments, max_files = nil)
 9 |       self.job_name = set_job_name
10 |       self.job_desc = set_job_desc(crawl_segments, max_files)
11 |       self.max_files = max_files
12 | 
13 |       crawl_segments.each do |segment|
14 |         self.job_steps.push(create_job_step(segment))
15 |       end
16 |     end
17 | 
18 |     # Runs the job by calling Elastic MapReduce API.  If successful the
19 |     # parse time is set for each segment.
20 |     def run
21 |       emr_config = job_config['emr_config']
22 |       job_flow_id = run_job_flow(emr_config)
23 | 
24 |       if job_flow_id.present?
25 |         self.job_flow_id = job_flow_id
26 | 
27 |         self.job_steps.each do |step|
28 |           segment = step.crawl_segment
29 |           segment.parse_time = DateTime.now
30 |           segment.save
31 |         end
32 | 
33 |         self.save
34 |         self.result_message
35 |       end
36 |     end
37 | 
38 |     # Returns the S3 location for storing Elastic MapReduce job logs.
39 |     def log_uri
40 |       s3_path = "/logs/1-parse/#{self.job_name}/"
41 |       build_s3_uri(s3_path)
42 |     end
43 | 
44 |     # Return list of segment descriptions.
45 |     def segment_list
46 |       segments = ['Segments']
47 | 
48 |       job_steps.each do |job_step|
49 |         if job_step.crawl_segment.present?
50 |           segment = job_step.crawl_segment
51 |           segments.push(segment.segment_desc)
52 |         end
53 |       end
54 | 
55 |       segments.push('')
56 |     end
57 | 
58 |   private
59 |     # Creates a job step for the crawl segment.
60 |     def create_job_step(segment)
61 |       JobStep.create(:job => self,
62 |                      :crawl_segment => segment,
63 |                      :input_paths => segment_input(segment),
64 |                      :output_path => segment_output(segment))
65 |     end
66 | 
67 |     # Returns the S3 location for reading a crawl segment. The input filter
68 |     # determines which type of Common Crawl data files are parsed.
69 |     def segment_input(segment)
70 |       segment.segment_s3_uri + job_config['input_filter']
71 |     end
72 | 
73 |     # Returns the S3 location for storing the step results.  This includes
74 |     # the segment name.
75 |     def segment_output(segment)
76 |       job_path = "/data/1-parse/#{self.job_name}"
77 |       s3_path = "#{job_path}/segments/#{segment.segment_name}/"
78 |       build_s3_uri(s3_path)
79 |     end
80 | 
81 |     # Sets the job description which forms part of the Elastic MapReduce
82 |     # job flow name.
83 |     def set_job_desc(segments, max_files)
84 |       if segments.count > 0
85 |         crawl_name = segments[0].crawl.crawl_name if segments[0].crawl.present?
86 |         file_desc = max_files.nil? ? 'all files' : "#{max_files} files per segment"
87 |       end
88 | 
89 |       "Crawl: #{crawl_name} Segments: #{segments.count} Parsing: #{file_desc}"
90 |     end
91 | 
92 |     # Returns the parse job configuration from ~/.elasticrawl.jobs.yml.
93 |     def job_config
94 |       config = Config.new
95 |       config.load_config('jobs')['steps']['parse']
96 |     end
97 |   end
98 | end
99 | 


--------------------------------------------------------------------------------
/spec/unit/combine_job_spec.rb:
--------------------------------------------------------------------------------
 1 | require 'spec_helper'
 2 | 
 3 | describe Elasticrawl::CombineJob do
 4 |   describe '#set_input_jobs' do
 5 |     let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
 6 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
 7 |     let(:segment_list_1) { crawl.crawl_segments[0..1] }
 8 |     let(:segment_list_2) { [crawl.crawl_segments[2]]}
 9 | 
10 |     let(:parse_job_1) { Elasticrawl::ParseJob.new }
11 |     let(:parse_job_2) { Elasticrawl::ParseJob.new }
12 |     let(:combine_job) { Elasticrawl::CombineJob.new }
13 | 
14 |     before do
15 |       crawl.create_segments
16 |       parse_job_1.set_segments(segment_list_1)
17 |       parse_job_2.set_segments(segment_list_2)
18 | 
19 |       input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
20 |       combine_job.set_input_jobs(input_jobs)
21 |     end
22 | 
23 |     it 'should have a job name based on current time' do
24 |       expect(combine_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
25 |     end
26 | 
27 |     it 'should have a job desc' do
28 |       expect(combine_job.job_desc.end_with?('Combining: 3 segments')).to eq true
29 |     end
30 | 
31 |     it 'should create 1 job step' do
32 |       expect(combine_job.job_steps.count).to eq 1
33 |     end
34 | 
35 |     it 'should set 1 input path per parse job' do
36 |       input_paths = combine_job.job_steps[0].input_paths
37 |       expect(input_paths.split(',').count).to eq 2
38 |     end
39 | 
40 |     it 'should set input path including parse job name' do
41 |       input_paths = combine_job.job_steps[0].input_paths
42 |       expect(input_paths.include?(parse_job_1.job_name)).to eq true
43 |     end
44 | 
45 |     it 'should set input path without segment names' do
46 |       input_paths = combine_job.job_steps[0].input_paths
47 |       segment_name = segment_list_1[0].segment_name
48 |       expect(input_paths.include?(segment_name)).to eq false
49 |     end
50 | 
51 |     it 'should set output path including job name' do
52 |       output_path = combine_job.job_steps[0].output_path
53 |       expect(output_path.include?(combine_job.job_name)).to eq true
54 |     end
55 |   end
56 | 
57 |   describe '#run' do
58 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
59 |     let(:parse_job_1) { Elasticrawl::ParseJob.new }
60 |     let(:parse_job_2) { Elasticrawl::ParseJob.new }
61 |     let(:combine_job) { Elasticrawl::CombineJob.new }
62 |     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
63 | 
64 |     before do
65 |       crawl.create_segments
66 |       parse_job_1.set_segments(crawl.crawl_segments[0..1])
67 |       parse_job_2.set_segments([crawl.crawl_segments[2]])
68 | 
69 |       input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
70 |       combine_job.set_input_jobs(input_jobs)
71 |     end
72 | 
73 |     it 'should set a job flow id' do
74 |       allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
75 |       combine_job.run
76 | 
77 |       expect(combine_job.job_flow_id).to eq job_flow_id
78 |     end
79 |   end
80 | 
81 |   describe '#log_uri' do
82 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
83 |     let(:parse_job) { Elasticrawl::ParseJob.new }
84 |     let(:job) { Elasticrawl::CombineJob.new }
85 | 
86 |     before do
87 |       crawl.create_segments
88 |       parse_job.set_segments(crawl.crawl_segments)
89 | 
90 |       job.set_input_jobs([parse_job.job_name])
91 |     end
92 | 
93 |     it 'should set a log uri including the job name' do
94 |       expect(job.log_uri).to eq "s3://elasticrawl/logs/2-combine/#{job.job_name}/"
95 |     end
96 |   end
97 | end
98 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/crawl.rb:
--------------------------------------------------------------------------------
  1 | module Elasticrawl
  2 |   # Represents a web crawl released by the Common Crawl Foundation.
  3 |   # Each crawl is split into multiple crawl segments and is stored
  4 |   # in the S3 public datasets bucket.
  5 |   class Crawl < ActiveRecord::Base
  6 |     has_many :crawl_segments
  7 | 
  8 |     # Returns the status of all saved crawls and the current job history.
  9 |     def self.status(show_all = false)
 10 |       status = ['Crawl Status']
 11 |       Crawl.all.map { |crawl| status << crawl.status }
 12 | 
 13 |       if show_all == true
 14 |         header = 'Job History'
 15 |         jobs = Job.where('job_flow_id is not null').order(:id => :desc)
 16 |       else
 17 |         header = 'Job History (last 10)'
 18 |         jobs = Job.where('job_flow_id is not null').order(:id => :desc).limit(10)
 19 |       end
 20 | 
 21 |       status << ['', header]
 22 |       jobs.map { |job| status << job.history }
 23 | 
 24 |       status.join("\n")
 25 |     end
 26 | 
 27 |     # Returns the status of the current crawl.
 28 |     def status
 29 |       total = self.crawl_segments.count
 30 |       remaining = CrawlSegment.where(:crawl_id => self.id,
 31 |                                         :parse_time => nil).count
 32 |       parsed = total - remaining
 33 |       status = self.crawl_name
 34 |       status += " Segments: to parse #{remaining}, "
 35 |       status += "parsed #{parsed}, total #{total}"
 36 |     end
 37 | 
 38 |     # Checks for crawl segments in the database.  If none are found then checks
 39 |     # the S3 API and creates any segments that are found.
 40 |     def has_segments?
 41 |       if self.crawl_segments.count == 0
 42 |         segment_count = create_segments
 43 |         result = segment_count > 0
 44 |       else
 45 |         result = true
 46 |       end
 47 |     end
 48 | 
 49 |     # Creates crawl segments from the warc.paths file for this crawl.
 50 |     def create_segments
 51 |       file_paths = warc_paths(self.crawl_name)
 52 | 
 53 |       segments = parse_segments(file_paths)
 54 |       save if segments.count > 0
 55 | 
 56 |       segments.keys.each do |segment_name|
 57 |         file_count = segments[segment_name]
 58 |         CrawlSegment.create_segment(self, segment_name, file_count)
 59 |       end
 60 | 
 61 |       segments.count
 62 |     end
 63 | 
 64 |     # Returns the list of segments from the database.
 65 |     def select_segments(segments_list)
 66 |       CrawlSegment.where(:segment_name => segments_list)
 67 |     end
 68 | 
 69 |     # Returns next # segments to be parsed. The maximum is 256
 70 |     # as this is the maximum # of steps for an Elastic MapReduce job flow.
 71 |     def next_segments(max_segments = nil)
 72 |       max_segments = Elasticrawl::MAX_SEGMENTS if max_segments.nil?
 73 |       max_segments = Elasticrawl::MAX_SEGMENTS if max_segments > Elasticrawl::MAX_SEGMENTS
 74 | 
 75 |       self.crawl_segments.where(:parse_time => nil).limit(max_segments)
 76 |     end
 77 | 
 78 |     # Resets parse time of all parsed segments to null so they will be parsed
 79 |     # again. Returns the updated crawl status.
 80 |     def reset
 81 |       segments = CrawlSegment.where('crawl_id = ? and parse_time is not null',
 82 |                                     self.id)
 83 |       segments.map { |segment| segment.update_attribute(:parse_time, nil) }
 84 | 
 85 |       status
 86 |     end
 87 | 
 88 |   private
 89 |     # Gets the WARC file paths from S3 for this crawl if it exists.
 90 |     def warc_paths(crawl_name)
 91 |       s3_path = [Elasticrawl::COMMON_CRAWL_PATH,
 92 |                  crawl_name,
 93 |                  Elasticrawl::WARC_PATHS].join('/')
 94 |       begin
 95 |         s3 = AWS::S3.new
 96 |         bucket = s3.buckets[Elasticrawl::COMMON_CRAWL_BUCKET]
 97 |         object = bucket.objects[s3_path]
 98 | 
 99 |         uncompress_file(object)
100 | 
101 |       rescue AWS::Errors::Base => s3e
102 |         raise S3AccessError.new(s3e.http_response), 'Failed to get WARC paths'
103 |       rescue Exception => e
104 |         raise S3AccessError, 'Failed to get WARC paths'
105 |       end
106 |     end
107 | 
108 |     # Takes in a S3 object and returns the contents as an uncompressed string.
109 |     def uncompress_file(s3_object)
110 |       result = ''
111 | 
112 |       if s3_object.exists?
113 |         io = StringIO.new
114 |         io.write(s3_object.read)
115 |         io.rewind
116 | 
117 |         gz = Zlib::GzipReader.new(io)
118 |         result = gz.read
119 | 
120 |         gz.close
121 |       end
122 | 
123 |       result
124 |     end
125 | 
126 |     # Parses the segment names and file counts from the WARC file paths.
127 |     def parse_segments(warc_paths)
128 |       segments = Hash.new 0
129 | 
130 |       warc_paths.split.each do |warc_path|
131 |         segment_name = warc_path.split('/')[3]
132 |         segments[segment_name] += 1 if segment_name.present?
133 |       end
134 | 
135 |       segments
136 |     end
137 |   end
138 | end
139 | 


--------------------------------------------------------------------------------
/spec/unit/crawl_spec.rb:
--------------------------------------------------------------------------------
  1 | require 'spec_helper'
  2 | 
  3 | describe Elasticrawl::Crawl, type: :model do
  4 |   it { should have_many(:crawl_segments) }
  5 |   it { should have_db_column(:crawl_name).of_type(:string) }
  6 | 
  7 |   describe '#has_segments?' do
  8 |     let(:crawl_name) { 'CC-MAIN-2014-49' }
  9 |     subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }
 10 | 
 11 |     it 'should have segments' do
 12 |       expect(subject.has_segments?).to eq true
 13 |     end
 14 |   end
 15 | 
 16 |   describe '#create_segments' do
 17 |     let(:crawl_name) { 'CC-MAIN-2014-49' }
 18 |     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
 19 | 
 20 |     before do
 21 |       subject.create_segments
 22 |     end
 23 | 
 24 |     it 'should set crawl name' do
 25 |       expect(subject.crawl_name).to eq crawl_name
 26 |     end
 27 | 
 28 |     it 'should create correct # of segments' do
 29 |       expect(subject.crawl_segments.count).to eq 3
 30 |     end
 31 | 
 32 |     it 'should create segment names' do
 33 |       expect(subject.crawl_segments[0].segment_name).to eq '1416400372202.67'
 34 |     end
 35 | 
 36 |     it 'should create segment s3 uris' do
 37 |       expect(subject.crawl_segments[0].segment_s3_uri).to eq \
 38 |         's3://commoncrawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/'
 39 |     end
 40 | 
 41 |     it 'should set file counts' do
 42 |       expect(subject.crawl_segments[0].file_count).to eq 3
 43 |     end
 44 |   end
 45 | 
 46 |   describe '#next_segments' do
 47 |     let(:crawl_name) { 'CC-MAIN-2014-49' }
 48 |     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
 49 | 
 50 |     before do
 51 |       subject.create_segments
 52 |     end
 53 | 
 54 |     it 'should return all segments' do
 55 |       crawl_segments = subject.next_segments
 56 | 
 57 |       expect(crawl_segments.count).to eq 3
 58 |       expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
 59 |       expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
 60 |     end
 61 | 
 62 |     it 'should return first # segments' do
 63 |       crawl_segments = subject.next_segments(2)
 64 | 
 65 |       expect(crawl_segments.count).to eq 2
 66 |       expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
 67 |       expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
 68 |     end
 69 |   end
 70 | 
 71 |   describe '#select_segments' do
 72 |     let(:crawl_name) { 'CC-MAIN-2014-49' }
 73 |     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
 74 | 
 75 |     before do
 76 |       subject.create_segments
 77 |     end
 78 | 
 79 |     it 'should select no segments' do
 80 |       segments_list = ['test', 'segment']
 81 |       crawl_segments = subject.select_segments(segments_list)
 82 | 
 83 |       expect(crawl_segments.count).to eq 0
 84 |     end
 85 | 
 86 |     it 'should select only segments in list' do
 87 |       segments_list = ['1416400372202.67', '1416400372490.23']
 88 |       crawl_segments = subject.select_segments(segments_list)
 89 | 
 90 |       expect(crawl_segments.count).to eq 2
 91 |     end
 92 |   end
 93 | 
 94 |   describe '#reset' do
 95 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
 96 |     let(:job) { Elasticrawl::ParseJob.new }
 97 |     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
 98 | 
 99 |     before do
100 |       crawl.create_segments
101 |       job.set_segments(crawl.crawl_segments[0..1])
102 | 
103 |       allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
104 |       job.run
105 | 
106 |       crawl.reset
107 |     end
108 | 
109 |     it 'should set parse time of all segments to null' do
110 |       unparsed_segments = Elasticrawl::CrawlSegment.where(:parse_time => nil).count
111 |       expect(crawl.crawl_segments.count).to eq unparsed_segments
112 |     end
113 |   end
114 | 
115 |   describe '.status' do
116 |     let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
117 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
118 |     let(:max_files) { 3 }
119 |     let(:job) { Elasticrawl::ParseJob.new }
120 |     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
121 | 
122 |     before do
123 |       crawl.create_segments
124 |       job.set_segments(crawl.crawl_segments[0..1], max_files)
125 | 
126 |       allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
127 |       job.run
128 |     end
129 | 
130 |     it 'should display status of crawl segments' do
131 |       expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
132 |         'CC-MAIN-2014-49 Segments: to parse 1, parsed 2, total 3'
133 |     end
134 | 
135 |     it 'should display parse job desc' do
136 |       crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
137 | 
138 |       expect(crawl_status.include?(job.job_name)).to eq true
139 |       expect(crawl_status.include?(job.job_desc)).to eq true
140 |     end
141 |   end
142 | end
143 | 


--------------------------------------------------------------------------------
/spec/unit/parse_job_spec.rb:
--------------------------------------------------------------------------------
  1 | require 'spec_helper'
  2 | 
  3 | describe Elasticrawl::ParseJob do
  4 |   describe '#set_segments' do
  5 |     let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
  6 |     let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
  7 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
  8 |     let(:max_files) { 3 }
  9 |     let(:parse_job) { Elasticrawl::ParseJob.new }
 10 | 
 11 |     before do
 12 |       crawl.create_segments
 13 |       parse_job.set_segments(crawl.crawl_segments[0..1], max_files)
 14 |     end
 15 | 
 16 |     it 'should have a job name based on current time' do
 17 |       expect(parse_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
 18 |     end
 19 | 
 20 |     it 'should have a job desc' do
 21 |       expect(parse_job.job_desc).to eq job_desc
 22 |     end
 23 | 
 24 |     it 'should create 2 job steps' do
 25 |       expect(parse_job.job_steps.count).to eq 2
 26 |     end
 27 | 
 28 |     it 'should set steps input path to segment uri' do
 29 |       input_path = parse_job.job_steps[0].input_paths
 30 |       segment_uri = crawl.crawl_segments[0].segment_s3_uri
 31 | 
 32 |       expect(input_path.starts_with?(segment_uri)).to eq true
 33 |     end
 34 | 
 35 |     it 'should set output path' do
 36 |       output_path = parse_job.job_steps[0].output_path
 37 |       segment_name = crawl.crawl_segments[0].segment_name
 38 | 
 39 |       expect(output_path.include?(parse_job.job_name)).to eq true
 40 |       expect(output_path.include?(segment_name)).to eq true
 41 |     end
 42 |   end
 43 | 
 44 |   describe '#confirm_message' do
 45 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
 46 |     let(:job) { Elasticrawl::ParseJob.new }
 47 |     let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: 3 files per segment' }
 48 |     let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
 49 | 
 50 |     let(:cluster_desc) {
 51 |       cluster_desc = <<-CLUSTER_DESC
 52 | Cluster configuration
 53 | Master: 1 m1.medium  (Spot: 0.12)
 54 | Core:   2 m1.medium  (Spot: 0.12)
 55 | Task:   --
 56 |       CLUSTER_DESC
 57 |     }
 58 | 
 59 |     before do
 60 |       crawl.create_segments
 61 |       job.set_segments(crawl.crawl_segments[0..2], 3)
 62 |     end
 63 | 
 64 |     it 'should display message including job desc' do
 65 |       expect(job.confirm_message.include?(job_desc)).to eq true
 66 |     end
 67 | 
 68 |     it 'should display message including segment desc' do
 69 |       expect(job.confirm_message.include?(segment_desc)).to eq true
 70 |     end
 71 | 
 72 |     it 'should display message including cluster desc' do
 73 |       expect(job.confirm_message.include?(cluster_desc)).to eq true
 74 |     end
 75 |   end
 76 | 
 77 |   describe '#run' do
 78 |     let(:crawl_name) { 'CC-MAIN-2014-49' }
 79 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
 80 |     let(:job) { Elasticrawl::ParseJob.new }
 81 |     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
 82 | 
 83 |     before do
 84 |       crawl.create_segments
 85 |       job.set_segments(crawl.crawl_segments[0..1], 5)
 86 | 
 87 |       allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
 88 |       job.run
 89 |     end
 90 | 
 91 |     it 'should set a job flow id' do
 92 |       expect(job.job_flow_id).to eq job_flow_id
 93 |     end
 94 | 
 95 |     it 'should set parse time for parsed segments' do
 96 |       expect(crawl.crawl_segments[0].parse_time.present?).to eq true
 97 |       expect(crawl.crawl_segments[1].parse_time.present?).to eq true
 98 |       expect(crawl.crawl_segments[2].parse_time.present?).to eq false
 99 |     end
100 |   end
101 | 
102 |   describe '#log_uri' do
103 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
104 |     let(:job) { Elasticrawl::ParseJob.new }
105 | 
106 |     before do
107 |       crawl.create_segments
108 |       job.set_segments(crawl.crawl_segments)
109 |     end
110 | 
111 |     it 'should set a log uri including the job name' do
112 |       expect(job.log_uri).to eq "s3://elasticrawl/logs/1-parse/#{job.job_name}/"
113 |     end
114 |   end
115 | 
116 |   describe '#history' do
117 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
118 |     let(:job) { Elasticrawl::ParseJob.new }
119 |     let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: all files' }
120 |     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
121 | 
122 |     before do
123 |       crawl.create_segments
124 |       job.set_segments(crawl.crawl_segments)
125 | 
126 |       allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
127 |       job.run
128 |     end
129 | 
130 |     it 'should return the job name, history and launch time' do
131 |       expect(job.history.include?(job.job_name)).to eq true
132 |       expect(job.history.include?(job.job_desc)).to eq true
133 |       expect(job.history.include?(job.created_at.strftime('%Y-%m-%d %H:%M:%S'))).to eq true
134 |     end
135 |   end
136 | end
137 | 


--------------------------------------------------------------------------------
/bin/elasticrawl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env ruby
  2 | require 'elasticrawl'
  3 | 
  4 | module Elasticrawl
  5 |   class Cli < Thor
  6 |     desc 'init S3_BUCKET_NAME', 'Creates S3 bucket and config directory'
  7 |     method_option :access_key_id, :type => :string, :desc => 'AWS Access Key ID'
  8 |     method_option :secret_access_key, :type => :string, :desc => 'AWS Secret Access Key'
  9 |     def init(s3_bucket_name)
 10 |       key = options[:access_key_id]
 11 |       secret = options[:secret_access_key]
 12 | 
 13 |       if key.nil? || secret.nil?
 14 |         config = Config.new
 15 | 
 16 |         # Prompt for credentials showing the current values.
 17 |         key = ask(config.access_key_prompt)
 18 |         secret = ask(config.secret_key_prompt)
 19 | 
 20 |         # Use current values if user has selected them.
 21 |         key = config.access_key_id if key.blank?
 22 |         secret = config.secret_access_key if secret.blank?
 23 |       end
 24 | 
 25 |       # Create new config object with updated credentials.
 26 |       config = Config.new(key, secret)
 27 | 
 28 |       if config.bucket_exists?(s3_bucket_name)
 29 |         puts('ERROR: S3 bucket already exists')
 30 |       else
 31 |         if config.dir_exists?
 32 |           puts("WARNING: Config dir #{config.config_dir} already exists")
 33 |           overwrite = agree('Overwrite? (y/n)', true)
 34 |         end
 35 | 
 36 |         puts(config.create(s3_bucket_name)) if !config.dir_exists? || overwrite == true
 37 |       end
 38 |     end
 39 | 
 40 |     desc 'parse CRAWL_NAME', 'Launches parse job against Common Crawl corpus'
 41 |     method_option :max_segments, :type => :numeric, :desc => 'number of crawl segments to parse'
 42 |     method_option :max_files, :type => :numeric, :desc => 'number of files to parse per segment'
 43 |     method_option :segment_list, :type => :array, :desc => 'list of segment names to parse'
 44 |     def parse(crawl_name)
 45 |       load_database
 46 | 
 47 |       crawl = find_crawl(crawl_name)
 48 |       if crawl.has_segments?
 49 |         segment_list = options[:segment_list]
 50 | 
 51 |         if segment_list.present?
 52 |           segments = crawl.select_segments(segment_list)
 53 |         else
 54 |           segments = crawl.next_segments(options[:max_segments])
 55 |         end
 56 | 
 57 |         if segments.count == 0
 58 |           puts('ERROR: No segments matched for parsing')
 59 |         else
 60 |           job = ParseJob.new
 61 |           job.set_segments(segments, options[:max_files])
 62 |           puts(job.confirm_message)
 63 | 
 64 |           launch = agree('Launch job? (y/n)', true)
 65 |           puts(job.run) if launch == true
 66 |         end
 67 |       else
 68 |         puts('ERROR: Crawl does not exist')
 69 |       end
 70 |     end
 71 | 
 72 |     desc 'combine', 'Launches combine job against parse job results'
 73 |     method_option :input_jobs, :type => :array, :required => true,
 74 |       :desc => 'list of input jobs to combine'
 75 |     def combine
 76 |       load_database
 77 | 
 78 |       job = CombineJob.new
 79 |       job.set_input_jobs(options[:input_jobs])
 80 |       puts(job.confirm_message)
 81 | 
 82 |       launch = agree('Launch job? (y/n)', true)
 83 |       puts(job.run) if launch == true
 84 |     end
 85 | 
 86 |     desc 'status', 'Shows crawl status and lists jobs'
 87 |     method_option :show_all, :type => :boolean, :desc => 'list all jobs'
 88 |     def status
 89 |       load_database
 90 |       puts(Crawl.status(options[:show_all]))
 91 |     end
 92 | 
 93 |     desc 'reset CRAWL_NAME', 'Resets a crawl so its segments are parsed again'
 94 |     def reset(crawl_name)
 95 |       load_database
 96 | 
 97 |       crawl = find_crawl(crawl_name)
 98 |       if crawl.has_segments?
 99 |         reset = agree('Reset crawl? (y/n)', true)
100 |         puts(crawl.reset) if reset == true
101 |       else
102 |         puts('ERROR: Crawl does not exist')
103 |       end
104 |     end
105 | 
106 |     desc 'destroy', 'Deletes S3 bucket and config directory'
107 |     def destroy
108 |       config = Config.new
109 | 
110 |       if config.dir_exists?
111 |         puts(config.delete_warning)
112 |         delete = agree('Delete? (y/n)', true)
113 |         puts(config.delete) if delete == true
114 |       else
115 |         puts('No config dir. Nothing to do')
116 |       end
117 |     end
118 | 
119 |   private
120 |     # Find a crawl record in the database.
121 |     def find_crawl(crawl_name)
122 |       Crawl.where(:crawl_name => crawl_name).first_or_initialize
123 |     end
124 | 
125 |     # Load sqlite database.
126 |     def load_database
127 |       config = Config.new
128 |       config.load_database
129 |     end
130 |   end
131 | end
132 | 
133 | begin
134 |   Elasticrawl::Cli.start(ARGV)
135 | # Show errors parsing command line arguments.
136 | rescue Thor::Error => e
137 |   puts(e.message)
138 | # Show elasticrawl errors.
139 | rescue Elasticrawl::Error => e
140 |   puts("ERROR: #{e.message}")
141 |   puts e.backtrace
142 | 
143 |   if e.http_response.present?
144 |     response = e.http_response
145 | 
146 |     puts "HTTP Response: #{response.status}"
147 |     puts response.body if response.body.present?
148 |   end
149 | end
150 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/cluster.rb:
--------------------------------------------------------------------------------
  1 | module Elasticrawl
  2 |   # Configures the cluster settings for the job flow that will be launched.
  3 |   # These settings are loaded from ~/.elasticrawl/cluster.yml.
  4 |   class Cluster
  5 |     def initialize
  6 |       @master_group = instance_group('master')
  7 |       @core_group = instance_group('core')
  8 |       @task_group = instance_group('task') if has_task_group?
  9 |     end
 10 | 
 11 |     # Returns a configured job flow to the calling job.
 12 |     def create_job_flow(job, emr_config = nil)
 13 |       config = Config.new
 14 | 
 15 |       Elasticity.configure do |c|
 16 |         c.access_key = config.access_key_id
 17 |         c.secret_key = config.secret_access_key
 18 |       end
 19 | 
 20 |       job_flow = Elasticity::JobFlow.new
 21 |       job_flow.name = "Job: #{job.job_name} #{job.job_desc}"
 22 |       job_flow.log_uri = job.log_uri
 23 | 
 24 |       configure_job_flow(job_flow)
 25 |       configure_instances(job_flow)
 26 |       configure_bootstrap_actions(job_flow, emr_config)
 27 | 
 28 |       job_flow
 29 |     end
 30 | 
 31 |     # Describes the instances that will be launched.  This is used by the
 32 |     # job confirmation messages.
 33 |     def cluster_desc
 34 |       cluster_desc = <<-HERE
 35 | Cluster configuration
 36 | Master: #{instance_group_desc(@master_group)}
 37 | Core:   #{instance_group_desc(@core_group)}
 38 | Task:   #{instance_group_desc(@task_group)}
 39 | HERE
 40 |     end
 41 | 
 42 |   private
 43 |     # Set job flow properties from settings in cluster.yml.
 44 |     def configure_job_flow(job_flow)
 45 |         ec2_key_name = config_setting('ec2_key_name')
 46 |         placement = config_setting('placement')
 47 |         emr_ami_version = config_setting('emr_ami_version')
 48 |         job_flow_role = config_setting('job_flow_role')
 49 |         service_role = config_setting('service_role')
 50 |         ec2_subnet_id = config_setting('ec2_subnet_id')
 51 | 
 52 |         job_flow.ec2_subnet_id = ec2_subnet_id if ec2_subnet_id.present?
 53 |         job_flow.ec2_key_name = ec2_key_name if ec2_key_name.present?
 54 |         job_flow.placement = placement if placement.present?
 55 |         job_flow.ami_version = emr_ami_version if emr_ami_version.present?
 56 |         job_flow.job_flow_role = job_flow_role if job_flow_role.present?
 57 |         job_flow.service_role = service_role if service_role.present?
 58 |     end
 59 | 
 60 |     # Configures the instances that will be launched.  The master group has
 61 |     # a single node.  The task group is optional.
 62 |     def configure_instances(job_flow)
 63 |       job_flow.set_master_instance_group(@master_group)
 64 |       job_flow.set_core_instance_group(@core_group)
 65 |       job_flow.set_task_instance_group(@task_group) if @task_group.present?
 66 |     end
 67 | 
 68 |     # Configures bootstrap actions that will be run when each instance is
 69 |     # launched. EMR config is an XML file of Hadoop settings stored on S3.
 70 |     # There are applied to each node by a bootstrap action.
 71 |     def configure_bootstrap_actions(job_flow, emr_config = nil)
 72 |       bootstrap_scripts = config_setting('bootstrap_scripts')
 73 | 
 74 |       if bootstrap_scripts.present?
 75 |         bootstrap_scripts.each do |script_uri|
 76 |           action = Elasticity::BootstrapAction.new(script_uri, '', '')
 77 |           job_flow.add_bootstrap_action(action)
 78 |         end
 79 |       end
 80 | 
 81 |       if emr_config.present?
 82 |         action = Elasticity::HadoopFileBootstrapAction.new(emr_config)
 83 |         job_flow.add_bootstrap_action(action)
 84 |       end
 85 |     end
 86 | 
 87 |     # Returns whether cluster.yml specifies a task group.
 88 |     def has_task_group?
 89 |       task_config = config_for_group('task')
 90 |       task_config.has_key?('instance_count') && task_config['instance_count'] > 0
 91 |     end
 92 | 
 93 |     # Describes an instance group.
 94 |     def instance_group_desc(group)
 95 |       if group.present?
 96 |         if group.market == 'SPOT'
 97 |           price = "(Spot: #{group.bid_price})"
 98 |         else
 99 |           price = '(On Demand)'
100 |         end
101 | 
102 |         "#{group.count} #{group.type}  #{price}"
103 |       else
104 |         '--'
105 |       end
106 |     end
107 | 
108 |     # Configures an instance group with the instance type, # of instances and
109 |     # the bid price if spot instances are to be used.
110 |     def instance_group(group_name)
111 |       config = config_for_group(group_name)
112 | 
113 |       instance_group = Elasticity::InstanceGroup.new
114 |       instance_group.role = group_name.upcase
115 |       instance_group.type = config['instance_type']
116 | 
117 |       if config.has_key?('instance_count') && config['instance_count'] > 0
118 |         instance_group.count = config['instance_count']
119 |       end
120 | 
121 |       if config['use_spot_instances'] == true
122 |         instance_group.set_spot_instances(config['bid_price'])
123 |       end
124 | 
125 |       instance_group
126 |     end
127 | 
128 |     # Returns the config settings for an instance group.
129 |     def config_for_group(group_name)
130 |       config_setting("#{group_name}_instance_group")
131 |     end
132 | 
133 |     # Returns a config setting from cluster.yml.
134 |     def config_setting(key_name)
135 |       config = Config.new
136 |       config.load_config('cluster')[key_name]
137 |     end
138 |   end
139 | end
140 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Elasticrawl
  2 | 
  3 | * Command line tool for launching Hadoop jobs using AWS EMR (Elastic MapReduce) to process Common Crawl data.
  4 | * Elasticrawl can be used with [crawl data](http://commoncrawl.org/the-data/get-started/) from April 2014 onwards.
  5 | * A list of crawls released by Common Crawl is maintained on the [wiki](https://github.com/rossf7/elasticrawl/wiki).
  6 | * Common Crawl announce new crawls on their [blog](http://blog.commoncrawl.org/).
  7 | 
  8 | * Ships with a default configuration that launches the
  9 | [elasticrawl-examples](https://github.com/rossf7/elasticrawl-examples) jobs.
 10 | This is an implementation of the standard Hadoop Word Count example.
 11 | 
 12 | This [blog post](https://rossfairbanks.com/2015/01/03/parsing-common-crawl-using-elasticrawl.html) has a walkthrough of running the example jobs on the November 2014 crawl.
 13 | 
 14 | ## Installation
 15 | 
 16 | * Elasticrawl needs a [Ruby installation](https://www.ruby-lang.org/en/documentation/installation/) (2.1 or higher).
 17 | * Install Ruby from RubyGems.
 18 | 
 19 | ```
 20 | gem install elasticrawl --no-rdoc --no-ri
 21 | ```
 22 | 
 23 | ### Troubleshooting
 24 | 
 25 | If you get the error "EMR service role arn:aws:iam::156793023547:role/EMR_DefaultRole is invalid" when launching a cluster then you don't have the necessary IAM roles.
 26 | To fix this install the [AWS CLI](https://aws.amazon.com/cli/) and run the command below.
 27 | 
 28 | ```
 29 | aws emr create-default-roles 
 30 | ```
 31 | 
 32 | ## Commands
 33 | 
 34 | ### elasticrawl init
 35 | 
 36 | The init command takes in an S3 bucket name and your AWS credentials. The S3 bucket will be created
 37 | and will store your data and logs.
 38 | 
 39 | ```bash
 40 | ~$ elasticrawl init your-s3-bucket
 41 | 
 42 | Enter AWS Access Key ID: ************
 43 | Enter AWS Secret Access Key: ************
 44 | 
 45 | ...
 46 | 
 47 | Bucket s3://elasticrawl-test created
 48 | Config dir /Users/ross/.elasticrawl created
 49 | Config complete
 50 | ```
 51 | 
 52 | ### elasticrawl parse
 53 | 
 54 | The parse command takes in the crawl name and an optional number of segments and files to parse.
 55 | 
 56 | ```bash
 57 | ~$ elasticrawl parse CC-MAIN-2015-48 --max-segments 2 --max-files 3
 58 | Segments
 59 | Segment: 1416400372202.67 Files: 150
 60 | Segment: 1416400372490.23 Files: 124
 61 | 
 62 | Job configuration
 63 | Crawl: CC-MAIN-2015-48 Segments: 2 Parsing: 3 files per segment
 64 | 
 65 | Cluster configuration
 66 | Master: 1 m1.medium  (Spot: 0.12)
 67 | Core:   2 m1.medium  (Spot: 0.12)
 68 | Task:   --
 69 | Launch job? (y/n)
 70 | y
 71 | 
 72 | Job: 1420124830792 Job Flow ID: j-2R3MFE6TWLIUB
 73 | ```
 74 | 
 75 | ### elasticrawl combine
 76 | 
 77 | The combine command takes in the results of previous parse jobs and produces a combined set of results.
 78 | 
 79 | ```bash
 80 | ~$ elasticrawl combine --input-jobs 1420124830792
 81 | Job configuration
 82 | Combining: 2 segments
 83 | 
 84 | Cluster configuration
 85 | Master: 1 m1.medium  (Spot: 0.12)
 86 | Core:   2 m1.medium  (Spot: 0.12)
 87 | Task:   --
 88 | Launch job? (y/n)
 89 | y
 90 | 
 91 | Job: 1420129496115 Job Flow ID: j-251GXDIZGK8HL
 92 | ```
 93 | 
 94 | ### elasticrawl status
 95 | 
 96 | The status command shows crawls and your job history.
 97 | 
 98 | ```bash
 99 | ~$ elasticrawl status
100 | Crawl Status
101 | CC-MAIN-2015-48 Segments: to parse 98, parsed 2, total 100
102 | 
103 | Job History (last 10)
104 | 1420124830792 Launched: 2015-01-01 15:07:10 Crawl: CC-MAIN-2015-48 Segments: 2 Parsing: 3 files per segment
105 | ```
106 | 
107 | ### elasticrawl reset
108 | 
109 | The reset comment resets a crawl so it is parsed again.
110 | 
111 | ```bash
112 | ~$ elasticrawl reset CC-MAIN-2015-48
113 | Reset crawl? (y/n)
114 | y
115 |  CC-MAIN-2015-48 Segments: to parse 100, parsed 0, total 100
116 | ```
117 | 
118 | ### elasticrawl destroy
119 | 
120 | The destroy command deletes your S3 bucket and the ~/.elasticrawl directory.
121 | 
122 | ```bash
123 | ~$ elasticrawl destroy
124 | 
125 | WARNING:
126 | Bucket s3://elasticrawl-test and its data will be deleted
127 | Config dir /home/vagrant/.elasticrawl will be deleted
128 | Delete? (y/n)
129 | y
130 | 
131 | Bucket s3://elasticrawl-test deleted
132 | Config dir /home/vagrant/.elasticrawl deleted
133 | Config deleted
134 | ```
135 | 
136 | ## Configuring Elasticrawl
137 | 
138 | The elasticrawl init command creates the ~/elasticrawl/ directory which
139 | contains
140 | 
141 | * [aws.yml](https://github.com/rossf7/.elasticrawl/blob/master/templates/aws.yml) -
142 | stores your AWS access credentials. Or you can set the environment
143 | variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
144 | 
145 | * [cluster.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/cluster.yml) -
146 | configures the EC2 instances that are launched to form your EMR cluster
147 | 
148 | * [jobs.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/jobs.yml) -
149 | stores your S3 bucket name and the config for the parse and combine jobs
150 | 
151 | ## Development
152 | 
153 | Elasticrawl is developed in Ruby and requires Ruby 2.1.0 or later (Ruby 2.3 is recommended). The sqlite3 and nokogiri gems have C extensions which mean you may need to install development headers.
154 | 
155 | [![Gem Version](https://badge.fury.io/rb/elasticrawl.png)](http://badge.fury.io/rb/elasticrawl)
156 | [![Code Climate](https://codeclimate.com/github/rossf7/elasticrawl.png)](https://codeclimate.com/github/rossf7/elasticrawl)
157 | [![Build Status](https://travis-ci.org/rossf7/elasticrawl.png?branch=master)](https://travis-ci.org/rossf7/elasticrawl) 2.0.0, 2.1.8, 2.2.4, 2.3.0
158 | 
159 | ## TODO
160 | 
161 | * Add support for Streaming and Pig jobs
162 | 
163 | ## Thanks
164 | 
165 | * Thanks to everyone at Common Crawl for making this awesome dataset available!
166 | * Thanks to Robert Slifka for the [elasticity](https://github.com/rslifka/elasticity)
167 | gem which provides a nice Ruby wrapper for the EMR REST API.
168 | * Thanks to Phusion for creating Traveling Ruby.
169 | 
170 | ## Contributing
171 | 
172 | 1. Fork it
173 | 2. Create your feature branch (`git checkout -b my-new-feature`)
174 | 3. Commit your changes (`git commit -am 'Add some feature'`)
175 | 4. Push to the branch (`git push origin my-new-feature`)
176 | 5. Create new Pull Request
177 | 
178 | ## License
179 | 
180 | This code is licensed under the MIT license.
181 | 


--------------------------------------------------------------------------------
/lib/elasticrawl/config.rb:
--------------------------------------------------------------------------------
  1 | module Elasticrawl
  2 |   # Represents the current configuration which is persisted to
  3 |   # ~/.elasticrawl/ and contains 3 configuration files.
  4 |   # 
  5 |   # aws.yml     - AWS access credentials unless stored in the environment
  6 |   #               variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
  7 |   # cluster.yml - Elastic MapReduce cluster config including instance groups.
  8 |   # jobs.yml    - Elastic MapReduce jobs config and the S3 bucket used for
  9 |   #               storing data and logs.
 10 |   #
 11 |   # This directory also contains the Elasticrawl SQLite database.
 12 |   class Config
 13 |     CONFIG_DIR = '.elasticrawl'
 14 |     DATABASE_FILE = 'elasticrawl.sqlite3'
 15 |     TEMPLATES_DIR = '../../templates'
 16 |     TEMPLATE_FILES = ['aws.yml', 'cluster.yml', 'jobs.yml']
 17 | 
 18 |     attr_reader :access_key_id
 19 |     attr_reader :secret_access_key
 20 | 
 21 |     # Sets the AWS access credentials needed for the S3 and EMR API calls.
 22 |     def initialize(access_key_id = nil, secret_access_key = nil)
 23 |       # Credentials have been provided to the init command.
 24 |       @access_key_id = access_key_id
 25 |       @secret_access_key = secret_access_key
 26 | 
 27 |       # If credentials are not set then check if they are available in aws.yml.
 28 |       if dir_exists?
 29 |         config = load_config('aws')
 30 |         key = config['access_key_id']
 31 |         secret = config['secret_access_key']
 32 | 
 33 |         @access_key_id ||= key unless key == 'ACCESS_KEY_ID'
 34 |         @secret_access_key ||= secret unless secret == 'SECRET_ACCESS_KEY'
 35 |       end
 36 | 
 37 |       # If credentials are still not set then check AWS environment variables.
 38 |       @access_key_id ||= ENV['AWS_ACCESS_KEY_ID']
 39 |       @secret_access_key ||= ENV['AWS_SECRET_ACCESS_KEY']
 40 | 
 41 |       # Set AWS credentials for use when accessing the S3 API.
 42 |       AWS.config(:access_key_id => @access_key_id,
 43 |                  :secret_access_key => @secret_access_key)
 44 |     end
 45 | 
 46 |     # Returns the location of the config directory.
 47 |     def config_dir
 48 |       File.join(Dir.home, CONFIG_DIR)
 49 |     end
 50 | 
 51 |     # Checks if the configuration directory exists.
 52 |     def dir_exists?
 53 |       Dir.exists?(config_dir)
 54 |     end
 55 | 
 56 |     # Loads a YAML configuration file.
 57 |     def load_config(config_file)
 58 |       if dir_exists?
 59 |         begin
 60 |           config_file = File.join(config_dir, "#{config_file}.yml")
 61 |           config = YAML::load(File.open(config_file))
 62 | 
 63 |         rescue StandardError => e
 64 |           raise FileAccessError, e.message
 65 |         end
 66 |       else
 67 |         raise ConfigDirMissingError, 'Config dir missing. Run init command'
 68 |       end
 69 |     end
 70 | 
 71 |     # Loads the sqlite database.  If no database exists it will be created
 72 |     # and the database migrations will be run.
 73 |     def load_database
 74 |       if dir_exists?
 75 |         config = {
 76 |           'adapter' => 'sqlite3',
 77 |           'database' => File.join(config_dir, DATABASE_FILE),
 78 |           'pool' => 5,
 79 |           'timeout' => 5000
 80 |         }
 81 | 
 82 |         begin
 83 |           ActiveRecord::Base.establish_connection(config)
 84 |           ActiveRecord::Migrator.migrate(File.join(File.dirname(__FILE__), \
 85 |             '../../db/migrate'), ENV['VERSION'] ? ENV['VERSION'].to_i : nil )
 86 | 
 87 |         rescue StandardError => e
 88 |           raise DatabaseAccessError, e.message
 89 |         end
 90 |       else
 91 |         raise ConfigDirMissingError, 'Config dir missing. Run init command'
 92 |       end
 93 |     end
 94 | 
 95 |     # Checks if a S3 bucket name is in use.
 96 |     def bucket_exists?(bucket_name)
 97 |       begin
 98 |         s3 = AWS::S3.new
 99 |         s3.buckets[bucket_name].exists?
100 | 
101 |       rescue AWS::S3::Errors::SignatureDoesNotMatch => e
102 |         raise AWSCredentialsInvalidError, 'AWS access credentials are invalid'
103 |       rescue AWS::Errors::Base => s3e
104 |         raise S3AccessError.new(s3e.http_response), s3e.message
105 |       end
106 |     end
107 | 
108 |     # Creates the S3 bucket and config directory. Deploys the config templates
109 |     # and creates the sqlite database.
110 |     def create(bucket_name)
111 |       create_bucket(bucket_name)
112 |       deploy_templates(bucket_name)
113 |       load_database
114 | 
115 |       status_message(bucket_name, 'created')
116 |     end
117 | 
118 |     # Deletes the S3 bucket and config directory.
119 |     def delete
120 |       bucket_name = load_config('jobs')['s3_bucket_name']
121 |       delete_bucket(bucket_name)
122 |       delete_config_dir
123 |       
124 |       status_message(bucket_name, 'deleted')
125 |     end
126 | 
127 |     # Displayed by destroy command to confirm deletion.
128 |     def delete_warning
129 |       bucket_name = load_config('jobs')['s3_bucket_name']
130 | 
131 |       message = ['WARNING:']
132 |       message << "Bucket s3://#{bucket_name} and its data will be deleted"
133 |       message << "Config dir #{config_dir} will be deleted"
134 | 
135 |       message.join("\n")
136 |     end
137 | 
138 |     # Displayed by init command.
139 |     def access_key_prompt
140 |       prompt = "Enter AWS Access Key ID:"
141 |       prompt += " [#{@access_key_id}]" if @access_key_id.present?
142 | 
143 |       prompt
144 |     end
145 | 
146 |     # Displayed by init command.
147 |     def secret_key_prompt
148 |       prompt = "Enter AWS Secret Access Key:"
149 |       prompt += " [#{@secret_access_key}]" if @secret_access_key.present?
150 | 
151 |       prompt
152 |     end
153 | 
154 |   private
155 |     # Creates a bucket using the S3 API.
156 |     def create_bucket(bucket_name)
157 |       begin
158 |         s3 = AWS::S3.new
159 |         s3.buckets.create(bucket_name)
160 | 
161 |       rescue AWS::Errors::Base => s3e
162 |         raise S3AccessError.new(s3e.http_response), s3e.message
163 |       end
164 |     end
165 | 
166 |     # Deletes a bucket and its contents using the S3 API.
167 |     def delete_bucket(bucket_name)
168 |       begin
169 |         s3 = AWS::S3.new
170 |         bucket = s3.buckets[bucket_name]
171 |         bucket.delete!
172 | 
173 |       rescue AWS::Errors::Base => s3e
174 |         raise S3AccessError.new(s3e.http_response), s3e.message
175 |       end
176 |     end
177 | 
178 |     # Creates config directory and copies config templates into it.
179 |     # Saves S3 bucket name to jobs.yml and AWS credentials to aws.yml.
180 |     def deploy_templates(bucket_name)
181 |       begin
182 |         Dir.mkdir(config_dir, 0755) if dir_exists? == false
183 | 
184 |         TEMPLATE_FILES.each do |template_file|
185 |           FileUtils.cp(File.join(File.dirname(__FILE__), TEMPLATES_DIR, template_file),
186 |                        File.join(config_dir, template_file))
187 |         end
188 | 
189 |         save_config('jobs', { 'BUCKET_NAME' => bucket_name })
190 |         save_aws_config
191 | 
192 |       rescue StandardError => e
193 |         raise FileAccessError, e.message
194 |       end
195 |     end
196 | 
197 |     # Saves AWS access credentials to aws.yml unless they are configured as
198 |     # environment variables.
199 |     def save_aws_config
200 |       env_key = ENV['AWS_ACCESS_KEY_ID']
201 |       env_secret = ENV['AWS_SECRET_ACCESS_KEY']
202 | 
203 |       creds = {}
204 |       creds['ACCESS_KEY_ID'] = @access_key_id unless @access_key_id == env_key
205 |       creds['SECRET_ACCESS_KEY'] = @secret_access_key \
206 |         unless @secret_access_key == env_secret
207 | 
208 |       save_config('aws', creds)
209 |     end
210 | 
211 |     # Saves config values by overwriting placeholder values in template.
212 |     def save_config(template, params)
213 |       config_file = File.join(config_dir, "#{template}.yml")
214 |       config = File.read(config_file)
215 | 
216 |       params.map { |key, value| config = config.gsub(key, value) }
217 | 
218 |       File.open(config_file, 'w') { |file| file.write(config) }
219 |     end
220 | 
221 |     # Deletes the config directory including its contents.
222 |     def delete_config_dir
223 |       begin
224 |         FileUtils.rm_r(config_dir) if dir_exists?
225 | 
226 |       rescue StandardError => e
227 |         raise FileAccessError, e.message
228 |       end
229 |     end
230 | 
231 |     # Notifies user of results of init or destroy commands.
232 |     def status_message(bucket_name, state)
233 |       message = ['', "Bucket s3://#{bucket_name} #{state}"]
234 |       message << "Config dir #{config_dir} #{state}"
235 | 
236 |       state = 'complete' if state == 'created'
237 |       message << "Config #{state}"
238 | 
239 |       message.join("\n")
240 |     end
241 |   end
242 | end
243 | 


--------------------------------------------------------------------------------