├── Gemfile ├── .travis.yml ├── lib ├── elasticrawl │ ├── version.rb │ ├── error.rb │ ├── crawl_segment.rb │ ├── job_step.rb │ ├── job.rb │ ├── combine_job.rb │ ├── parse_job.rb │ ├── crawl.rb │ ├── cluster.rb │ └── config.rb └── elasticrawl.rb ├── db └── migrate │ ├── 201412311554_add_file_count_to_crawl_segments.rb │ ├── 201401051536_create_crawls.rb │ ├── 201401141606_create_job_steps.rb │ ├── 201401101723_create_jobs.rb │ └── 201401051855_create_crawl_segments.rb ├── spec ├── fixtures │ ├── aws.yml │ ├── warc.paths │ ├── jobs.yml │ └── cluster.yml ├── unit │ ├── job_spec.rb │ ├── config_spec.rb │ ├── crawl_segment_spec.rb │ ├── cluster_spec.rb │ ├── job_step_spec.rb │ ├── combine_job_spec.rb │ ├── crawl_spec.rb │ └── parse_job_spec.rb └── spec_helper.rb ├── Rakefile ├── Cheffile ├── .gitignore ├── templates ├── aws.yml ├── jobs.yml └── cluster.yml ├── CHANGELOG.md ├── Cheffile.lock ├── LICENSE ├── elasticrawl.gemspec ├── Vagrantfile ├── bin └── elasticrawl └── README.md /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 2.2.6 4 | - 2.3.3 5 | - 2.4.0 6 | -------------------------------------------------------------------------------- /lib/elasticrawl/version.rb: -------------------------------------------------------------------------------- 1 | module Elasticrawl 2 | VERSION = '1.1.8' 3 | end 4 | -------------------------------------------------------------------------------- /db/migrate/201412311554_add_file_count_to_crawl_segments.rb: -------------------------------------------------------------------------------- 1 | class AddFileCountToCrawlSegments < ActiveRecord::Migration 2 | def change 3 | add_column(:crawl_segments, :file_count, :integer) 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /spec/fixtures/aws.yml: -------------------------------------------------------------------------------- 1 | # Configures the AWS credentials used when accessing the AWS EMR and S3 APIs. 2 | # This file is populated by the elasticrawl init command. 
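# The values below are deliberately fake placeholders; the specs stub the
# config directory to spec/fixtures/ so no real credentials are needed.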
3 | access_key_id: 'ACCESS_KEY_ID' 4 | secret_access_key: 'SECRET_ACCESS_KEY' 5 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler/gem_tasks' 2 | require 'rspec/core/rake_task' 3 | 4 | namespace :spec do 5 | RSpec::Core::RakeTask.new(:unit) do |t| 6 | t.pattern = 'spec/unit/*_spec.rb' 7 | end 8 | end 9 | 10 | desc 'Run unit specs' 11 | task :default => 'spec:unit' 12 | -------------------------------------------------------------------------------- /db/migrate/201401051536_create_crawls.rb: -------------------------------------------------------------------------------- 1 | class CreateCrawls < ActiveRecord::Migration 2 | def change 3 | create_table :crawls do |t| 4 | t.string :crawl_name 5 | t.timestamps(:null => false) 6 | end 7 | 8 | add_index(:crawls, :crawl_name, :unique => true) 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /Cheffile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | #^syntax detection 3 | 4 | site "https://supermarket.getchef.com/api/v1" 5 | 6 | cookbook "apt", "2.9.2" 7 | cookbook "build-essential", "2.2.4" 8 | cookbook "git", "4.3.5" 9 | cookbook "ruby_rbenv", "1.0.1" 10 | cookbook "ruby_build", "0.8.0" 11 | cookbook "vim", "2.0.0" 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | 19 | .vagrant 20 | cookbooks 21 | spec/fixtures/elasticrawl.sqlite3 22 | -------------------------------------------------------------------------------- /db/migrate/201401141606_create_job_steps.rb: -------------------------------------------------------------------------------- 1 | class CreateJobSteps < ActiveRecord::Migration 2 | def change 3 | create_table :job_steps do |t| 4 | t.references :job 5 | t.references :crawl_segment 6 | t.text :input_paths 7 | t.text :output_path 8 | t.timestamps(:null => false) 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /templates/aws.yml: -------------------------------------------------------------------------------- 1 | # Configures the AWS access credentials used when calling the AWS 2 | # Elastic MapReduce and S3 APIs. This file is populated by the init command. 3 | # 4 | # Instead of configuring this file you can set the environment variables 5 | # AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY. 
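# For example (illustrative shell session; the values are the placeholder
# credentials used in the AWS documentation, not real keys):
#   export AWS_ACCESS_KEY_ID='AKIAIOSFODNN7EXAMPLE'
#   export AWS_SECRET_ACCESS_KEY='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'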
6 | access_key_id: 'ACCESS_KEY_ID' 7 | secret_access_key: 'SECRET_ACCESS_KEY' 8 | -------------------------------------------------------------------------------- /db/migrate/201401101723_create_jobs.rb: -------------------------------------------------------------------------------- 1 | class CreateJobs < ActiveRecord::Migration 2 | def change 3 | create_table :jobs do |t| 4 | t.string :type 5 | t.string :job_name 6 | t.string :job_desc 7 | t.integer :max_files 8 | t.string :job_flow_id 9 | t.timestamps(:null => false) 10 | end 11 | 12 | add_index(:jobs, :job_name, :unique => true) 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## v1.1.6 / 2016-06-26 2 | * Change CommonCrawl bucket to s3://commoncrawl 3 | 4 | ## v1.1.3 / 2015-02-04 5 | * Upgrade Traveling Ruby to 20150204-2.1.5 6 | 7 | ## v1.1.2 / 2015-01-27 8 | * Improve error handling for S3 API calls 9 | 10 | ## v1.1.1 / 2015-01-27 11 | * Use Traveling Ruby to build deploy packages 12 | 13 | ## v1.1.0 / 2015-01-03 14 | * Show file counts for each segment 15 | 16 | ## v1.0.0 / 2014-02-04 17 | * Initial release 18 | -------------------------------------------------------------------------------- /spec/unit/job_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Elasticrawl::Job, type: :model do 4 | it { should have_many(:job_steps) } 5 | it { should have_db_column(:type).of_type(:string) } 6 | it { should have_db_column(:job_name).of_type(:string) } 7 | it { should have_db_column(:job_desc).of_type(:string) } 8 | it { should have_db_column(:max_files).of_type(:integer) } 9 | it { should have_db_column(:job_flow_id).of_type(:string) } 10 | end 11 | -------------------------------------------------------------------------------- /db/migrate/201401051855_create_crawl_segments.rb: -------------------------------------------------------------------------------- 1 | class CreateCrawlSegments < ActiveRecord::Migration 2 | def change 3 | create_table :crawl_segments do |t| 4 | t.references :crawl 5 | t.string :segment_name 6 | t.string :segment_s3_uri 7 | t.datetime :parse_time 8 | t.timestamps(:null => false) 9 | end 10 | 11 | add_index(:crawl_segments, :segment_name, :unique => true) 12 | add_index(:crawl_segments, :segment_s3_uri, :unique => true) 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /spec/unit/config_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Elasticrawl::Config do 4 | describe '#load_config' do 5 | subject { Elasticrawl::Config.new } 6 | 7 | it 'should return a hash of config data' do 8 | config_data = subject.load_config('jobs') 9 | expect(config_data).to be_a Hash 10 | end 11 | 12 | it 'should load yaml config file' do 13 | config_data = subject.load_config('jobs') 14 | expect(config_data['s3_bucket_name']).to eq 'elasticrawl' 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /Cheffile.lock: -------------------------------------------------------------------------------- 1 | SITE 2 | remote: https://supermarket.getchef.com/api/v1 3 | specs: 4 | apt (2.9.2) 5 | build-essential (2.2.4) 6 | chef_handler (1.2.0) 7 | dmg (2.3.0) 8 | git (4.3.5) 9 | build-essential (>= 0.0.0) 10 | dmg (>= 0.0.0) 
11 | windows (>= 0.0.0) 12 | yum-epel (>= 0.0.0) 13 | ruby_build (0.8.0) 14 | ruby_rbenv (1.0.1) 15 | ruby_build (>= 0.0.0) 16 | vim (2.0.0) 17 | windows (1.39.0) 18 | chef_handler (>= 0.0.0) 19 | yum (3.8.2) 20 | yum-epel (0.6.5) 21 | yum (~> 3.2) 22 | 23 | DEPENDENCIES 24 | apt (= 2.9.2) 25 | build-essential (= 2.2.4) 26 | git (= 4.3.5) 27 | ruby_build (= 0.8.0) 28 | ruby_rbenv (= 1.0.1) 29 | vim (= 2.0.0) 30 | 31 | -------------------------------------------------------------------------------- /spec/fixtures/warc.paths: -------------------------------------------------------------------------------- 1 | crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00001-ip-10-235-23-156.ec2.internal.warc.gz 2 | crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz 3 | crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00003-ip-10-235-23-156.ec2.internal.warc.gz 4 | crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz 5 | crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz 6 | crawl-data/CC-MAIN-2014-49/segments/1416400372542.20/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz 7 | -------------------------------------------------------------------------------- /lib/elasticrawl.rb: -------------------------------------------------------------------------------- 1 | require 'aws-sdk' 2 | require 'active_record' 3 | require 'active_support' 4 | require 'elasticity' 5 | require 'highline/import' 6 | require 'thor' 7 | 8 | module Elasticrawl 9 | # S3 locations 10 | COMMON_CRAWL_BUCKET = 'commoncrawl' 11 | COMMON_CRAWL_PATH = 'crawl-data' 12 | SEGMENTS_PATH = 'segments' 13 | WARC_PATHS = 'warc.paths.gz' 14 | MAX_SEGMENTS = 256 15 | 16 | require 'elasticrawl/version' 17 | 18 | require 'elasticrawl/config' 19 | require 'elasticrawl/error' 20 | 21 | require 'elasticrawl/cluster' 22 | require 'elasticrawl/crawl' 23 | require 'elasticrawl/crawl_segment' 24 | require 'elasticrawl/job' 25 | require 'elasticrawl/combine_job' 26 | require 'elasticrawl/parse_job' 27 | require 'elasticrawl/job_step' 28 | end 29 | -------------------------------------------------------------------------------- /lib/elasticrawl/error.rb: -------------------------------------------------------------------------------- 1 | module Elasticrawl 2 | # Base error class extends standard error. 3 | class Error < StandardError 4 | attr_reader :http_response 5 | 6 | def initialize(response = nil) 7 | @http_response = response 8 | end 9 | end 10 | 11 | # AWS access credentials are invalid. 12 | class AWSCredentialsInvalidError < Error; end 13 | 14 | # Config directory does not exist. 15 | class ConfigDirMissingError < Error; end 16 | 17 | # Database error accessing sqlite database. 18 | class DatabaseAccessError < Error; end 19 | 20 | # Error accessing AWS Elastic MapReduce API. 21 | class ElasticMapReduceAccessError < Error; end 22 | 23 | # Error accessing config directory. 24 | class FileAccessError < Error; end 25 | 26 | # Error accessing AWS S3 API. 
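# Raised e.g. when downloading the warc.paths.gz segment listing fails (see Crawl#warc_paths).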
27 | class S3AccessError < Error; end 28 | end 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Ross Fairbanks 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'elasticrawl' 2 | require 'rspec' 3 | require 'database_cleaner' 4 | require 'shoulda-matchers' 5 | 6 | RSpec.configure do |config| 7 | # Run each test in a transaction and rollback data on completion. 8 | DatabaseCleaner.strategy = :transaction 9 | 10 | # Use Shoulda matchers for schema tests. 11 | config.include(Shoulda::Matchers::ActiveRecord, type: :model) 12 | 13 | config.before(:each) do 14 | # Stub S3 call to get WARC file paths 15 | warc_paths = IO.read(File.join(File.dirname(__FILE__), 'fixtures', 'warc.paths')) 16 | allow_any_instance_of(Elasticrawl::Crawl).to receive(:warc_paths).and_return(warc_paths) 17 | 18 | # Load config from spec/fixtures/ rather than ~/.elasticrawl/ 19 | config_dir = File.join(File.dirname(__FILE__), 'fixtures') 20 | allow_any_instance_of(Elasticrawl::Config).to receive(:config_dir).and_return(config_dir) 21 | 22 | # Load sqlite database. 
For testing this is stored at spec/fixtures/elasticrawl.sqlite3 23 | config = Elasticrawl::Config.new 24 | config.load_database 25 | 26 | DatabaseCleaner.start 27 | end 28 | 29 | config.after(:each) do 30 | DatabaseCleaner.clean 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /spec/unit/crawl_segment_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Elasticrawl::CrawlSegment, type: :model do 4 | it { should belong_to(:crawl) } 5 | it { should have_many(:job_steps) } 6 | it { should have_db_column(:segment_name).of_type(:string) } 7 | it { should have_db_column(:segment_s3_uri).of_type(:string) } 8 | it { should have_db_column(:parse_time).of_type(:datetime) } 9 | it { should have_db_column(:file_count).of_type(:integer) } 10 | 11 | describe '.create_segment' do 12 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 13 | let(:segment_name) { '1416400372202.67' } 14 | let(:file_count) { 3 } 15 | let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' } 16 | subject { Elasticrawl::CrawlSegment.create_segment(crawl, 17 | segment_name, 18 | file_count) } 19 | it 'should have a segment name' do 20 | expect(subject.segment_name).to eq segment_name 21 | end 22 | 23 | it 'should have an s3 uri' do 24 | expect(subject.segment_s3_uri).to eq \ 25 | "s3://commoncrawl/crawl-data/#{crawl.crawl_name}/segments/#{segment_name}/" 26 | end 27 | 28 | it 'should have a file count' do 29 | expect(subject.file_count).to eq file_count 30 | end 31 | 32 | it 'should have a segment description' do 33 | expect(subject.segment_desc).to eq segment_desc 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /lib/elasticrawl/crawl_segment.rb: -------------------------------------------------------------------------------- 1 | module Elasticrawl 2 | # Represents a segment of a web crawl released by the Common Crawl Foundation. 3 | # Each segment contains archive, metadata and text files. 4 | class CrawlSegment < ActiveRecord::Base 5 | belongs_to :crawl 6 | has_many :job_steps 7 | 8 | # Description shows name and number of files in the segment. 9 | def segment_desc 10 | "Segment: #{segment_name} Files: #{file_count}" 11 | end 12 | 13 | # Creates a crawl segment based on its S3 path if it does not exist. 14 | def self.create_segment(crawl, segment_name, file_count) 15 | s3_uri = build_s3_uri(crawl.crawl_name, segment_name) 16 | 17 | CrawlSegment.where(:crawl_id => crawl.id, 18 | :segment_name => segment_name, 19 | :segment_s3_uri => s3_uri, 20 | :file_count => file_count).first_or_create 21 | end 22 | 23 | private 24 | # Generates the S3 location where this segment is stored. 25 | def self.build_s3_uri(crawl_name, segment_name) 26 | s3_path = ['', 27 | Elasticrawl::COMMON_CRAWL_PATH, 28 | crawl_name, 29 | Elasticrawl::SEGMENTS_PATH, 30 | segment_name, 31 | ''] 32 | 33 | URI::Generic.build(:scheme => 's3', 34 | :host => Elasticrawl::COMMON_CRAWL_BUCKET, 35 | :path => s3_path.join('/')).to_s 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /spec/fixtures/jobs.yml: -------------------------------------------------------------------------------- 1 | # Configures the AWS Elastic MapReduce jobs launched against the Common Crawl 2 | # corpus. 3 | 4 | # An S3 bucket is created by the init command and is used to store data and logs. 
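# Bucket names are globally unique across S3, so a real setup needs its own
# name, e.g. 'my-elasticrawl-data' (hypothetical).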
5 | s3_bucket_name: 'elasticrawl' 6 | 7 | # A parse step is created per Common Crawl segment. A combine step takes the 8 | # results from multiple segments to create a single set of output files. 9 | 10 | # The parse input filter is used to specify the Common Crawl file type. 11 | 12 | # WARC: 'warc/*.warc.gz' - Full HTTP requests and responses. 13 | # WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files. 14 | # WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files. 15 | 16 | # The EMR config is an XML file that sets Hadoop properties. If a config file 17 | # is specified then a bootstrap action is run on each node to apply it. 18 | steps: 19 | # Parse step for the Example Elasticrawl JAR. This does a word count 20 | # against the text extractions of the corpus. 21 | parse: 22 | jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar' 23 | class: 'com.rossfairbanks.elasticrawl.examples.WordCount' 24 | input_filter: 'wet/*.warc.wet.gz' 25 | emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml' 26 | # Combine step for the Example Elasticrawl JAR. 27 | combine: 28 | jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar' 29 | class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner' 30 | input_filter: 'part-*' 31 | emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml' 32 | -------------------------------------------------------------------------------- /templates/jobs.yml: -------------------------------------------------------------------------------- 1 | # Configures the AWS Elastic MapReduce jobs launched against the Common Crawl 2 | # corpus. 3 | 4 | # An S3 bucket is created by the init command and is used to store data and logs. 5 | s3_bucket_name: 'BUCKET_NAME' 6 | 7 | # A parse step is created per Common Crawl segment. A combine step takes the 8 | # results from multiple segments to create a single set of output files. 9 | 10 | # The parse input filter is used to specify the Common Crawl file type. 11 | 12 | # WARC: 'warc/*.warc.gz' - Full HTTP requests and responses. 13 | # WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files. 14 | # WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files. 15 | 16 | # The EMR config is an XML file that sets Hadoop properties. If a config file 17 | # is specified then a bootstrap action is run on each node to apply it. 18 | steps: 19 | # Parse step for the Example Elasticrawl JAR. This does a word count 20 | # against the text extractions of the corpus. 21 | parse: 22 | jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar' 23 | class: 'com.rossfairbanks.elasticrawl.examples.WordCount' 24 | input_filter: 'wet/*.warc.wet.gz' 25 | emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml' 26 | # Combine step for the Example Elasticrawl JAR. 
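# Its 'part-*' input filter matches the part-nnnnn files that Hadoop writes as parse step output.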
27 | combine: 28 | jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar' 29 | class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner' 30 | input_filter: 'part-*' 31 | emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml' 32 | -------------------------------------------------------------------------------- /spec/unit/cluster_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Elasticrawl::Cluster do 4 | describe '#create_job_flow' do 5 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 6 | let(:job) { Elasticrawl::ParseJob.new } 7 | let(:cluster) { Elasticrawl::Cluster.new } 8 | subject { cluster.create_job_flow(job) } 9 | 10 | before do 11 | job.set_segments(crawl.crawl_segments) 12 | end 13 | 14 | it 'should be an Elasticity::JobFlow' do 15 | expect(subject).to be_a Elasticity::JobFlow 16 | end 17 | 18 | it 'should have a job flow name' do 19 | expect(subject.name).to eq "Job: #{job.job_name} #{job.job_desc}" 20 | end 21 | 22 | it 'should have a log uri' do 23 | expect(subject.log_uri).to eq job.log_uri 24 | end 25 | 26 | it 'should have an ec2 key name' do 27 | expect(subject.ec2_key_name).to eq 'elasticrawl' 28 | end 29 | 30 | it 'should have a placement az name' do 31 | expect(subject.placement).to eq 'us-east-1c' 32 | end 33 | 34 | it 'should have an ami version' do 35 | expect(subject.ami_version).to eq 'latest' 36 | end 37 | end 38 | 39 | describe '#cluster_desc' do 40 | let(:cluster_desc) { 41 | cluster_desc = <<-HERE 42 | Cluster configuration 43 | Master: 1 m1.medium (Spot: 0.12) 44 | Core: 2 m1.medium (Spot: 0.12) 45 | Task: -- 46 | HERE 47 | } 48 | subject { Elasticrawl::Cluster.new } 49 | 50 | it 'should describe configured instance groups' do 51 | expect(subject.cluster_desc).to eq cluster_desc 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /elasticrawl.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'elasticrawl/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = 'elasticrawl' 8 | spec.version = Elasticrawl::VERSION 9 | spec.authors = ['Ross Fairbanks'] 10 | spec.email = ['ross@rossfairbanks.com'] 11 | spec.summary = %q{Launch AWS Elastic MapReduce jobs that process Common Crawl data.} 12 | spec.description = %q{Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process Common Crawl data.} 13 | spec.homepage = 'https://github.com/rossf7/elasticrawl' 14 | spec.license = 'MIT' 15 | 16 | spec.files = `git ls-files`.split($/) 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ['lib'] 20 | 21 | spec.add_dependency 'activerecord', '~> 4.2.5' 22 | spec.add_dependency 'activesupport', '~> 4.2.5' 23 | spec.add_dependency 'aws-sdk', '~> 1.66.0' 24 | spec.add_dependency 'elasticity', '~> 6.0.5' 25 | spec.add_dependency 'highline', '~> 1.7.8' 26 | spec.add_dependency 'sqlite3', '~> 1.3.11' 27 | spec.add_dependency 'thor', '~> 0.19.1' 28 | 29 | spec.add_development_dependency 'rake', '~> 10.4.2' 30 | spec.add_development_dependency 'bundler', '~> 1.14.4' 31 | spec.add_development_dependency 'rspec', '~> 3.4.0' 32 | spec.add_development_dependency 'database_cleaner', '~> 
1.5.1' 33 | spec.add_development_dependency 'shoulda-matchers', '~> 3.0.1' 34 | end 35 | -------------------------------------------------------------------------------- /lib/elasticrawl/job_step.rb: -------------------------------------------------------------------------------- 1 | module Elasticrawl 2 | # Represents an Elastic MapReduce job flow step. For a parse job this will 3 | # process a single Common Crawl segment. For a combine job a single step 4 | # will aggregate the results of multiple parse jobs. 5 | class JobStep < ActiveRecord::Base 6 | belongs_to :job 7 | belongs_to :crawl_segment 8 | 9 | # Returns a custom jar step that is configured with the jar location, 10 | # class name and input and output paths. 11 | # 12 | # For parse jobs optionally specifies the maximum # of Common Crawl 13 | # data files to process before the job exits. 14 | def job_flow_step(job_config) 15 | jar = job_config['jar'] 16 | max_files = self.job.max_files 17 | 18 | step_args = [] 19 | step_args[0] = job_config['class'] 20 | step_args[1] = self.input_paths 21 | step_args[2] = self.output_path 22 | # All arguments must be strings. 23 | step_args[3] = max_files.to_s if max_files.present? 24 | 25 | step = Elasticity::CustomJarStep.new(jar) 26 | step.name = set_step_name 27 | step.arguments = step_args 28 | 29 | step 30 | end 31 | 32 | private 33 | # Sets the Elastic MapReduce job flow step name based on the type of job it 34 | # belongs to. 35 | def set_step_name 36 | case self.job.type 37 | when 'Elasticrawl::ParseJob' 38 | if self.crawl_segment.present? 39 | max_files = self.job.max_files || 'all' 40 | "#{self.crawl_segment.segment_desc} Parsing: #{max_files}" 41 | end 42 | when 'Elasticrawl::CombineJob' 43 | paths = self.input_paths.split(',') 44 | "Combining #{paths.count} jobs" 45 | end 46 | end 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /spec/fixtures/cluster.yml: -------------------------------------------------------------------------------- 1 | # Configures the Elastic MapReduce cluster that is launched to run parse and 2 | # combine jobs. The list of EC2 instance types can be found at 3 | # http://aws.amazon.com/ec2/instance-types/#instance-details 4 | 5 | # Using spot instances is recommended to reduce costs. However if the spot 6 | # price rises above your bid price the cluster may be terminated. Elasticrawl 7 | # tries to reduce the effect of this by parsing each Common Crawl segment 8 | # in a separate job flow step. 9 | 10 | # The master node manages the cluster. 11 | master_instance_group: 12 | instance_type: m1.medium 13 | use_spot_instances: true 14 | bid_price: 0.120 15 | 16 | # Core nodes run map and reduce tasks and store data using HDFS. 17 | core_instance_group: 18 | instance_type: m1.medium 19 | instance_count: 2 20 | use_spot_instances: true 21 | bid_price: 0.120 22 | 23 | # Task nodes are optional and only run map and reduce tasks. 24 | task_instance_group: 25 | instance_type: m1.small 26 | instance_count: 0 27 | use_spot_instances: true 28 | bid_price: 0.080 29 | 30 | # Array of bootstrap scripts that will be applied when the cluster nodes are 31 | # initialized. The example installs the Ganglia distributed monitoring system. 32 | bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia'] 33 | 34 | # Specifying an EC2 key pair allows SSH access to the master node. This also 35 | # allows accessing the Hadoop Web UI over an SSH tunnel. 
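# e.g. (hypothetical key file and master node address; EMR masters accept
# SSH logins as the hadoop user):
#   ssh -i ~/.ssh/elasticrawl.pem hadoop@ec2-xx-xx-xx-xx.compute-1.amazonaws.com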
36 | ec2_key_name: 'elasticrawl' 37 | 38 | # Availability Zone (AZ) to launch instances in. An AZ in the US-East region is 39 | # recommended since the Common Crawl corpus is stored there. Otherwise inter 40 | # region data transfer charges will apply. 41 | placement: 'us-east-1c' 42 | 43 | # The AMI version to use when launching instances. 44 | emr_ami_version: 'latest' 45 | -------------------------------------------------------------------------------- /templates/cluster.yml: -------------------------------------------------------------------------------- 1 | # Configures the Elastic MapReduce cluster that is launched to run parse and 2 | # combine jobs. The list of EC2 instance types can be found at 3 | # http://aws.amazon.com/ec2/instance-types/#instance-details 4 | 5 | # Using spot instances is recommended to reduce costs. However if the spot 6 | # price rises above your bid price the cluster may be terminated. Elasticrawl 7 | # tries to reduce the effect of this by parsing each Common Crawl segment 8 | # in a separate job flow step. 9 | 10 | # The master node manages the cluster. 11 | master_instance_group: 12 | instance_type: m1.medium 13 | use_spot_instances: true 14 | bid_price: 0.120 15 | 16 | # Core nodes run map and reduce tasks and store data using HDFS. 17 | core_instance_group: 18 | instance_type: m1.medium 19 | instance_count: 2 20 | use_spot_instances: true 21 | bid_price: 0.120 22 | 23 | # Task nodes are optional and only run map and reduce tasks. 24 | task_instance_group: 25 | instance_type: m1.small 26 | instance_count: 0 27 | use_spot_instances: true 28 | bid_price: 0.080 29 | 30 | # Array of bootstrap scripts that will be applied when the cluster nodes are 31 | # initialized. The example installs the Ganglia distributed monitoring system. 32 | bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia'] 33 | 34 | # Specifying an EC2 key pair allows SSH access to the master node. This also 35 | # allows accessing the Hadoop Web UI over an SSH tunnel. 36 | ec2_key_name: # 'key-pair-name' 37 | 38 | # Availability Zone (AZ) to launch instances in. An AZ in the US-East region is 39 | # recommended since the Common Crawl corpus is stored there. Otherwise inter 40 | # region data transfer charges will apply. 41 | placement: 'us-east-1a' 42 | 43 | # The AMI version to use when launching instances. 44 | emr_ami_version: 'latest' 45 | 46 | # Default instance profile 47 | job_flow_role: 'EMR_EC2_DefaultRole' 48 | 49 | # Default service role 50 | service_role: 'EMR_DefaultRole' 51 | 52 | # Subnet ID. Required for new Amazon accounts launching more powerful instance types. 
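# e.g. ec2_subnet_id: 'subnet-0abc1234' (hypothetical ID of a subnet in your VPC)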
53 | ec2_subnet_id: 'subnet-name' 54 | -------------------------------------------------------------------------------- /spec/unit/job_step_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Elasticrawl::JobStep, type: :model do 4 | it { should belong_to(:job) } 5 | it { should belong_to(:crawl_segment) } 6 | it { should have_db_column(:input_paths).of_type(:text) } 7 | it { should have_db_column(:output_path).of_type(:text) } 8 | 9 | describe '#job_flow_step' do 10 | let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620', 11 | :max_files => 3) } 12 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 13 | let(:crawl_segment) { crawl.crawl_segments[0] } 14 | let(:input_paths) { 15 | 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1368696381249/wet/*.warc.wet.gz' 16 | } 17 | let(:output_path) { 18 | 's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/' 19 | } 20 | let(:config) { 21 | { 'jar' => 's3://elasticrawl/jar/elasticrawl-example-0.0.1.jar', 22 | 'class' => 'com.rossfairbanks.commoncrawl.elasticrawl.ParserDriver' 23 | } 24 | } 25 | 26 | let(:job_step) { Elasticrawl::JobStep.create(:job => job, 27 | :crawl_segment => crawl_segment, 28 | :input_paths => input_paths, 29 | :output_path => output_path) } 30 | subject { job_step.job_flow_step(config) } 31 | 32 | it 'should be a CustomJarStep' do 33 | expect(subject).to be_a Elasticity::CustomJarStep 34 | end 35 | 36 | it 'should have a jar location' do 37 | expect(subject.jar).to eq config['jar'] 38 | end 39 | 40 | it 'should have 4 jar args' do 41 | expect(subject.arguments.count).to eq 4 42 | end 43 | 44 | it 'should have a class argument' do 45 | expect(subject.arguments[0]).to eq config['class'] 46 | end 47 | 48 | it 'should have an input path arg' do 49 | expect(subject.arguments[1]).to eq input_paths 50 | end 51 | 52 | it 'should have an output path arg' do 53 | expect(subject.arguments[2]).to eq output_path 54 | end 55 | 56 | it 'should have a max files arg' do 57 | expect(subject.arguments[3]).to eq '3' 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | Vagrant.configure("2") do |config| 5 | # All Vagrant configuration is done here. The most common configuration 6 | # options are documented and commented below. For a complete reference, 7 | # please see the online documentation at vagrantup.com. 8 | 9 | # Increase RAM to 1 GB 10 | config.vm.provider "virtualbox" do |vbox| 11 | vbox.customize ["modifyvm", :id, "--memory", 1024] 12 | end 13 | 14 | # Elasticrawl launches Hadoop jobs for the CommonCrawl dataset using the AWS EMR service. 
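# The VM is created and provisioned with the standard `vagrant up` command.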
15 | config.vm.define :elasticrawl do |elasticrawl| 16 | 17 | 18 | # Ubuntu Server 14.04 LTS 19 | elasticrawl.vm.box = "ubuntu/trusty64" 20 | 21 | # Network config 22 | elasticrawl.vm.network :public_network 23 | 24 | # Synced folder for creating deploy packages 25 | elasticrawl.vm.synced_folder "../traveling-elasticrawl/", "/traveling-elasticrawl/" 26 | 27 | # Provision using Chef Solo 28 | elasticrawl.vm.provision "chef_solo" do |chef| 29 | chef.cookbooks_path = "cookbooks" 30 | chef.add_recipe "apt" 31 | chef.add_recipe "build-essential" 32 | chef.add_recipe "ruby_build" 33 | chef.add_recipe "ruby_rbenv::user" 34 | chef.add_recipe "git" 35 | chef.add_recipe "vim" 36 | 37 | chef.json = { 38 | "rbenv" => { 39 | "user_installs" => [ 40 | { 41 | "user" => "vagrant", 42 | "rubies" => ["2.0.0-p648", "2.1.8", "2.2.4", "2.3.0"], 43 | "global" => "2.2.4", 44 | "gems" => { 45 | "2.0.0-p648" => [ 46 | { "name" => "bundler", 47 | "version" => "1.11.2" } 48 | ], 49 | "2.1.8" => [ 50 | { "name" => "bundler", 51 | "version" => "1.11.2" } 52 | ], 53 | "2.2.4" => [ 54 | { "name" => "bundler", 55 | "version" => "1.11.2" } 56 | ], 57 | "2.3.0" => [ 58 | { "name" => "bundler", 59 | "version" => "1.11.2" } 60 | ] 61 | } 62 | } 63 | ] 64 | } 65 | } 66 | 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /lib/elasticrawl/job.rb: -------------------------------------------------------------------------------- 1 | module Elasticrawl 2 | # The base job class that is extended by ParseJob and CombineJob. 3 | class Job < ActiveRecord::Base 4 | has_many :job_steps 5 | 6 | # Displays a confirmation message showing the configuration of the 7 | # Elastic MapReduce job flow and cluster. 8 | def confirm_message 9 | cluster = Cluster.new 10 | 11 | case self.type 12 | when 'Elasticrawl::ParseJob' 13 | message = segment_list 14 | else 15 | message = [] 16 | end 17 | 18 | message.push('Job configuration') 19 | message.push(self.job_desc) 20 | message.push('') 21 | message.push(cluster.cluster_desc) 22 | 23 | message.join("\n") 24 | end 25 | 26 | # Displays the Job Name and Elastic MapReduce Job Flow ID if the job was 27 | # launched successfully. 28 | def result_message 29 | "\nJob: #{self.job_name} Job Flow ID: #{self.job_flow_id}" 30 | end 31 | 32 | # Displays the history of the current job. Called by the status command. 33 | def history 34 | launch_time = "Launched: #{self.created_at.strftime('%Y-%m-%d %H:%M:%S')}" 35 | "#{self.job_name} #{launch_time} #{self.job_desc}" 36 | end 37 | 38 | protected 39 | # Calls the Elastic MapReduce API to create a Job Flow. Returns the Job Flow ID. 40 | def run_job_flow(emr_config) 41 | cluster = Cluster.new 42 | job_flow = cluster.create_job_flow(self, emr_config) 43 | 44 | job_steps.each do |step| 45 | job_flow.add_step(step.job_flow_step(job_config)) 46 | end 47 | 48 | begin 49 | job_flow.run 50 | 51 | rescue StandardError => e 52 | raise ElasticMapReduceAccessError, e.message 53 | end 54 | end 55 | 56 | # Returns an S3 location for storing either data or logs. 57 | def build_s3_uri(s3_path) 58 | URI::Generic.build(:scheme => 's3', 59 | :host => bucket_name, 60 | :path => s3_path).to_s 61 | end 62 | 63 | # Returns the S3 bucket name configured by the user using the init command. 64 | def bucket_name 65 | config = Config.new 66 | config.load_config('jobs')['s3_bucket_name'] 67 | end 68 | 69 | # Sets the job name which is the current Unix timestamp in milliseconds. 
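# e.g. (Time.now.to_f * 1000).to_i.to_s #=> "1389789645620"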
70 | # This is the same naming format used for Common Crawl segment names. 71 | def set_job_name 72 | (Time.now.to_f * 1000).to_i.to_s 73 | end 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /lib/elasticrawl/combine_job.rb: -------------------------------------------------------------------------------- 1 | module Elasticrawl 2 | # Represents an Elastic MapReduce job flow that combines the results of 3 | # multiple Elasticrawl Parse jobs. Parse jobs write their results per 4 | # segment. Combine jobs aggregate parse results into a single set of files. 5 | # 6 | # Inherits from Job which is the ActiveRecord model class. 7 | class CombineJob < Job 8 | # Takes in an array of parse jobs that are to be combined. Creates a single 9 | # job step whose input paths are the outputs of the parse jobs. 10 | def set_input_jobs(input_jobs) 11 | segment_count = 0 12 | input_paths = [] 13 | 14 | input_jobs.each do |job_name| 15 | input_job = Job.where(:job_name => job_name, 16 | :type => 'Elasticrawl::ParseJob').first_or_initialize 17 | step_count = input_job.job_steps.count 18 | 19 | if step_count > 0 20 | segment_count += step_count 21 | input_paths << set_input_path(input_job) 22 | end 23 | end 24 | 25 | self.job_name = set_job_name 26 | self.job_desc = set_job_desc(segment_count) 27 | job_steps.push(create_job_step(input_paths.join(','))) 28 | end 29 | 30 | # Runs the job by calling the Elastic MapReduce API. 31 | def run 32 | emr_config = job_config['emr_config'] 33 | job_flow_id = run_job_flow(emr_config) 34 | 35 | if job_flow_id.present? 36 | self.job_flow_id = job_flow_id 37 | self.save 38 | self.result_message 39 | end 40 | end 41 | 42 | # Returns the S3 location for storing Elastic MapReduce job logs. 43 | def log_uri 44 | s3_path = "/logs/2-combine/#{self.job_name}/" 45 | build_s3_uri(s3_path) 46 | end 47 | 48 | private 49 | # Returns a single job step. The input paths are a CSV list of parse 50 | # job outputs. 51 | def create_job_step(input_paths) 52 | JobStep.create(:job => self, 53 | :input_paths => input_paths, 54 | :output_path => set_output_path) 55 | end 56 | 57 | # Returns the S3 location for reading a parse job. A wildcard is 58 | # used for the segment names. The input filter depends on the output 59 | # file type of the parse job and what type of compression is used. 60 | def set_input_path(input_job) 61 | job_name = input_job.job_name 62 | input_filter = job_config['input_filter'] 63 | 64 | s3_path = "/data/1-parse/#{job_name}/segments/*/#{input_filter}" 65 | build_s3_uri(s3_path) 66 | end 67 | 68 | # Returns the S3 location for storing the combine job results. 69 | def set_output_path 70 | s3_path = "/data/2-combine/#{self.job_name}/" 71 | build_s3_uri(s3_path) 72 | end 73 | 74 | # Sets the job description which forms part of the Elastic MapReduce 75 | # job flow name. 76 | def set_job_desc(segment_count) 77 | "Combining: #{segment_count} segments" 78 | end 79 | 80 | # Returns the combine job configuration from ~/.elasticrawl/jobs.yml. 81 | def job_config 82 | config = Config.new 83 | config.load_config('jobs')['steps']['combine'] 84 | end 85 | end 86 | end 87 | -------------------------------------------------------------------------------- /lib/elasticrawl/parse_job.rb: -------------------------------------------------------------------------------- 1 | module Elasticrawl 2 | # Represents an Elastic MapReduce job flow that parses segments of 3 | # Common Crawl data. A job step is created per segment. 
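# A crawl with 256 segments therefore fills an entire job flow, since 256
# steps is the Elastic MapReduce maximum (see Elasticrawl::MAX_SEGMENTS).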
4 | # 5 | # Inherits from Job which is the ActiveRecord model class. 6 | class ParseJob < Job 7 | # Populates the job from the list of segments to be parsed. 8 | def set_segments(crawl_segments, max_files = nil) 9 | self.job_name = set_job_name 10 | self.job_desc = set_job_desc(crawl_segments, max_files) 11 | self.max_files = max_files 12 | 13 | crawl_segments.each do |segment| 14 | self.job_steps.push(create_job_step(segment)) 15 | end 16 | end 17 | 18 | # Runs the job by calling the Elastic MapReduce API. If successful the 19 | # parse time is set for each segment. 20 | def run 21 | emr_config = job_config['emr_config'] 22 | job_flow_id = run_job_flow(emr_config) 23 | 24 | if job_flow_id.present? 25 | self.job_flow_id = job_flow_id 26 | 27 | self.job_steps.each do |step| 28 | segment = step.crawl_segment 29 | segment.parse_time = DateTime.now 30 | segment.save 31 | end 32 | 33 | self.save 34 | self.result_message 35 | end 36 | end 37 | 38 | # Returns the S3 location for storing Elastic MapReduce job logs. 39 | def log_uri 40 | s3_path = "/logs/1-parse/#{self.job_name}/" 41 | build_s3_uri(s3_path) 42 | end 43 | 44 | # Returns the list of segment descriptions. 45 | def segment_list 46 | segments = ['Segments'] 47 | 48 | job_steps.each do |job_step| 49 | if job_step.crawl_segment.present? 50 | segment = job_step.crawl_segment 51 | segments.push(segment.segment_desc) 52 | end 53 | end 54 | 55 | segments.push('') 56 | end 57 | 58 | private 59 | # Creates a job step for the crawl segment. 60 | def create_job_step(segment) 61 | JobStep.create(:job => self, 62 | :crawl_segment => segment, 63 | :input_paths => segment_input(segment), 64 | :output_path => segment_output(segment)) 65 | end 66 | 67 | # Returns the S3 location for reading a crawl segment. The input filter 68 | # determines which type of Common Crawl data files are parsed. 69 | def segment_input(segment) 70 | segment.segment_s3_uri + job_config['input_filter'] 71 | end 72 | 73 | # Returns the S3 location for storing the step results. This includes 74 | # the segment name. 75 | def segment_output(segment) 76 | job_path = "/data/1-parse/#{self.job_name}" 77 | s3_path = "#{job_path}/segments/#{segment.segment_name}/" 78 | build_s3_uri(s3_path) 79 | end 80 | 81 | # Sets the job description which forms part of the Elastic MapReduce 82 | # job flow name. 83 | def set_job_desc(segments, max_files) 84 | if segments.count > 0 85 | crawl_name = segments[0].crawl.crawl_name if segments[0].crawl.present? 86 | file_desc = max_files.nil? ? 'all files' : "#{max_files} files per segment" 87 | end 88 | 89 | "Crawl: #{crawl_name} Segments: #{segments.count} Parsing: #{file_desc}" 90 | end 91 | 92 | # Returns the parse job configuration from ~/.elasticrawl/jobs.yml. 
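# e.g. job_config['input_filter'] #=> 'wet/*.warc.wet.gz' with the template configuration.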
93 | def job_config 94 | config = Config.new 95 | config.load_config('jobs')['steps']['parse'] 96 | end 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /spec/unit/combine_job_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Elasticrawl::CombineJob do 4 | describe '#set_input_jobs' do 5 | let(:job_name) { (Time.now.to_f * 1000).to_i.to_s } 6 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 7 | let(:segment_list_1) { crawl.crawl_segments[0..1] } 8 | let(:segment_list_2) { [crawl.crawl_segments[2]]} 9 | 10 | let(:parse_job_1) { Elasticrawl::ParseJob.new } 11 | let(:parse_job_2) { Elasticrawl::ParseJob.new } 12 | let(:combine_job) { Elasticrawl::CombineJob.new } 13 | 14 | before do 15 | crawl.create_segments 16 | parse_job_1.set_segments(segment_list_1) 17 | parse_job_2.set_segments(segment_list_2) 18 | 19 | input_jobs = [parse_job_1.job_name, parse_job_2.job_name] 20 | combine_job.set_input_jobs(input_jobs) 21 | end 22 | 23 | it 'should have a job name based on current time' do 24 | expect(combine_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8) 25 | end 26 | 27 | it 'should have a job desc' do 28 | expect(combine_job.job_desc.end_with?('Combining: 3 segments')).to eq true 29 | end 30 | 31 | it 'should create 1 job step' do 32 | expect(combine_job.job_steps.count).to eq 1 33 | end 34 | 35 | it 'should set 1 input path per parse job' do 36 | input_paths = combine_job.job_steps[0].input_paths 37 | expect(input_paths.split(',').count).to eq 2 38 | end 39 | 40 | it 'should set input path including parse job name' do 41 | input_paths = combine_job.job_steps[0].input_paths 42 | expect(input_paths.include?(parse_job_1.job_name)).to eq true 43 | end 44 | 45 | it 'should set input path without segment names' do 46 | input_paths = combine_job.job_steps[0].input_paths 47 | segment_name = segment_list_1[0].segment_name 48 | expect(input_paths.include?(segment_name)).to eq false 49 | end 50 | 51 | it 'should set output path including job name' do 52 | output_path = combine_job.job_steps[0].output_path 53 | expect(output_path.include?(combine_job.job_name)).to eq true 54 | end 55 | end 56 | 57 | describe '#run' do 58 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 59 | let(:parse_job_1) { Elasticrawl::ParseJob.new } 60 | let(:parse_job_2) { Elasticrawl::ParseJob.new } 61 | let(:combine_job) { Elasticrawl::CombineJob.new } 62 | let(:job_flow_id) { 'j-3QHDKKBT6VAIS' } 63 | 64 | before do 65 | crawl.create_segments 66 | parse_job_1.set_segments(crawl.crawl_segments[0..1]) 67 | parse_job_2.set_segments([crawl.crawl_segments[2]]) 68 | 69 | input_jobs = [parse_job_1.job_name, parse_job_2.job_name] 70 | combine_job.set_input_jobs(input_jobs) 71 | end 72 | 73 | it 'should set a job flow id' do 74 | allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id) 75 | combine_job.run 76 | 77 | expect(combine_job.job_flow_id).to eq job_flow_id 78 | end 79 | end 80 | 81 | describe '#log_uri' do 82 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 83 | let(:parse_job) { Elasticrawl::ParseJob.new } 84 | let(:job) { Elasticrawl::CombineJob.new } 85 | 86 | before do 87 | crawl.create_segments 88 | parse_job.set_segments(crawl.crawl_segments) 89 | 90 | job.set_input_jobs([parse_job.job_name]) 91 | end 92 | 93 | it 'should set a log uri including the job name' do 94 | 
expect(job.log_uri).to eq "s3://elasticrawl/logs/2-combine/#{job.job_name}/" 95 | end 96 | end 97 | end 98 | -------------------------------------------------------------------------------- /lib/elasticrawl/crawl.rb: -------------------------------------------------------------------------------- 1 | module Elasticrawl 2 | # Represents a web crawl released by the Common Crawl Foundation. 3 | # Each crawl is split into multiple crawl segments and is stored 4 | # in the S3 public datasets bucket. 5 | class Crawl < ActiveRecord::Base 6 | has_many :crawl_segments 7 | 8 | # Returns the status of all saved crawls and the current job history. 9 | def self.status(show_all = false) 10 | status = ['Crawl Status'] 11 | Crawl.all.map { |crawl| status << crawl.status } 12 | 13 | if show_all == true 14 | header = 'Job History' 15 | jobs = Job.where('job_flow_id is not null').order(:id => :desc) 16 | else 17 | header = 'Job History (last 10)' 18 | jobs = Job.where('job_flow_id is not null').order(:id => :desc).limit(10) 19 | end 20 | 21 | status << ['', header] 22 | jobs.map { |job| status << job.history } 23 | 24 | status.join("\n") 25 | end 26 | 27 | # Returns the status of the current crawl. 28 | def status 29 | total = self.crawl_segments.count 30 | remaining = CrawlSegment.where(:crawl_id => self.id, 31 | :parse_time => nil).count 32 | parsed = total - remaining 33 | status = self.crawl_name 34 | status += " Segments: to parse #{remaining}, " 35 | status += "parsed #{parsed}, total #{total}" 36 | end 37 | 38 | # Checks for crawl segments in the database. If none are found then checks 39 | # the S3 API and creates any segments that are found. 40 | def has_segments? 41 | if self.crawl_segments.count == 0 42 | segment_count = create_segments 43 | result = segment_count > 0 44 | else 45 | result = true 46 | end 47 | end 48 | 49 | # Creates crawl segments from the warc.paths file for this crawl. 50 | def create_segments 51 | file_paths = warc_paths(self.crawl_name) 52 | 53 | segments = parse_segments(file_paths) 54 | save if segments.count > 0 55 | 56 | segments.keys.each do |segment_name| 57 | file_count = segments[segment_name] 58 | CrawlSegment.create_segment(self, segment_name, file_count) 59 | end 60 | 61 | segments.count 62 | end 63 | 64 | # Returns the list of segments from the database. 65 | def select_segments(segments_list) 66 | CrawlSegment.where(:segment_name => segments_list) 67 | end 68 | 69 | # Returns next # segments to be parsed. The maximum is 256 70 | # as this is the maximum # of steps for an Elastic MapReduce job flow. 71 | def next_segments(max_segments = nil) 72 | max_segments = Elasticrawl::MAX_SEGMENTS if max_segments.nil? 73 | max_segments = Elasticrawl::MAX_SEGMENTS if max_segments > Elasticrawl::MAX_SEGMENTS 74 | 75 | self.crawl_segments.where(:parse_time => nil).limit(max_segments) 76 | end 77 | 78 | # Resets parse time of all parsed segments to null so they will be parsed 79 | # again. Returns the updated crawl status. 80 | def reset 81 | segments = CrawlSegment.where('crawl_id = ? and parse_time is not null', 82 | self.id) 83 | segments.map { |segment| segment.update_attribute(:parse_time, nil) } 84 | 85 | status 86 | end 87 | 88 | private 89 | # Gets the WARC file paths from S3 for this crawl if it exists. 
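# e.g. for CC-MAIN-2014-49 the key is crawl-data/CC-MAIN-2014-49/warc.paths.gz.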
90 | def warc_paths(crawl_name) 91 | s3_path = [Elasticrawl::COMMON_CRAWL_PATH, 92 | crawl_name, 93 | Elasticrawl::WARC_PATHS].join('/') 94 | begin 95 | s3 = AWS::S3.new 96 | bucket = s3.buckets[Elasticrawl::COMMON_CRAWL_BUCKET] 97 | object = bucket.objects[s3_path] 98 | 99 | uncompress_file(object) 100 | 101 | rescue AWS::Errors::Base => s3e 102 | raise S3AccessError.new(s3e.http_response), 'Failed to get WARC paths' 103 | rescue Exception 104 | raise S3AccessError, 'Failed to get WARC paths' 105 | end 106 | end 107 | 108 | # Takes in an S3 object and returns the contents as an uncompressed string. 109 | def uncompress_file(s3_object) 110 | result = '' 111 | 112 | if s3_object.exists? 113 | io = StringIO.new 114 | io.write(s3_object.read) 115 | io.rewind 116 | 117 | gz = Zlib::GzipReader.new(io) 118 | result = gz.read 119 | 120 | gz.close 121 | end 122 | 123 | result 124 | end 125 | 126 | # Parses the segment names and file counts from the WARC file paths. 127 | def parse_segments(warc_paths) 128 | segments = Hash.new 0 129 | 130 | warc_paths.split.each do |warc_path| 131 | segment_name = warc_path.split('/')[3] 132 | segments[segment_name] += 1 if segment_name.present? 133 | end 134 | 135 | segments 136 | end 137 | end 138 | end 139 | -------------------------------------------------------------------------------- /spec/unit/crawl_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Elasticrawl::Crawl, type: :model do 4 | it { should have_many(:crawl_segments) } 5 | it { should have_db_column(:crawl_name).of_type(:string) } 6 | 7 | describe '#has_segments?' do 8 | let(:crawl_name) { 'CC-MAIN-2014-49' } 9 | subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) } 10 | 11 | it 'should have segments' do 12 | expect(subject.has_segments?).to eq true 13 | end 14 | end 15 | 16 | describe '#create_segments' do 17 | let(:crawl_name) { 'CC-MAIN-2014-49' } 18 | subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) } 19 | 20 | before do 21 | subject.create_segments 22 | end 23 | 24 | it 'should set crawl name' do 25 | expect(subject.crawl_name).to eq crawl_name 26 | end 27 | 28 | it 'should create correct # of segments' do 29 | expect(subject.crawl_segments.count).to eq 3 30 | end 31 | 32 | it 'should create segment names' do 33 | expect(subject.crawl_segments[0].segment_name).to eq '1416400372202.67' 34 | end 35 | 36 | it 'should create segment s3 uris' do 37 | expect(subject.crawl_segments[0].segment_s3_uri).to eq \ 38 | 's3://commoncrawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/' 39 | end 40 | 41 | it 'should set file counts' do 42 | expect(subject.crawl_segments[0].file_count).to eq 3 43 | end 44 | end 45 | 46 | describe '#next_segments' do 47 | let(:crawl_name) { 'CC-MAIN-2014-49' } 48 | subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) } 49 | 50 | before do 51 | subject.create_segments 52 | end 53 | 54 | it 'should return all segments' do 55 | crawl_segments = subject.next_segments 56 | 57 | expect(crawl_segments.count).to eq 3 58 | expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name 59 | expect(crawl_segments[0].segment_name).to eq '1416400372202.67' 60 | end 61 | 62 | it 'should return first # segments' do 63 | crawl_segments = subject.next_segments(2) 64 | 65 | expect(crawl_segments.count).to eq 2 66 | expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name 67 | expect(crawl_segments[0].segment_name).to eq '1416400372202.67' 68 | end 69 | end 70 | 71 | 
describe '#select_segments' do 72 | let(:crawl_name) { 'CC-MAIN-2014-49' } 73 | subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) } 74 | 75 | before do 76 | subject.create_segments 77 | end 78 | 79 | it 'should select no segments' do 80 | segments_list = ['test', 'segment'] 81 | crawl_segments = subject.select_segments(segments_list) 82 | 83 | expect(crawl_segments.count).to eq 0 84 | end 85 | 86 | it 'should select only segments in list' do 87 | segments_list = ['1416400372202.67', '1416400372490.23'] 88 | crawl_segments = subject.select_segments(segments_list) 89 | 90 | expect(crawl_segments.count).to eq 2 91 | end 92 | end 93 | 94 | describe '#reset' do 95 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 96 | let(:job) { Elasticrawl::ParseJob.new } 97 | let(:job_flow_id) { 'j-3QHDKKBT6VAIS' } 98 | 99 | before do 100 | crawl.create_segments 101 | job.set_segments(crawl.crawl_segments[0..1]) 102 | 103 | allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id) 104 | job.run 105 | 106 | crawl.reset 107 | end 108 | 109 | it 'should set parse time of all segments to null' do 110 | unparsed_segments = Elasticrawl::CrawlSegment.where(:parse_time => nil).count 111 | expect(crawl.crawl_segments.count).to eq unparsed_segments 112 | end 113 | end 114 | 115 | describe '.status' do 116 | let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' } 117 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 118 | let(:max_files) { 3 } 119 | let(:job) { Elasticrawl::ParseJob.new } 120 | let(:job_flow_id) { 'j-3QHDKKBT6VAIS' } 121 | 122 | before do 123 | crawl.create_segments 124 | job.set_segments(crawl.crawl_segments[0..1], max_files) 125 | 126 | allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id) 127 | job.run 128 | end 129 | 130 | it 'should display status of crawl segments' do 131 | expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \ 132 | 'CC-MAIN-2014-49 Segments: to parse 1, parsed 2, total 3' 133 | end 134 | 135 | it 'should display parse job desc' do 136 | crawl_status = Elasticrawl::Crawl.status.split("\n")[4] 137 | 138 | expect(crawl_status.include?(job.job_name)).to eq true 139 | expect(crawl_status.include?(job.job_desc)).to eq true 140 | end 141 | end 142 | end 143 | -------------------------------------------------------------------------------- /spec/unit/parse_job_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Elasticrawl::ParseJob do 4 | describe '#set_segments' do 5 | let(:job_name) { (Time.now.to_f * 1000).to_i.to_s } 6 | let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' } 7 | let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } 8 | let(:max_files) { 3 } 9 | let(:parse_job) { Elasticrawl::ParseJob.new } 10 | 11 | before do 12 | crawl.create_segments 13 | parse_job.set_segments(crawl.crawl_segments[0..1], max_files) 14 | end 15 | 16 | it 'should have a job name based on current time' do 17 | expect(parse_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8) 18 | end 19 | 20 | it 'should have a job desc' do 21 | expect(parse_job.job_desc).to eq job_desc 22 | end 23 | 24 | it 'should create 2 job steps' do 25 | expect(parse_job.job_steps.count).to eq 2 26 | end 27 | 28 | it 'should set steps input path to segment uri' do 29 | input_path = parse_job.job_steps[0].input_paths 30 | segment_uri = 
31 | 
32 |       expect(input_path.starts_with?(segment_uri)).to eq true
33 |     end
34 | 
35 |     it 'should set output path' do
36 |       output_path = parse_job.job_steps[0].output_path
37 |       segment_name = crawl.crawl_segments[0].segment_name
38 | 
39 |       expect(output_path.include?(parse_job.job_name)).to eq true
40 |       expect(output_path.include?(segment_name)).to eq true
41 |     end
42 |   end
43 | 
44 |   describe '#confirm_message' do
45 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
46 |     let(:job) { Elasticrawl::ParseJob.new }
47 |     let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: 3 files per segment' }
48 |     let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
49 | 
50 |     let(:cluster_desc) {
51 |       cluster_desc = <<-CLUSTER_DESC
52 | Cluster configuration
53 | Master: 1 m1.medium (Spot: 0.12)
54 | Core: 2 m1.medium (Spot: 0.12)
55 | Task: --
56 |       CLUSTER_DESC
57 |     }
58 | 
59 |     before do
60 |       crawl.create_segments
61 |       job.set_segments(crawl.crawl_segments[0..2], 3)
62 |     end
63 | 
64 |     it 'should display message including job desc' do
65 |       expect(job.confirm_message.include?(job_desc)).to eq true
66 |     end
67 | 
68 |     it 'should display message including segment desc' do
69 |       expect(job.confirm_message.include?(segment_desc)).to eq true
70 |     end
71 | 
72 |     it 'should display message including cluster desc' do
73 |       expect(job.confirm_message.include?(cluster_desc)).to eq true
74 |     end
75 |   end
76 | 
77 |   describe '#run' do
78 |     let(:crawl_name) { 'CC-MAIN-2014-49' }
79 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
80 |     let(:job) { Elasticrawl::ParseJob.new }
81 |     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
82 | 
83 |     before do
84 |       crawl.create_segments
85 |       job.set_segments(crawl.crawl_segments[0..1], 5)
86 | 
87 |       allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
88 |       job.run
89 |     end
90 | 
91 |     it 'should set a job flow id' do
92 |       expect(job.job_flow_id).to eq job_flow_id
93 |     end
94 | 
95 |     it 'should set parse time for parsed segments' do
96 |       expect(crawl.crawl_segments[0].parse_time.present?).to eq true
97 |       expect(crawl.crawl_segments[1].parse_time.present?).to eq true
98 |       expect(crawl.crawl_segments[2].parse_time.present?).to eq false
99 |     end
100 |   end
101 | 
102 |   describe '#log_uri' do
103 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
104 |     let(:job) { Elasticrawl::ParseJob.new }
105 | 
106 |     before do
107 |       crawl.create_segments
108 |       job.set_segments(crawl.crawl_segments)
109 |     end
110 | 
111 |     it 'should set a log uri including the job name' do
112 |       expect(job.log_uri).to eq "s3://elasticrawl/logs/1-parse/#{job.job_name}/"
113 |     end
114 |   end
115 | 
116 |   describe '#history' do
117 |     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
118 |     let(:job) { Elasticrawl::ParseJob.new }
119 |     let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: all files' }
120 |     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
121 | 
122 |     before do
123 |       crawl.create_segments
124 |       job.set_segments(crawl.crawl_segments)
125 | 
126 |       allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
127 |       job.run
128 |     end
129 | 
130 |     it 'should return the job name, history and launch time' do
131 |       expect(job.history.include?(job.job_name)).to eq true
132 |       expect(job.history.include?(job.job_desc)).to eq true
133 |       expect(job.history.include?(job.created_at.strftime('%Y-%m-%d %H:%M:%S'))).to eq true
134 |     end
135 |   end
136 | end
137 | 
--------------------------------------------------------------------------------
/bin/elasticrawl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | require 'elasticrawl'
3 | 
4 | module Elasticrawl
5 |   class Cli < Thor
6 |     desc 'init S3_BUCKET_NAME', 'Creates S3 bucket and config directory'
7 |     method_option :access_key_id, :type => :string, :desc => 'AWS Access Key ID'
8 |     method_option :secret_access_key, :type => :string, :desc => 'AWS Secret Access Key'
9 |     def init(s3_bucket_name)
10 |       key = options[:access_key_id]
11 |       secret = options[:secret_access_key]
12 | 
13 |       if key.nil? || secret.nil?
14 |         config = Config.new
15 | 
16 |         # Prompt for credentials showing the current values.
17 |         key = ask(config.access_key_prompt)
18 |         secret = ask(config.secret_key_prompt)
19 | 
20 |         # Use current values if user has selected them.
21 |         key = config.access_key_id if key.blank?
22 |         secret = config.secret_access_key if secret.blank?
23 |       end
24 | 
25 |       # Create new config object with updated credentials.
26 |       config = Config.new(key, secret)
27 | 
28 |       if config.bucket_exists?(s3_bucket_name)
29 |         puts('ERROR: S3 bucket already exists')
30 |       else
31 |         if config.dir_exists?
32 |           puts("WARNING: Config dir #{config.config_dir} already exists")
33 |           overwrite = agree('Overwrite? (y/n)', true)
34 |         end
35 | 
36 |         puts(config.create(s3_bucket_name)) if !config.dir_exists? || overwrite == true
37 |       end
38 |     end
39 | 
40 |     desc 'parse CRAWL_NAME', 'Launches parse job against Common Crawl corpus'
41 |     method_option :max_segments, :type => :numeric, :desc => 'number of crawl segments to parse'
42 |     method_option :max_files, :type => :numeric, :desc => 'number of files to parse per segment'
43 |     method_option :segment_list, :type => :array, :desc => 'list of segment names to parse'
44 |     def parse(crawl_name)
45 |       load_database
46 | 
47 |       crawl = find_crawl(crawl_name)
48 |       if crawl.has_segments?
49 |         segment_list = options[:segment_list]
50 | 
51 |         if segment_list.present?
52 |           segments = crawl.select_segments(segment_list)
53 |         else
54 |           segments = crawl.next_segments(options[:max_segments])
55 |         end
56 | 
57 |         if segments.count == 0
58 |           puts('ERROR: No segments matched for parsing')
59 |         else
60 |           job = ParseJob.new
61 |           job.set_segments(segments, options[:max_files])
62 |           puts(job.confirm_message)
63 | 
64 |           launch = agree('Launch job? (y/n)', true)
65 |           puts(job.run) if launch == true
66 |         end
67 |       else
68 |         puts('ERROR: Crawl does not exist')
69 |       end
70 |     end
71 | 
72 |     desc 'combine', 'Launches combine job against parse job results'
73 |     method_option :input_jobs, :type => :array, :required => true,
74 |                   :desc => 'list of input jobs to combine'
75 |     def combine
76 |       load_database
77 | 
78 |       job = CombineJob.new
79 |       job.set_input_jobs(options[:input_jobs])
80 |       puts(job.confirm_message)
81 | 
82 |       launch = agree('Launch job? (y/n)', true)
83 |       puts(job.run) if launch == true
84 |     end
85 | 
86 |     desc 'status', 'Shows crawl status and lists jobs'
87 |     method_option :show_all, :type => :boolean, :desc => 'list all jobs'
88 |     def status
89 |       load_database
90 |       puts(Crawl.status(options[:show_all]))
91 |     end
92 | 
93 |     desc 'reset CRAWL_NAME', 'Resets a crawl so its segments are parsed again'
94 |     def reset(crawl_name)
95 |       load_database
96 | 
97 |       crawl = find_crawl(crawl_name)
98 |       if crawl.has_segments?
99 |         reset = agree('Reset crawl? (y/n)', true)
100 |         puts(crawl.reset) if reset == true
101 |       else
102 |         puts('ERROR: Crawl does not exist')
103 |       end
104 |     end
105 | 
106 |     desc 'destroy', 'Deletes S3 bucket and config directory'
107 |     def destroy
108 |       config = Config.new
109 | 
110 |       if config.dir_exists?
111 |         puts(config.delete_warning)
112 |         delete = agree('Delete? (y/n)', true)
113 |         puts(config.delete) if delete == true
114 |       else
115 |         puts('No config dir. Nothing to do')
116 |       end
117 |     end
118 | 
119 |     private
120 |     # Find a crawl record in the database.
121 |     def find_crawl(crawl_name)
122 |       Crawl.where(:crawl_name => crawl_name).first_or_initialize
123 |     end
124 | 
125 |     # Load the SQLite database.
126 |     def load_database
127 |       config = Config.new
128 |       config.load_database
129 |     end
130 |   end
131 | end
132 | 
133 | begin
134 |   Elasticrawl::Cli.start(ARGV)
135 | # Show errors parsing command line arguments.
136 | rescue Thor::Error => e
137 |   puts(e.message)
138 | # Show elasticrawl errors.
139 | rescue Elasticrawl::Error => e
140 |   puts("ERROR: #{e.message}")
141 |   puts e.backtrace
142 | 
143 |   if e.http_response.present?
144 |     response = e.http_response
145 | 
146 |     puts "HTTP Response: #{response.status}"
147 |     puts response.body if response.body.present?
148 |   end
149 | end
150 | 
--------------------------------------------------------------------------------
/lib/elasticrawl/cluster.rb:
--------------------------------------------------------------------------------
1 | module Elasticrawl
2 |   # Configures the cluster settings for the job flow that will be launched.
3 |   # These settings are loaded from ~/.elasticrawl/cluster.yml.
4 |   class Cluster
5 |     def initialize
6 |       @master_group = instance_group('master')
7 |       @core_group = instance_group('core')
8 |       @task_group = instance_group('task') if has_task_group?
9 |     end
10 | 
11 |     # Returns a configured job flow to the calling job.
12 |     def create_job_flow(job, emr_config = nil)
13 |       config = Config.new
14 | 
15 |       Elasticity.configure do |c|
16 |         c.access_key = config.access_key_id
17 |         c.secret_key = config.secret_access_key
18 |       end
19 | 
20 |       job_flow = Elasticity::JobFlow.new
21 |       job_flow.name = "Job: #{job.job_name} #{job.job_desc}"
22 |       job_flow.log_uri = job.log_uri
23 | 
24 |       configure_job_flow(job_flow)
25 |       configure_instances(job_flow)
26 |       configure_bootstrap_actions(job_flow, emr_config)
27 | 
28 |       job_flow
29 |     end
30 | 
31 |     # Describes the instances that will be launched. This is used by the
32 |     # job confirmation messages.
33 |     def cluster_desc
34 |       cluster_desc = <<-HERE
35 | Cluster configuration
36 | Master: #{instance_group_desc(@master_group)}
37 | Core:   #{instance_group_desc(@core_group)}
38 | Task:   #{instance_group_desc(@task_group)}
39 |       HERE
40 |     end
41 | 
42 |     private
43 |     # Set job flow properties from settings in cluster.yml.
44 |     def configure_job_flow(job_flow)
45 |       ec2_key_name = config_setting('ec2_key_name')
46 |       placement = config_setting('placement')
47 |       emr_ami_version = config_setting('emr_ami_version')
48 |       job_flow_role = config_setting('job_flow_role')
49 |       service_role = config_setting('service_role')
50 |       ec2_subnet_id = config_setting('ec2_subnet_id')
51 | 
52 |       job_flow.ec2_subnet_id = ec2_subnet_id if ec2_subnet_id.present?
53 |       job_flow.ec2_key_name = ec2_key_name if ec2_key_name.present?
54 |       job_flow.placement = placement if placement.present?
55 |       job_flow.ami_version = emr_ami_version if emr_ami_version.present?
56 |       job_flow.job_flow_role = job_flow_role if job_flow_role.present?
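      # Note: the job flow role is the IAM instance profile assumed by the
      # cluster's EC2 nodes, while the service role set on the next line is
      # assumed by the EMR service itself. Both are created by
      # `aws emr create-default-roles` (see the README's Troubleshooting section).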
57 |       job_flow.service_role = service_role if service_role.present?
58 |     end
59 | 
60 |     # Configures the instances that will be launched. The master group has
61 |     # a single node. The task group is optional.
62 |     def configure_instances(job_flow)
63 |       job_flow.set_master_instance_group(@master_group)
64 |       job_flow.set_core_instance_group(@core_group)
65 |       job_flow.set_task_instance_group(@task_group) if @task_group.present?
66 |     end
67 | 
68 |     # Configures bootstrap actions that will be run when each instance is
69 |     # launched. EMR config is an XML file of Hadoop settings stored on S3.
70 |     # These are applied to each node by a bootstrap action.
71 |     def configure_bootstrap_actions(job_flow, emr_config = nil)
72 |       bootstrap_scripts = config_setting('bootstrap_scripts')
73 | 
74 |       if bootstrap_scripts.present?
75 |         bootstrap_scripts.each do |script_uri|
76 |           action = Elasticity::BootstrapAction.new(script_uri, '', '')
77 |           job_flow.add_bootstrap_action(action)
78 |         end
79 |       end
80 | 
81 |       if emr_config.present?
82 |         action = Elasticity::HadoopFileBootstrapAction.new(emr_config)
83 |         job_flow.add_bootstrap_action(action)
84 |       end
85 |     end
86 | 
87 |     # Returns whether cluster.yml specifies a task group.
88 |     def has_task_group?
89 |       task_config = config_for_group('task')
90 |       task_config.has_key?('instance_count') && task_config['instance_count'] > 0
91 |     end
92 | 
93 |     # Describes an instance group.
94 |     def instance_group_desc(group)
95 |       if group.present?
96 |         if group.market == 'SPOT'
97 |           price = "(Spot: #{group.bid_price})"
98 |         else
99 |           price = '(On Demand)'
100 |         end
101 | 
102 |         "#{group.count} #{group.type} #{price}"
103 |       else
104 |         '--'
105 |       end
106 |     end
107 | 
108 |     # Configures an instance group with the instance type, # of instances and
109 |     # the bid price if spot instances are to be used.
110 |     def instance_group(group_name)
111 |       config = config_for_group(group_name)
112 | 
113 |       instance_group = Elasticity::InstanceGroup.new
114 |       instance_group.role = group_name.upcase
115 |       instance_group.type = config['instance_type']
116 | 
117 |       if config.has_key?('instance_count') && config['instance_count'] > 0
118 |         instance_group.count = config['instance_count']
119 |       end
120 | 
121 |       if config['use_spot_instances'] == true
122 |         instance_group.set_spot_instances(config['bid_price'])
123 |       end
124 | 
125 |       instance_group
126 |     end
127 | 
128 |     # Returns the config settings for an instance group.
129 |     def config_for_group(group_name)
130 |       config_setting("#{group_name}_instance_group")
131 |     end
132 | 
133 |     # Returns a config setting from cluster.yml.
134 |     def config_setting(key_name)
135 |       config = Config.new
136 |       config.load_config('cluster')[key_name]
137 |     end
138 |   end
139 | end
140 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Elasticrawl
2 | 
3 | * Command line tool for launching Hadoop jobs using AWS EMR (Elastic MapReduce) to process Common Crawl data.
4 | * Elasticrawl can be used with [crawl data](http://commoncrawl.org/the-data/get-started/) from April 2014 onwards.
5 | * A list of crawls released by Common Crawl is maintained on the [wiki](https://github.com/rossf7/elasticrawl/wiki).
6 | * Common Crawl announces new crawls on their [blog](http://blog.commoncrawl.org/).
7 | 
8 | * Ships with a default configuration that launches the
9 |   [elasticrawl-examples](https://github.com/rossf7/elasticrawl-examples) jobs.
10 |   This is an implementation of the standard Hadoop Word Count example.
11 | 
12 | This [blog post](https://rossfairbanks.com/2015/01/03/parsing-common-crawl-using-elasticrawl.html) has a walkthrough of running the example jobs on the November 2014 crawl.
13 | 
14 | ## Installation
15 | 
16 | * Elasticrawl needs a [Ruby installation](https://www.ruby-lang.org/en/documentation/installation/) (2.1 or higher).
17 | * Install Elasticrawl from RubyGems.
18 | 
19 | ```
20 | gem install elasticrawl --no-rdoc --no-ri
21 | ```
22 | 
23 | ### Troubleshooting
24 | 
25 | If you get the error "EMR service role arn:aws:iam::156793023547:role/EMR_DefaultRole is invalid" when launching a cluster then you don't have the necessary IAM roles.
26 | To fix this install the [AWS CLI](https://aws.amazon.com/cli/) and run the command below.
27 | 
28 | ```
29 | aws emr create-default-roles
30 | ```
31 | 
32 | ## Commands
33 | 
34 | ### elasticrawl init
35 | 
36 | The init command takes in an S3 bucket name and your AWS credentials. The S3 bucket will be created
37 | and will store your data and logs.
38 | 
39 | ```bash
40 | ~$ elasticrawl init your-s3-bucket
41 | 
42 | Enter AWS Access Key ID: ************
43 | Enter AWS Secret Access Key: ************
44 | 
45 | ...
46 | 
47 | Bucket s3://elasticrawl-test created
48 | Config dir /Users/ross/.elasticrawl created
49 | Config complete
50 | ```
51 | 
52 | ### elasticrawl parse
53 | 
54 | The parse command takes in the crawl name and an optional number of segments and files to parse.
55 | 
56 | ```bash
57 | ~$ elasticrawl parse CC-MAIN-2015-48 --max-segments 2 --max-files 3
58 | Segments
59 | Segment: 1416400372202.67 Files: 150
60 | Segment: 1416400372490.23 Files: 124
61 | 
62 | Job configuration
63 | Crawl: CC-MAIN-2015-48 Segments: 2 Parsing: 3 files per segment
64 | 
65 | Cluster configuration
66 | Master: 1 m1.medium (Spot: 0.12)
67 | Core: 2 m1.medium (Spot: 0.12)
68 | Task: --
69 | Launch job? (y/n)
70 | y
71 | 
72 | Job: 1420124830792 Job Flow ID: j-2R3MFE6TWLIUB
73 | ```
74 | 
75 | ### elasticrawl combine
76 | 
77 | The combine command takes in the results of previous parse jobs and produces a combined set of results.
78 | 
79 | ```bash
80 | ~$ elasticrawl combine --input-jobs 1420124830792
81 | Job configuration
82 | Combining: 2 segments
83 | 
84 | Cluster configuration
85 | Master: 1 m1.medium (Spot: 0.12)
86 | Core: 2 m1.medium (Spot: 0.12)
87 | Task: --
88 | Launch job? (y/n)
89 | y
90 | 
91 | Job: 1420129496115 Job Flow ID: j-251GXDIZGK8HL
92 | ```
93 | 
94 | ### elasticrawl status
95 | 
96 | The status command shows crawls and your job history.
97 | 
98 | ```bash
99 | ~$ elasticrawl status
100 | Crawl Status
101 | CC-MAIN-2015-48 Segments: to parse 98, parsed 2, total 100
102 | 
103 | Job History (last 10)
104 | 1420124830792 Launched: 2015-01-01 15:07:10 Crawl: CC-MAIN-2015-48 Segments: 2 Parsing: 3 files per segment
105 | ```
106 | 
107 | ### elasticrawl reset
108 | 
109 | The reset command resets a crawl so it is parsed again.
110 | 
111 | ```bash
112 | ~$ elasticrawl reset CC-MAIN-2015-48
113 | Reset crawl? (y/n)
114 | y
115 | CC-MAIN-2015-48 Segments: to parse 100, parsed 0, total 100
116 | ```
117 | 
118 | ### elasticrawl destroy
119 | 
120 | The destroy command deletes your S3 bucket and the ~/.elasticrawl directory.
121 | 
122 | ```bash
123 | ~$ elasticrawl destroy
124 | 
125 | WARNING:
126 | Bucket s3://elasticrawl-test and its data will be deleted
127 | Config dir /home/vagrant/.elasticrawl will be deleted
128 | Delete? (y/n)
129 | y
130 | 
131 | Bucket s3://elasticrawl-test deleted
132 | Config dir /home/vagrant/.elasticrawl deleted
133 | Config deleted
134 | ```
135 | 
136 | ## Configuring Elasticrawl
137 | 
138 | The elasticrawl init command creates the ~/.elasticrawl/ directory which
139 | contains
140 | 
141 | * [aws.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/aws.yml) -
142 |   stores your AWS access credentials. Or you can set the environment
143 |   variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
144 | 
145 | * [cluster.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/cluster.yml) -
146 |   configures the EC2 instances that are launched to form your EMR cluster
147 | 
148 | * [jobs.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/jobs.yml) -
149 |   stores your S3 bucket name and the config for the parse and combine jobs
150 | 
151 | ## Development
152 | 
153 | Elasticrawl is developed in Ruby and requires Ruby 2.1.0 or later (Ruby 2.3 is recommended). The sqlite3 and nokogiri gems have C extensions, so you may need to install development headers.
154 | 
155 | [![Gem Version](https://badge.fury.io/rb/elasticrawl.png)](http://badge.fury.io/rb/elasticrawl)
156 | [![Code Climate](https://codeclimate.com/github/rossf7/elasticrawl.png)](https://codeclimate.com/github/rossf7/elasticrawl)
157 | [![Build Status](https://travis-ci.org/rossf7/elasticrawl.png?branch=master)](https://travis-ci.org/rossf7/elasticrawl) Tested against Ruby 2.0.0, 2.1.8, 2.2.4 and 2.3.0
158 | 
159 | ## TODO
160 | 
161 | * Add support for Streaming and Pig jobs
162 | 
163 | ## Thanks
164 | 
165 | * Thanks to everyone at Common Crawl for making this awesome dataset available!
166 | * Thanks to Robert Slifka for the [elasticity](https://github.com/rslifka/elasticity)
167 |   gem which provides a nice Ruby wrapper for the EMR REST API.
168 | * Thanks to Phusion for creating Traveling Ruby.
169 | 
170 | ## Contributing
171 | 
172 | 1. Fork it
173 | 2. Create your feature branch (`git checkout -b my-new-feature`)
174 | 3. Commit your changes (`git commit -am 'Add some feature'`)
175 | 4. Push to the branch (`git push origin my-new-feature`)
176 | 5. Create new Pull Request
177 | 
178 | ## License
179 | 
180 | This code is licensed under the MIT license.
181 | 
--------------------------------------------------------------------------------
/lib/elasticrawl/config.rb:
--------------------------------------------------------------------------------
1 | module Elasticrawl
2 |   # Represents the current configuration which is persisted to
3 |   # ~/.elasticrawl/ and contains 3 configuration files.
4 |   #
5 |   # aws.yml - AWS access credentials unless stored in the environment
6 |   #           variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
7 |   # cluster.yml - Elastic MapReduce cluster config including instance groups.
8 |   # jobs.yml - Elastic MapReduce jobs config and the S3 bucket used for
9 |   #            storing data and logs.
10 |   #
11 |   # This directory also contains the Elasticrawl SQLite database.
12 |   class Config
13 |     CONFIG_DIR = '.elasticrawl'
14 |     DATABASE_FILE = 'elasticrawl.sqlite3'
15 |     TEMPLATES_DIR = '../../templates'
16 |     TEMPLATE_FILES = ['aws.yml', 'cluster.yml', 'jobs.yml']
17 | 
18 |     attr_reader :access_key_id
19 |     attr_reader :secret_access_key
20 | 
21 |     # Sets the AWS access credentials needed for the S3 and EMR API calls.
22 |     def initialize(access_key_id = nil, secret_access_key = nil)
23 |       # Credentials have been provided to the init command.
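      # Resolution order (implemented below): credentials passed in as
      # arguments win, then values saved in aws.yml (the unedited
      # ACCESS_KEY_ID / SECRET_ACCESS_KEY placeholders are ignored), and
      # finally the AWS environment variables.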
24 |       @access_key_id = access_key_id
25 |       @secret_access_key = secret_access_key
26 | 
27 |       # If credentials are not set then check if they are available in aws.yml.
28 |       if dir_exists?
29 |         config = load_config('aws')
30 |         key = config['access_key_id']
31 |         secret = config['secret_access_key']
32 | 
33 |         @access_key_id ||= key unless key == 'ACCESS_KEY_ID'
34 |         @secret_access_key ||= secret unless secret == 'SECRET_ACCESS_KEY'
35 |       end
36 | 
37 |       # If credentials are still not set then check AWS environment variables.
38 |       @access_key_id ||= ENV['AWS_ACCESS_KEY_ID']
39 |       @secret_access_key ||= ENV['AWS_SECRET_ACCESS_KEY']
40 | 
41 |       # Set AWS credentials for use when accessing the S3 API.
42 |       AWS.config(:access_key_id => @access_key_id,
43 |                  :secret_access_key => @secret_access_key)
44 |     end
45 | 
46 |     # Returns the location of the config directory.
47 |     def config_dir
48 |       File.join(Dir.home, CONFIG_DIR)
49 |     end
50 | 
51 |     # Checks if the configuration directory exists.
52 |     def dir_exists?
53 |       Dir.exists?(config_dir)
54 |     end
55 | 
56 |     # Loads a YAML configuration file.
57 |     def load_config(config_file)
58 |       if dir_exists?
59 |         begin
60 |           config_file = File.join(config_dir, "#{config_file}.yml")
61 |           config = YAML::load(File.open(config_file))
62 | 
63 |         rescue StandardError => e
64 |           raise FileAccessError, e.message
65 |         end
66 |       else
67 |         raise ConfigDirMissingError, 'Config dir missing. Run init command'
68 |       end
69 |     end
70 | 
71 |     # Loads the SQLite database. If no database exists it will be created
72 |     # and the database migrations will be run.
73 |     def load_database
74 |       if dir_exists?
75 |         config = {
76 |           'adapter' => 'sqlite3',
77 |           'database' => File.join(config_dir, DATABASE_FILE),
78 |           'pool' => 5,
79 |           'timeout' => 5000
80 |         }
81 | 
82 |         begin
83 |           ActiveRecord::Base.establish_connection(config)
84 |           ActiveRecord::Migrator.migrate(File.join(File.dirname(__FILE__), \
85 |             '../../db/migrate'), ENV['VERSION'] ? ENV['VERSION'].to_i : nil )
86 | 
87 |         rescue StandardError => e
88 |           raise DatabaseAccessError, e.message
89 |         end
90 |       else
91 |         raise ConfigDirMissingError, 'Config dir missing. Run init command'
92 |       end
93 |     end
94 | 
95 |     # Checks if an S3 bucket name is in use.
96 |     def bucket_exists?(bucket_name)
97 |       begin
98 |         s3 = AWS::S3.new
99 |         s3.buckets[bucket_name].exists?
100 | 
101 |       rescue AWS::S3::Errors::SignatureDoesNotMatch
102 |         raise AWSCredentialsInvalidError, 'AWS access credentials are invalid'
103 |       rescue AWS::Errors::Base => s3e
104 |         raise S3AccessError.new(s3e.http_response), s3e.message
105 |       end
106 |     end
107 | 
108 |     # Creates the S3 bucket and config directory. Deploys the config templates
109 |     # and creates the SQLite database.
110 |     def create(bucket_name)
111 |       create_bucket(bucket_name)
112 |       deploy_templates(bucket_name)
113 |       load_database
114 | 
115 |       status_message(bucket_name, 'created')
116 |     end
117 | 
118 |     # Deletes the S3 bucket and config directory.
119 |     def delete
120 |       bucket_name = load_config('jobs')['s3_bucket_name']
121 |       delete_bucket(bucket_name)
122 |       delete_config_dir
123 | 
124 |       status_message(bucket_name, 'deleted')
125 |     end
126 | 
127 |     # Displayed by destroy command to confirm deletion.
128 |     def delete_warning
129 |       bucket_name = load_config('jobs')['s3_bucket_name']
130 | 
131 |       message = ['WARNING:']
132 |       message << "Bucket s3://#{bucket_name} and its data will be deleted"
133 |       message << "Config dir #{config_dir} will be deleted"
134 | 
135 |       message.join("\n")
136 |     end
137 | 
138 |     # Displayed by init command.
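    # If a key is already saved it is shown in brackets, e.g.
    # "Enter AWS Access Key ID: [current key]", and pressing Enter keeps it
    # (see the init command in bin/elasticrawl).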
139 |     def access_key_prompt
140 |       prompt = "Enter AWS Access Key ID:"
141 |       prompt += " [#{@access_key_id}]" if @access_key_id.present?
142 | 
143 |       prompt
144 |     end
145 | 
146 |     # Displayed by init command.
147 |     def secret_key_prompt
148 |       prompt = "Enter AWS Secret Access Key:"
149 |       prompt += " [#{@secret_access_key}]" if @secret_access_key.present?
150 | 
151 |       prompt
152 |     end
153 | 
154 |     private
155 |     # Creates a bucket using the S3 API.
156 |     def create_bucket(bucket_name)
157 |       begin
158 |         s3 = AWS::S3.new
159 |         s3.buckets.create(bucket_name)
160 | 
161 |       rescue AWS::Errors::Base => s3e
162 |         raise S3AccessError.new(s3e.http_response), s3e.message
163 |       end
164 |     end
165 | 
166 |     # Deletes a bucket and its contents using the S3 API.
167 |     def delete_bucket(bucket_name)
168 |       begin
169 |         s3 = AWS::S3.new
170 |         bucket = s3.buckets[bucket_name]
171 |         bucket.delete!
172 | 
173 |       rescue AWS::Errors::Base => s3e
174 |         raise S3AccessError.new(s3e.http_response), s3e.message
175 |       end
176 |     end
177 | 
178 |     # Creates config directory and copies config templates into it.
179 |     # Saves the S3 bucket name to jobs.yml and AWS credentials to aws.yml.
180 |     def deploy_templates(bucket_name)
181 |       begin
182 |         Dir.mkdir(config_dir, 0755) unless dir_exists?
183 | 
184 |         TEMPLATE_FILES.each do |template_file|
185 |           FileUtils.cp(File.join(File.dirname(__FILE__), TEMPLATES_DIR, template_file),
186 |                        File.join(config_dir, template_file))
187 |         end
188 | 
189 |         save_config('jobs', { 'BUCKET_NAME' => bucket_name })
190 |         save_aws_config
191 | 
192 |       rescue StandardError => e
193 |         raise FileAccessError, e.message
194 |       end
195 |     end
196 | 
197 |     # Saves AWS access credentials to aws.yml unless they are configured as
198 |     # environment variables.
199 |     def save_aws_config
200 |       env_key = ENV['AWS_ACCESS_KEY_ID']
201 |       env_secret = ENV['AWS_SECRET_ACCESS_KEY']
202 | 
203 |       creds = {}
204 |       creds['ACCESS_KEY_ID'] = @access_key_id unless @access_key_id == env_key
205 |       creds['SECRET_ACCESS_KEY'] = @secret_access_key \
206 |         unless @secret_access_key == env_secret
207 | 
208 |       save_config('aws', creds)
209 |     end
210 | 
211 |     # Saves config values by overwriting placeholder values in the template.
212 |     def save_config(template, params)
213 |       config_file = File.join(config_dir, "#{template}.yml")
214 |       config = File.read(config_file)
215 | 
216 |       params.each { |key, value| config = config.gsub(key, value) }
217 | 
218 |       File.open(config_file, 'w') { |file| file.write(config) }
219 |     end
220 | 
221 |     # Deletes the config directory including its contents.
222 |     def delete_config_dir
223 |       begin
224 |         FileUtils.rm_r(config_dir) if dir_exists?
225 | 
226 |       rescue StandardError => e
227 |         raise FileAccessError, e.message
228 |       end
229 |     end
230 | 
231 |     # Notifies user of results of init or destroy commands.
232 |     def status_message(bucket_name, state)
233 |       message = ['', "Bucket s3://#{bucket_name} #{state}"]
234 |       message << "Config dir #{config_dir} #{state}"
235 | 
236 |       state = 'complete' if state == 'created'
237 |       message << "Config #{state}"
238 | 
239 |       message.join("\n")
240 |     end
241 |   end
242 | end
243 | 
--------------------------------------------------------------------------------
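For reference, the sketch below shows the cluster.yml settings that lib/elasticrawl/cluster.rb reads. The key names are taken from its config_setting and config_for_group calls; all values are illustrative (the instance types and spot bid mirror the README output) and this is not a copy of the shipped templates/cluster.yml.

```yaml
# EMR job flow settings; each is applied only if present
# (see Cluster#configure_job_flow).
ec2_key_name: 'elasticrawl'          # example EC2 key pair name
placement: 'us-east-1a'              # example availability zone
emr_ami_version: 'latest'
job_flow_role: 'EMR_EC2_DefaultRole'
service_role: 'EMR_DefaultRole'
# ec2_subnet_id: 'subnet-example'    # optional VPC subnet
# bootstrap_scripts:                 # optional list of script URIs
#   - 's3://elasticrawl/scripts/example.sh'

# Instance groups (see Cluster#instance_group). The master group has a
# single node; the task group is launched only when its instance_count
# is greater than 0.
master_instance_group:
  instance_type: 'm1.medium'
  instance_count: 1
  use_spot_instances: true
  bid_price: '0.12'
core_instance_group:
  instance_type: 'm1.medium'
  instance_count: 2
  use_spot_instances: true
  bid_price: '0.12'
task_instance_group:
  instance_type: 'm1.medium'
  instance_count: 0
```

With this layout the task group is defined but disabled; raising its instance_count above zero makes has_task_group? true, so the group is built in Cluster#initialize and added to the job flow.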