├── data
    ├── .gitkeep
    └── create_tables.sql
├── ap
    ├── exception.rb
    ├── logger.rb
    ├── downloader.rb
    ├── crawler.rb
    ├── replayer.rb
    └── importer.rb
├── config
    ├── s3.yml.example
    ├── ap.yml.example
    └── database.yml.example
├── Gemfile
├── .gitignore
├── Gemfile.lock
├── posthook
    └── posthook.rb.example
├── upload_replay.rb
├── crawl.rb
├── LICENSE
└── README.md


/data/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ap/exception.rb:
--------------------------------------------------------------------------------
1 | class AbortException < Exception
2 | end


--------------------------------------------------------------------------------
/config/s3.yml.example:
--------------------------------------------------------------------------------
1 | access_key_id:
2 | secret_access_key:
3 | bucket:
4 | directory: ap_replays
5 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | 
3 | gem 'trollop', '2.0'
4 | gem 'mysql2', '0.3.11'
5 | gem 'aws-s3', '0.6.3'


--------------------------------------------------------------------------------
/config/ap.yml.example:
--------------------------------------------------------------------------------
1 | host: electionsonline.ap.org
2 | user:
3 | pass:
4 | environment: development
5 | interval: 300
6 | states: MA
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | config/ap.yml
2 | config/database.yml
3 | config/s3.yml
4 | data/*
5 | !data/create_tables.sql
6 | posthook/*
7 | !posthook/posthook.rb.example
8 | deploy.rb
9 | 


--------------------------------------------------------------------------------
/ap/logger.rb:
--------------------------------------------------------------------------------
 1 | STDOUT.sync = true
 2 | 
 3 | module AP
 4 |   class Logger
 5 | 
 6 |     def log(str)
 7 |       puts "#{Time.now.strftime('%m-%d %H:%M:%S')} - #{str}"
 8 |     end
 9 | 
10 |     def err(str)
11 |       log("ERROR: " + str)
12 |     end
13 | 
14 |   end
15 | end


--------------------------------------------------------------------------------
/config/database.yml.example:
--------------------------------------------------------------------------------
 1 | development:
 2 |   database: elections
 3 |   username:
 4 |   password:
 5 |   host: localhost
 6 | 
 7 | staging:
 8 |   database: elections
 9 |   username:
10 |   password:
11 |   host:
12 | 
13 | production:
14 |   database: elections
15 |   username:
16 |   password:
17 |   host:
18 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
 1 | GEM
 2 |   remote: https://rubygems.org/
 3 |   specs:
 4 |     aws-s3 (0.6.3)
 5 |       builder
 6 |       mime-types
 7 |       xml-simple
 8 |     builder (3.0.4)
 9 |     mime-types (1.21)
10 |     mysql2 (0.3.11)
11 |     trollop (2.0)
12 |     xml-simple (1.1.2)
13 | 
14 | PLATFORMS
15 |   ruby
16 | 
17 | DEPENDENCIES
18 |   aws-s3 (= 0.6.3)
19 |   mysql2 (= 0.3.11)
20 |   trollop (= 2.0)
21 | 


--------------------------------------------------------------------------------
/posthook/posthook.rb.example:
--------------------------------------------------------------------------------
 1 | module AP
 2 |   class Posthook
 3 | 
 4 |     def initialize(crawler)
 5 |       @crawler = crawler
 6 |     end
 7 | 
 8 |     def run
 9 |       @crawler.logger.log "Running posthook #{"for #{@crawler.updated_states.keys.join(', ')}" if @crawler.updated_states.size > 0}"
10 | 
11 |       # Do stuff
12 | 
13 |       @crawler.logger.log "Finished running posthook" if @crawler.updated_states.size > 0
14 |     end
15 | 
16 |   end
17 | end


--------------------------------------------------------------------------------
/upload_replay.rb:
--------------------------------------------------------------------------------
 1 | require 'trollop'
 2 | require 'yaml'
 3 | require 'aws/s3'
 4 | 
 5 | dir = "#{File.expand_path(File.dirname(__FILE__))}"
 6 | datadir = "#{dir}/data"
 7 | 
 8 | params = Trollop::options do
 9 |   opt :date, "Specify date of recording to upload (e.g. 20120521)", :type => :string
10 | end
11 | 
12 | @s3_config = YAML.load_file("#{dir}/config/s3.yml")
13 | AWS::S3::Base.establish_connection!(:access_key_id => @s3_config['access_key_id'], :secret_access_key => @s3_config['secret_access_key'])
14 | 
15 | params[:date] = Dir.glob("#{datadir}/20*").reject{|f| f.index('.tar.gz')}.map{|f| f.split('/').last}.sort.last
16 | 
17 | puts "Gzipping and uploading replay from #{params[:date]}"
18 | system "cd #{datadir} && tar -czf #{params[:date]}.tar.gz #{params[:date]}"
19 | AWS::S3::Base.establish_connection!(:access_key_id => @s3_config['access_key_id'], :secret_access_key => @s3_config['secret_access_key'])
20 | file = "#{datadir}/#{params[:date]}.tar.gz"
21 | s3_file = "#{@s3_config['directory']}/#{params[:date]}.tar.gz"
22 | AWS::S3::S3Object.store(s3_file, open(file), @s3_config['bucket'], :access => :private)
23 | 


--------------------------------------------------------------------------------
/crawl.rb:
--------------------------------------------------------------------------------
 1 | require 'trollop'
 2 | 
 3 | dir = "#{File.expand_path(File.dirname(__FILE__))}"
 4 | Dir["#{dir}/ap/*.rb"].each {|f| require f }
 5 | Dir["#{dir}/posthook/*.rb"].each {|f| require f }
 6 | 
 7 | params = Trollop::options do
 8 |   opt :states, "Comma-separated states to download", :type => :string, :default => nil
 9 |   opt :initialize, "Create initial set of results records", :default => false
10 |   opt :once, "Only download and import data once", :default => false
11 |   opt :clean, "Clean the data directories for specified states before downloading", :default => false
12 |   opt :interval, "Interval (in seconds) at which AP data will be downloaded", :type => :int, :default => nil
13 |   opt :posthook, "Run posthook after first iteration, even if results didn't change", :default => false
14 |   opt :record, "Record this run", :default => false
15 |   opt :replay, "Replay the most recent run", :default => false
16 |   opt :replaydate, "Specify date of replay to run (e.g. 20120521)", :type => :string
17 |   opt :replaytime, "Set the results to their state at the specified time", :default => nil, :type => :string
18 |   opt :replaytimefrom, "Run the replay from the specified time onward", :default => nil, :type => :string
19 |   opt :replaytimeto, "Run the replay up to the specified time", :default => nil, :type => :string
20 | end
21 | 
22 | AP::Crawler.new(dir, params).crawl
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013, The Huffington Post
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 |   1. Redistributions of source code must retain the above copyright notice,
 8 |   this list of conditions and the following disclaimer.
 9 | 
10 |   2. Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/ap/downloader.rb:
--------------------------------------------------------------------------------
 1 | require 'net/ftp'
 2 | require 'fileutils'
 3 | require 'timeout'
 4 | require 'digest/md5'
 5 | 
 6 | module AP
 7 |   class Downloader
 8 | 
 9 |     def initialize(crawler)
10 |       @crawler = crawler
11 |     end
12 | 
13 |     def download
14 |       @crawler.logger.log "Started downloading"
15 |       connect
16 |       @crawler.params[:states].each{|state| download_state(state)}
17 |       disconnect
18 |       @crawler.logger.log "Finished downloading"
19 |     end
20 | 
21 |   private
22 | 
23 |     def connect
24 |       @ftp = Net::FTP.new(@crawler.ap_config['host'])
25 |       @ftp.login(@crawler.ap_config['user'], @crawler.ap_config['pass'])
26 |       @ftp.passive = true
27 |     end
28 | 
29 |     def disconnect
30 |       @ftp.close
31 |     end
32 | 
33 |     def download_state(state)
34 |       ftp_dir = "/#{state}/dbready"
35 |       local_dir = "#{@crawler.datadir}/#{state}"
36 |       FileUtils.remove_dir("#{local_dir}", true) if @crawler.params[:clean]
37 |       FileUtils.makedirs(local_dir) unless File.exists?(local_dir)
38 | 
39 |       # Downloaded files depend on parameters
40 |       files = ["#{state}_Results.txt", "#{state}_Race.txt"]
41 |       files += ["#{state}_Candidate.txt"] if @crawler.params[:initialize]
42 | 
43 |       download_files(files, ftp_dir, local_dir, state)
44 |     end
45 | 
46 |     def download_files(files, ftp_dir, local_dir, state)
47 |       files.each do |file|
48 |         local_file = "#{local_dir}/#{file}"
49 | 
50 |         begin
51 |           timeout(20) do
52 |             # Use the file's mtime on the ftp server to see if it changed before downloading
53 |             old_tm = File.exists?("#{local_file}.mtime") ? File.read("#{local_file}.mtime") : nil
54 |             new_tm = @ftp.mtime("#{ftp_dir}/#{file}").to_i.to_s
55 |             next if new_tm == old_tm
56 | 
57 |             @ftp.getbinaryfile("#{ftp_dir}/#{file}", "#{local_file}", 1024)
58 | 
59 |             # Hash the old and new files to see if they changed (oftentimes, files on the ftp server will have a new timestamp but be unchanged)
60 |             old_md5 = File.exists?("#{local_file}.md5") ? File.read("#{local_file}.md5") : nil
61 |             new_md5 = Digest::MD5.hexdigest(File.read(local_file))
62 |             next if old_md5 == new_md5
63 | 
64 |             # Record the downloaded files for processing by the importer
65 |             @crawler.new_files << [local_file, new_tm, new_md5]
66 |             @crawler.updated_states[state] ||= 1
67 |           end
68 | 
69 |         # Trap timeouts and a transient ftp error seen on higher traffic nights so they don't halt the crawler
70 |         rescue Exception, Timeout::Error => e
71 |           if !e.to_s.include?("The system cannot find the file")
72 |             @crawler.logger.err "FTP ERR for #{ftp_dir}/#{file}: #{e.to_s}"
73 |             FileUtils.rm_f("#{local_dir}/#{file}")
74 |             disconnect
75 |             connect
76 |           end
77 |         end
78 |       end
79 |     end
80 | 
81 |   end
82 | end
83 | 


--------------------------------------------------------------------------------
/ap/crawler.rb:
--------------------------------------------------------------------------------
 1 | module AP
 2 |   class Crawler
 3 | 
 4 |     STATES = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"]
 5 |     attr_accessor :dir, :datadir, :params, :env, :ap_config, :logger, :downloader, :importer, :replayer, :new_files, :updated_states
 6 | 
 7 |     def initialize(dir, params)
 8 |       @dir = dir
 9 |       @datadir = "#{@dir}/data"
10 |       @params = params
11 |       @ap_config = YAML::load(File.open("#{@dir}/config/ap.yml"))
12 |       @env = @ap_config['environment'] && @ap_config['environment'].size > 0 ? @ap_config['environment'] : "development"
13 | 
14 |       # Set some defaults from config file
15 |       @params[:interval] = @ap_config['interval'] if @params[:interval].nil?
16 |       @params[:states] = @ap_config['states'] if @params[:states].nil?
17 |       @params[:states] = (@params[:states] == 'all' ? STATES : (STATES & @params[:states].split(",")))
18 | 
19 |       # Some parameters are dependent on others
20 |       @params[:replay] = true if @params[:replaydate] && @params[:replaydate].size > 0
21 |       @params[:replaytimefrom] = (@params[:replaytimefrom] || @params[:replaytime] || 0).to_i
22 |       @params[:replaytimeto] = (@params[:replaytimeto] || @params[:replaytime] || 999999).to_i
23 |       @params[:initialize] = true if @params[:replay]
24 |       @params[:once] = true if @params[:initialize] && !@params[:record] && !@params[:replay]
25 |       @params[:clean] = true if @params[:record]
26 |       @params[:initialize] = true if @params[:record]
27 | 
28 |       @logger = AP::Logger.new
29 |       @downloader = AP::Downloader.new(self)
30 |       @importer = AP::Importer.new(self)
31 |       @replayer = AP::Replayer.new(self)
32 |       @posthook = AP::Posthook.new(self) if defined?(AP::Posthook)
33 |     end
34 | 
35 |     def crawl
36 |       while true do
37 |         tm_start = Time.now.to_i
38 | 
39 |         begin
40 |           @new_files = []
41 |           @updated_states = {}
42 | 
43 |           # Everything happens here
44 |           @params[:replay] ? @replayer.replay : @downloader.download
45 |           if @new_files.size > 0
46 |             @importer.import
47 |             @replayer.record if @params[:record]
48 |           end
49 | 
50 |           # Run posthook if results changed or param is set
51 |           if @posthook && (@new_files.size > 0 || @params[:posthook])
52 |             @posthook.run
53 |             @params[:posthook] = false
54 |           end
55 | 
56 |           # Sleep for a bit after the first round of a replay so you can ctrl-Z and do whatever
57 |           if @params[:initialize] && @params[:replay]
58 |             @logger.log "Sleeping at initial state *************"
59 |             sleep 5
60 |           end
61 | 
62 |         rescue AbortException => e
63 |           @logger.err e.to_s
64 |           raise e
65 |         rescue Exception => e
66 |           # Reconnect to mysql if connection dropped, otherwise, log any errors and continue
67 |           @importer.connect if e.to_s.include?('MySQL server has gone away')
68 |           @logger.err e.to_s
69 |         end
70 | 
71 |         @params[:clean] = false
72 |         @params[:initialize] = false if @params[:record] || @params[:replay]
73 |         break if @params[:once] || (@params[:replay] && @replayer.done)
74 | 
75 |         # Sleep for remaining time
76 |         s = @params[:interval] - (Time.now.to_i - tm_start)
77 |         @logger.log "Sleeping for #{s} seconds" if s > 0
78 |         sleep(s < 0 ? 0 : s)
79 |       end
80 |     end
81 | 
82 |   end
83 | end
84 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # AP Elections Data Loader
 2 | 
 3 | Many news organizations use data from The Associated Press to power their election results reporting and real-time interactive maps. The code in this repository has been used by The Huffington Post since the 2012 Iowa caucuses to build results maps for elections including the [Republican primaries](http://elections.huffingtonpost.com/2012/primaries), the [general election](http://elections.huffingtonpost.com/2012/results) and the [Wisconsin recall](http://elections.huffingtonpost.com/2012/wisconsin-recall-results) in 2012 as well as the special elections in [South Carolina](http://elections.huffingtonpost.com/2013/mark-sanford-vs-elizabeth-colbert-busch-sc1) and [Massachusetts](http://elections.huffingtonpost.com/2013/massachusetts-senate-results) in 2013.
 4 | 
 5 | This repository is not affiliated with The Associated Press. You must have a contract with the AP and an account on its FTP server to use this code.
 6 | 
 7 | This repository has a single purpose: to get results off the AP's FTP server and into MySQL as fast as possible. It does not contain methods to query those results, and does not make assumptions about the front-end used to display the loaded data.
 8 | 
 9 | 
10 | ## Getting started
11 | 
12 | 1. Install the necessary gems:
13 | 
14 | 		bundle install
15 | 
16 | 2. Create local copies of the example config files:
17 | 
18 | 		cp config/ap.yml.example config/ap.yml
19 | 		cp config/database.yml.example config/database.yml
20 | 
21 | 3. Enter your AP credentials into `config/ap.yml`, your database credentials into `config/database.yml`, and ensure the database referenced in database.yml exists locally.
22 | 
23 | 4. Import the AP's current Massachusetts data:
24 | 
25 | 		ruby crawl.rb --initialize --states=MA
26 | 
27 | The results data from the AP FTP server is now loaded into the `ap_races`, `ap_results` and `ap_candidates` tables in MySQL. On subsequent imports for the current election in Massachusetts, you do not need to include the `initialize` option. The full list of options is described below.
28 | 
29 | ## Replays
30 | 
31 | The AP conducts tests of its live results reporting in the weeks leading up to an election. With the `record` and `replay` parameters, you can record these tests and replay them at a later time, which is useful for development. Recordings can be easily stored on s3, which means you can make them accessible to other developers.
32 | 
33 | To record an AP test, start recording before the test begins, and stop it after the test is over:
34 | 
35 | 	ruby crawl.rb --record
36 | 
37 | You can now replay that test at any time:
38 | 
39 | 	ruby crawl.rb --replay
40 | 
41 | To store the recording on s3, create an `s3.yml` config file from the example file provided, fill in your account information, and upload it:
42 | 
43 | 	ruby upload_replay.rb
44 | 
45 | Once uploaded, you can run that replay from any machine that has a corresponding `s3.yml`:
46 | 
47 | 	ruby crawl.rb --replay
48 | 
49 | By default, the newest replay will always be run, but you can change that with the `replaydate` option.
50 | 
51 | ## Posthooks
52 | 
53 | Posthooks allow you to create code that is run every time new results are imported. For example, at the Huffington Post, we often bake out static pages each time results are updated.
54 | 
55 | To add a posthook, copy the example file:
56 | 
57 | 	cp posthook/posthook.rb.example posthook/posthook.rb
58 | 
59 | Each time results have been updated, the `run` method in your posthook will be called. You can add any code you need to that file, and add libraries or other external dependencies to the posthook directory.
60 | 
61 | ## All Options
62 | 
63 | The following options are available to `crawl.rb`. Any option listed without examples is boolean and defaults to false.
64 | 
65 | - `states`: Comma-separated states to download
66 |     - examples: `MA`, `MA,CA`, `all`
67 | - `initialize`: Create initial set of results records
68 | - `once`: Only download and import data once
69 | - `clean`: Clean the data directories for specified states before downloading
70 | - `interval`: Interval in seconds at which AP data will be downloaded
71 |     - examples: `300`, `600`
72 | - `posthook`: Run posthook after first iteration, even if results didn't change
73 | - `record`: Record this run
74 | - `replay`: Replay the most recent run
75 | - `replaydate`: Specify date of replay to run
76 |     - examples: `20130521`, `20130523`
77 | - `replaytime`: Set the results to their state at the specified time.
78 | - `replaytimefrom`: Run the replay from the specified time onward.
79 | - `replaytimeto`: Run the replay up to the specified time.
80 | - `help`: Show help dialog
81 | 
82 | ## Authors
83 | 
84 | - Jay Boice, jay.boice@huffingtonpost.com
85 | - Aaron Bycoffe, bycoffe@huffingtonpost.com
86 | 
87 | ## Copyright
88 | 
89 | Copyright &copy; 2013 The Huffington Post. See LICENSE for details.
90 | 


--------------------------------------------------------------------------------
/ap/replayer.rb:
--------------------------------------------------------------------------------
  1 | require 'aws/s3'
  2 | require 'yaml'
  3 | 
  4 | module AP
  5 |   class Replayer
  6 | 
  7 |     attr_accessor :done, :timekeys, :timekey_idx
  8 | 
  9 |     def initialize(crawler)
 10 |       @crawler = crawler
 11 |       @done = false
 12 |       @timekey_idx = 0
 13 |     end
 14 | 
 15 |     def replay
 16 |       raise AbortException, "Can't run replays in production environment" if ['production', 'internal'].include?(@crawler.env)
 17 | 
 18 |       get_replay if @timekey_idx == 0
 19 |       timekey = @timekeys[@timekey_idx]
 20 |       @crawler.logger.log "Started replaying #{timekey}"
 21 | 
 22 |       archive_dir = "#{@crawler.datadir}/#{@crawler.params[:replaydate]}/#{timekey}"
 23 |       new_states = Dir.glob("#{archive_dir}/*").map{|d| d.split('/').last}.uniq
 24 |       new_states = new_states & @crawler.params[:states] if @crawler.params[:states]
 25 | 
 26 |       new_states.each do |state_abbr|
 27 |         state_dir = "#{@crawler.datadir}/#{state_abbr}"
 28 |         system "mkdir -p #{state_dir}"
 29 |         state_archive_dir = "#{archive_dir}/#{state_abbr}"
 30 |         files = ["#{state_abbr}_Results.txt", "#{state_abbr}_Race.txt", "#{state_abbr}_Candidate.txt"]
 31 |         files.each do |file|
 32 |           archive_file = "#{state_archive_dir}/#{file.split('/').last}"
 33 |           next unless File.exists?(archive_file)
 34 |           local_file = "#{state_dir}/#{file.split('/').last}"
 35 |           system("cp #{archive_file} #{local_file}")
 36 |           @crawler.new_files << [local_file, nil, nil]
 37 |         end
 38 |         @crawler.updated_states[state_abbr] ||= 1
 39 |       end
 40 | 
 41 |       @timekey_idx += 1
 42 |       @done = true if @timekey_idx >= @timekeys.size
 43 |       @crawler.logger.log "Finished replaying"
 44 |     end
 45 | 
 46 |     def record
 47 |       @crawler.logger.log "Started recording"
 48 |       dt1 = Time.now.strftime('%Y%m%d')
 49 |       dt2 = Time.now.strftime('%H%M%S')
 50 |       @crawler.updated_states.keys.each do |state_abbr|
 51 |         record_state(state_abbr, @crawler.new_files.select{|file| file.first.index("#{state_abbr}_")}, dt1, dt2)
 52 |       end
 53 |       @crawler.logger.log "Finished recording"
 54 |     end
 55 | 
 56 |   private
 57 | 
 58 |     def get_replay
 59 |       download_latest_from_s3 if File.exists?("#{@crawler.dir}/config/s3.yml")
 60 |       @crawler.params[:replaydate] = Dir.glob("#{@crawler.datadir}/20*").reject{|f| f.index('.tar.gz')}.map{|f| f.split('/').last}.sort.last unless @crawler.params[:replaydate]
 61 |       if @crawler.params[:replaydate].nil?
 62 |         raise AbortException, "No replay was found locally or on s3, exiting"
 63 |       end
 64 |       if !File.exists?("#{@crawler.datadir}/#{@crawler.params[:replaydate]}/")
 65 |         raise AbortException, "A replay for #{@crawler.params[:replaydate]} was not found"
 66 |       end
 67 |       @timekeys = Dir.glob("#{@crawler.datadir}/#{@crawler.params[:replaydate]}/*").map{|d| d.split('/').last}.uniq.sort
 68 |       @timekeys.select! { |x| x.to_i >= @crawler.params[:replaytimefrom] && x.to_i <= @crawler.params[:replaytimeto] }
 69 |     end
 70 | 
 71 |     def download_latest_from_s3
 72 |       @s3_config = YAML.load_file("#{@crawler.dir}/config/s3.yml")
 73 |       begin
 74 |         AWS::S3::Base.establish_connection!(:access_key_id => @s3_config['access_key_id'], :secret_access_key => @s3_config['secret_access_key'])
 75 |         bucket = AWS::S3::Bucket.find(@s3_config['bucket'])
 76 |         s3_files = bucket.objects(:prefix => "#{@s3_config['directory']}/").map{|o| o.key.split('/')[1, 1].first}
 77 |       rescue Exception => e
 78 |         raise AbortException, e.to_s
 79 |       end
 80 |       if s3_files.size == 0
 81 |         raise AbortException, "No replays were found on s3 in the bucket and directory specified"
 82 |       end
 83 |       s3_date = @crawler.params[:replaydate] || s3_files.sort.last.split('.').first
 84 | 
 85 |       local_gzip = "#{@crawler.datadir}/#{s3_date}.tar.gz"
 86 |       unless File.exist?(local_gzip)
 87 |         puts "Downloading replay from #{s3_date}..."
 88 |         s3_object = bucket.objects(:prefix => "#{@s3_config['directory']}/#{s3_date}.tar.gz").first
 89 |         if s3_object.nil?
 90 |           raise AbortException, "A replay from #{s3_date} wasn't found on s3"
 91 |         end
 92 |         File.open(local_gzip, 'w') {|f| f.write(s3_object.value)}
 93 |         system "tar -zxvf #{local_gzip} -C #{@crawler.datadir}/"
 94 |       end
 95 |     end
 96 | 
 97 |     def record_state(state_abbr, files, dt1, dt2)
 98 |       archive_dir = "#{@crawler.datadir}/#{dt1}/#{dt2}/#{state_abbr}/"
 99 |       system "mkdir -p #{archive_dir}"
100 |       files.each do |file|
101 |         archive_file = "#{archive_dir}#{file.first.split('/').last}"
102 |         system "cp #{file.first} #{archive_file}"
103 |       end
104 |     end
105 | 
106 |   end
107 | end
108 | 


--------------------------------------------------------------------------------
/data/create_tables.sql:
--------------------------------------------------------------------------------
  1 | CREATE TABLE IF NOT EXISTS `ap_races` (
  2 |   `test_flag` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  3 |   `id` bigint(20) NOT NULL DEFAULT '0',
  4 |   `race_number` int(11) DEFAULT NULL,
  5 |   `election_date` datetime DEFAULT NULL,
  6 |   `state_postal` varchar(2) COLLATE utf8_unicode_ci DEFAULT NULL,
  7 |   `county_number` int(11) DEFAULT NULL,
  8 |   `fips_code` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  9 |   `county_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 10 |   `office_id` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 11 |   `race_type_id` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 12 |   `seat_number` int(11) DEFAULT NULL,
 13 |   `office_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 14 |   `seat_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 15 |   `race_type_party` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 16 |   `race_type` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 17 |   `office_description` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 18 |   `number_of_winners` int(11) DEFAULT NULL,
 19 |   `number_in_runoff` int(11) DEFAULT NULL,
 20 |   `precincts_reporting` int(11) DEFAULT NULL,
 21 |   `total_precincts` int(11) DEFAULT NULL,
 22 |   `last_updated` datetime DEFAULT NULL,
 23 |   PRIMARY KEY (`id`)
 24 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
 25 | 
 26 | CREATE TABLE IF NOT EXISTS `stage_ap_races` (
 27 |   `test_flag` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 28 |   `race_county_id` bigint(20) DEFAULT NULL,
 29 |   `race_number` int(11) DEFAULT NULL,
 30 |   `election_date` datetime DEFAULT NULL,
 31 |   `state_postal` varchar(2) COLLATE utf8_unicode_ci DEFAULT NULL,
 32 |   `county_number` int(11) DEFAULT NULL,
 33 |   `fips_code` int(11) DEFAULT NULL,
 34 |   `county_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 35 |   `office_id` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 36 |   `race_type_id` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 37 |   `seat_number` int(11) DEFAULT NULL,
 38 |   `office_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 39 |   `seat_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 40 |   `race_type_party` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 41 |   `race_type` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 42 |   `office_description` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 43 |   `number_of_winners` int(11) DEFAULT NULL,
 44 |   `number_in_runoff` int(11) DEFAULT NULL,
 45 |   `precincts_reporting` int(11) DEFAULT NULL,
 46 |   `total_precincts` int(11) DEFAULT NULL,
 47 |   `ap_race_id` bigint(20) DEFAULT NULL,
 48 |   KEY `index_stage_ap_races_on_ap_race_id` (`ap_race_id`)
 49 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
 50 | 
 51 | CREATE TABLE IF NOT EXISTS `ap_results` (
 52 |   `test_flag` varchar(1) COLLATE utf8_unicode_ci NOT NULL,
 53 |   `ap_race_id` bigint(20) NOT NULL DEFAULT '0',
 54 |   `ap_candidate_id` int(11) NOT NULL DEFAULT '0',
 55 |   `party` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 56 |   `incumbent` tinyint(1) DEFAULT NULL,
 57 |   `vote_count` int(11) DEFAULT NULL,
 58 |   `winner` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 59 |   `natl_order` int(11) DEFAULT NULL,
 60 |   `winner_override` int(11) DEFAULT NULL,
 61 |   PRIMARY KEY (`ap_candidate_id`,`ap_race_id`),
 62 |   KEY `index_ap_results_on_ap_race_id` (`ap_race_id`),
 63 |   KEY `index_ap_results_on_ap_candidate_id` (`ap_candidate_id`)
 64 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
 65 | 
 66 | CREATE TABLE IF NOT EXISTS `stage_ap_results` (
 67 |   `test_flag` varchar(1) COLLATE utf8_unicode_ci NOT NULL,
 68 |   `race_county_id` bigint(20) DEFAULT NULL,
 69 |   `candidate_id` int(11) DEFAULT NULL,
 70 |   `party` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 71 |   `incumbent` tinyint(1) DEFAULT NULL,
 72 |   `vote_count` int(11) DEFAULT NULL,
 73 |   `winner` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 74 |   `natl_order` int(11) DEFAULT NULL
 75 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
 76 | 
 77 | CREATE TABLE IF NOT EXISTS `ap_candidates` (
 78 |   `test_flag` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 79 |   `id` int(11) NOT NULL DEFAULT '0',
 80 |   `candidate_number` int(11) DEFAULT NULL,
 81 |   `first_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 82 |   `middle_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 83 |   `last_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 84 |   `junior` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 85 |   `use_junior` tinyint(1) DEFAULT NULL,
 86 |   `politician_id` int(11) DEFAULT NULL,
 87 |   PRIMARY KEY (`id`)
 88 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
 89 | 
 90 | CREATE TABLE IF NOT EXISTS `stage_ap_candidates` (
 91 |   `test_flag` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
 92 |   `candidate_id` int(11) DEFAULT NULL,
 93 |   `candidate_number` int(11) DEFAULT NULL,
 94 |   `first_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 95 |   `middle_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 96 |   `last_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 97 |   `junior` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
 98 |   `use_junior` tinyint(1) DEFAULT NULL,
 99 |   `politician_id` int(11) DEFAULT NULL
100 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
101 | 


--------------------------------------------------------------------------------
/ap/importer.rb:
--------------------------------------------------------------------------------
  1 | require 'mysql2'
  2 | require 'yaml'
  3 | 
  4 | module AP
  5 |   class Importer
  6 | 
  7 |     def initialize(crawler)
  8 |       @crawler = crawler
  9 |       @db_config = YAML::load(File.open("#{@crawler.dir}/config/database.yml"))[@crawler.env]
 10 |       connect
 11 | 
 12 |       # Add a test_flag filter based on the environment -- important
 13 |       @test_flag_where = "test_flag #{['production', 'internal'].include?(@crawler.env) ? "= 'l'" : "in ('l', 't')"}"
 14 | 
 15 |       # Create tables if they don't exist when initializing
 16 |       create_tables if @crawler.params[:initialize]
 17 |     end
 18 | 
 19 |     def import
 20 |       @crawler.logger.log "Started importing"
 21 |       @crawler.logger.log "New data in #{@crawler.new_files.map{|file| file.first.split('/').last}.join(', ')}" if @crawler.new_files.size > 0
 22 | 
 23 |       @crawler.updated_states.keys.each do |state_abbr|
 24 |         @crawler.logger.log "Importing #{state_abbr}"
 25 |         stage_state(state_abbr)
 26 |         @crawler.params[:initialize] ? initialize_state(state_abbr) : merge_state(state_abbr)
 27 |       end
 28 | 
 29 |       # Wait to cache new files until they're fully merged so the crawler can be killed between downloading and importing
 30 |       @crawler.new_files.each do |file, tm, md5|
 31 |         File.open("#{file}.mtime", 'w') {|f| f.write(tm)}
 32 |         File.open("#{file}.md5", 'w') {|f| f.write(md5)}
 33 |       end
 34 | 
 35 |       @crawler.logger.log "Finished importing"
 36 |     end
 37 | 
 38 |   private
 39 | 
 40 |     # Load the newly downloaded files into a staging table
 41 |     def stage_state(state_abbr)
 42 |       first_file = @crawler.new_files.select{|file| file.first.index("#{state_abbr}_")}.first.first
 43 |       state_path = first_file.split('/')[0, first_file.split('/').size - 1].join('/')
 44 | 
 45 |       files = [["_Race.txt", "ap_races"], ["_Results.txt", "ap_results"]]
 46 |       files += [["_Candidate.txt", "ap_candidates"]] if @crawler.params[:initialize]
 47 | 
 48 |       files.each do |f|
 49 |         q "truncate stage_#{f.last}"
 50 |         next unless File.exists? "#{state_path}/#{state_abbr}#{f.first}"
 51 |         load_data = "'#{state_path}/#{state_abbr}#{f.first}' into table stage_#{f.last} fields terminated by ';'"
 52 |         begin
 53 |           q "load data local infile #{load_data}"
 54 |         rescue Exception
 55 |           q "load data infile #{load_data}"
 56 |         end
 57 |       end
 58 | 
 59 |       q "update stage_ap_races set ap_race_id = concat(date_format(election_date, '%y%m'), race_county_id)"
 60 |     end
 61 | 
 62 |     # Create new records in production (non-staging) table if necessary
 63 |     def initialize_state(state_abbr)
 64 |       election_date = q("select election_date from stage_ap_races limit 1").first["election_date"].strftime("%Y-%m-%d")
 65 |       q "start transaction"
 66 | 
 67 |       q <<-eos
 68 |         delete ap_candidates from ap_candidates
 69 |           inner join ap_results on ap_results.ap_candidate_id = ap_candidates.id
 70 |           inner join ap_races on ap_results.ap_race_id = ap_races.id
 71 |         where ap_races.state_postal = '#{state_abbr}' and
 72 |           ap_races.election_date = '#{election_date}'
 73 |       eos
 74 | 
 75 |       q <<-eos
 76 |         insert into ap_candidates
 77 |           select * from stage_ap_candidates
 78 |           where stage_ap_candidates.#{@test_flag_where}
 79 |       eos
 80 | 
 81 |       q <<-eos
 82 |         delete ap_results from ap_results
 83 |           inner join ap_races on ap_results.ap_race_id = ap_races.id
 84 |         where ap_races.state_postal = '#{state_abbr}' and
 85 |           ap_races.election_date = '#{election_date}'
 86 |       eos
 87 | 
 88 |       q <<-eos
 89 |         insert into ap_results (test_flag, ap_race_id, ap_candidate_id, party, incumbent, vote_count, winner, natl_order)
 90 |         select stage_ap_results.test_flag, stage_ap_races.ap_race_id, candidate_id, party, incumbent, vote_count, winner, natl_order
 91 |         from stage_ap_results
 92 |           inner join stage_ap_races on stage_ap_results.race_county_id = stage_ap_races.race_county_id
 93 |         where stage_ap_races.#{@test_flag_where} and
 94 |           stage_ap_results.#{@test_flag_where}
 95 |       eos
 96 | 
 97 |       q <<-eos
 98 |         delete ap_races from ap_races
 99 |         where ap_races.state_postal = '#{state_abbr}' and
100 |           ap_races.election_date = '#{election_date}'
101 |       eos
102 | 
103 |       q <<-eos
104 |         insert into ap_races (test_flag, id, race_number, election_date, state_postal, county_number, fips_code, county_name, office_id, race_type_id, seat_number, office_name, seat_name, race_type_party, race_type, office_description, number_of_winners, number_in_runoff, precincts_reporting, total_precincts, last_updated)
105 |         select test_flag, ap_race_id, race_number, election_date, state_postal, county_number, fips_code, county_name, office_id, race_type_id, seat_number, office_name, seat_name, race_type_party, race_type, office_description, number_of_winners, number_in_runoff, precincts_reporting, total_precincts, now()
106 |         from stage_ap_races
107 |         where stage_ap_races.#{@test_flag_where}
108 |       eos
109 | 
110 |       q "commit"
111 |     end
112 | 
113 |     # Update records in production table based on staging table
114 |     def merge_state(state_abbr)
115 |       q "start transaction"
116 | 
117 |       q <<-eos
118 |         update ap_races
119 |           inner join stage_ap_races on ap_races.id = stage_ap_races.ap_race_id
120 |         set
121 |           ap_races.test_flag = stage_ap_races.test_flag,
122 |           ap_races.race_number = stage_ap_races.race_number,
123 |           ap_races.election_date = stage_ap_races.election_date,
124 |           ap_races.state_postal = stage_ap_races.state_postal,
125 |           ap_races.county_number = stage_ap_races.county_number,
126 |           ap_races.fips_code = stage_ap_races.fips_code,
127 |           ap_races.county_name = stage_ap_races.county_name,
128 |           ap_races.office_id = stage_ap_races.office_id,
129 |           ap_races.race_type_id = stage_ap_races.race_type_id,
130 |           ap_races.seat_number = stage_ap_races.seat_number,
131 |           ap_races.office_name = stage_ap_races.office_name,
132 |           ap_races.seat_name = stage_ap_races.seat_name,
133 |           ap_races.race_type_party = stage_ap_races.race_type_party,
134 |           ap_races.race_type = stage_ap_races.race_type,
135 |           ap_races.office_description = stage_ap_races.office_description,
136 |           ap_races.number_of_winners = stage_ap_races.number_of_winners,
137 |           ap_races.number_in_runoff = stage_ap_races.number_in_runoff,
138 |           ap_races.precincts_reporting = stage_ap_races.precincts_reporting,
139 |           ap_races.total_precincts = stage_ap_races.total_precincts,
140 |           ap_races.last_updated = now()
141 |         where stage_ap_races.#{@test_flag_where};
142 |       eos
143 | 
144 |       q <<-eos
145 |         update ap_results
146 |           inner join stage_ap_races on ap_results.ap_race_id = stage_ap_races.ap_race_id
147 |           inner join stage_ap_results on stage_ap_races.race_county_id = stage_ap_results.race_county_id and ap_results.ap_candidate_id = stage_ap_results.candidate_id
148 |         set
149 |           ap_results.test_flag = stage_ap_results.test_flag,
150 |           ap_results.party = stage_ap_results.party,
151 |           ap_results.incumbent = stage_ap_results.incumbent,
152 |           ap_results.vote_count = stage_ap_results.vote_count,
153 |           ap_results.winner = stage_ap_results.winner,
154 |           ap_results.natl_order = stage_ap_results.natl_order
155 |         where stage_ap_results.#{@test_flag_where};
156 |       eos
157 | 
158 |       q "commit"
159 |     end
160 | 
161 |     def create_tables
162 |       system  "mysql -h #{@db_config["host"] || "localhost"} -u #{@db_config["username"]} --password=#{@db_config["password"]} #{@db_config["database"]} < #{@crawler.dir}/data/create_tables.sql"
163 |     end
164 | 
165 |     def connect
166 |       @db = Mysql2::Client.new(:host => @db_config["host"], :username => @db_config["username"], :password => @db_config["password"], :database => @db_config["database"])
167 |     end
168 | 
169 |     def q(sql)
170 |       #puts sql
171 |       @db.query(sql)
172 |     end
173 | 
174 |   end
175 | end


--------------------------------------------------------------------------------