├── data
│   ├── .gitkeep
│   └── create_tables.sql
├── ap
│   ├── exception.rb
│   ├── logger.rb
│   ├── downloader.rb
│   ├── crawler.rb
│   ├── replayer.rb
│   └── importer.rb
├── config
│   ├── s3.yml.example
│   ├── ap.yml.example
│   └── database.yml.example
├── Gemfile
├── .gitignore
├── Gemfile.lock
├── posthook
│   └── posthook.rb.example
├── upload_replay.rb
├── crawl.rb
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/ap/exception.rb:
--------------------------------------------------------------------------------
class AbortException < Exception
end

--------------------------------------------------------------------------------
/config/s3.yml.example:
--------------------------------------------------------------------------------
access_key_id:
secret_access_key:
bucket:
directory: ap_replays

--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'

gem 'trollop', '2.0'
gem 'mysql2', '0.3.11'
gem 'aws-s3', '0.6.3'

--------------------------------------------------------------------------------
/config/ap.yml.example:
--------------------------------------------------------------------------------
host: electionsonline.ap.org
user:
pass:
environment: development
interval: 300
states: MA

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
config/ap.yml
config/database.yml
config/s3.yml
data/*
!data/create_tables.sql
posthook/*
!posthook/posthook.rb.example
deploy.rb

--------------------------------------------------------------------------------
/ap/logger.rb:
--------------------------------------------------------------------------------
STDOUT.sync = true

module AP
  class Logger

    def log(str)
      puts "#{Time.now.strftime('%m-%d %H:%M:%S')} - #{str}"
    end

    def err(str)
      log("ERROR: " + str)
    end

  end
end

--------------------------------------------------------------------------------
/config/database.yml.example:
--------------------------------------------------------------------------------
development:
  database: elections
  username:
  password:
  host: localhost

staging:
  database: elections
  username:
  password:
  host:

production:
  database: elections
  username:
  password:
  host:

--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
GEM
  remote: https://rubygems.org/
  specs:
    aws-s3 (0.6.3)
      builder
      mime-types
      xml-simple
    builder (3.0.4)
    mime-types (1.21)
    mysql2 (0.3.11)
    trollop (2.0)
    xml-simple (1.1.2)

PLATFORMS
  ruby

DEPENDENCIES
  aws-s3 (= 0.6.3)
  mysql2 (= 0.3.11)
  trollop (= 2.0)

--------------------------------------------------------------------------------
/posthook/posthook.rb.example:
--------------------------------------------------------------------------------
module AP
  class Posthook

    def initialize(crawler)
      @crawler = crawler
    end

    def run
      @crawler.logger.log "Running posthook #{"for #{@crawler.updated_states.keys.join(', ')}" if @crawler.updated_states.size > 0}"

      # Do stuff

      @crawler.logger.log "Finished running posthook" if @crawler.updated_states.size > 0
    end

  end
end

--------------------------------------------------------------------------------
/upload_replay.rb:
--------------------------------------------------------------------------------
require 'trollop'
require 'yaml'
require 'aws/s3'

dir = "#{File.expand_path(File.dirname(__FILE__))}"
datadir = "#{dir}/data"

params = Trollop::options do
  opt :date, "Specify date of recording to upload (e.g. 20120521)", :type => :string
end

@s3_config = YAML.load_file("#{dir}/config/s3.yml")
AWS::S3::Base.establish_connection!(:access_key_id => @s3_config['access_key_id'], :secret_access_key => @s3_config['secret_access_key'])

# Default to the most recent recording when --date isn't given
params[:date] ||= Dir.glob("#{datadir}/20*").reject{|f| f.index('.tar.gz')}.map{|f| f.split('/').last}.sort.last

puts "Gzipping and uploading replay from #{params[:date]}"
system "cd #{datadir} && tar -czf #{params[:date]}.tar.gz #{params[:date]}"
file = "#{datadir}/#{params[:date]}.tar.gz"
s3_file = "#{@s3_config['directory']}/#{params[:date]}.tar.gz"
AWS::S3::S3Object.store(s3_file, open(file), @s3_config['bucket'], :access => :private)

--------------------------------------------------------------------------------
/crawl.rb:
--------------------------------------------------------------------------------
require 'trollop'

dir = "#{File.expand_path(File.dirname(__FILE__))}"
Dir["#{dir}/ap/*.rb"].each {|f| require f }
Dir["#{dir}/posthook/*.rb"].each {|f| require f }

params = Trollop::options do
  opt :states, "Comma-separated states to download", :type => :string, :default => nil
  opt :initialize, "Create initial set of results records", :default => false
  opt :once, "Only download and import data once", :default => false
  opt :clean, "Clean the data directories for specified states before downloading", :default => false
  opt :interval, "Interval (in seconds) at which AP data will be downloaded", :type => :int, :default => nil
  opt :posthook, "Run posthook after first iteration, even if results didn't change", :default => false
  opt :record, "Record this run", :default => false
  opt :replay, "Replay the most recent run", :default => false
  opt :replaydate, "Specify date of replay to run (e.g. 20120521)", :type => :string
  opt :replaytime, "Set the results to their state at the specified time", :default => nil, :type => :string
  opt :replaytimefrom, "Run the replay from the specified time onward", :default => nil, :type => :string
  opt :replaytimeto, "Run the replay up to the specified time", :default => nil, :type => :string
end

AP::Crawler.new(dir, params).crawl

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2013, The Huffington Post
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/ap/downloader.rb:
--------------------------------------------------------------------------------
require 'net/ftp'
require 'fileutils'
require 'timeout'
require 'digest/md5'

module AP
  class Downloader

    def initialize(crawler)
      @crawler = crawler
    end

    def download
      @crawler.logger.log "Started downloading"
      connect
      @crawler.params[:states].each{|state| download_state(state)}
      disconnect
      @crawler.logger.log "Finished downloading"
    end

    private

    def connect
      @ftp = Net::FTP.new(@crawler.ap_config['host'])
      @ftp.login(@crawler.ap_config['user'], @crawler.ap_config['pass'])
      @ftp.passive = true
    end

    def disconnect
      @ftp.close
    end

    def download_state(state)
      ftp_dir = "/#{state}/dbready"
      local_dir = "#{@crawler.datadir}/#{state}"
      FileUtils.remove_dir("#{local_dir}", true) if @crawler.params[:clean]
      FileUtils.makedirs(local_dir) unless File.exists?(local_dir)

      # Downloaded files depend on parameters
      files = ["#{state}_Results.txt", "#{state}_Race.txt"]
      files += ["#{state}_Candidate.txt"] if @crawler.params[:initialize]

      download_files(files, ftp_dir, local_dir, state)
    end

    def download_files(files, ftp_dir, local_dir, state)
      files.each do |file|
        local_file = "#{local_dir}/#{file}"

        begin
          timeout(20) do
            # Use the file's mtime on the ftp server to see if it changed before downloading
            old_tm = File.exists?("#{local_file}.mtime") ? File.read("#{local_file}.mtime") : nil
            new_tm = @ftp.mtime("#{ftp_dir}/#{file}").to_i.to_s
            next if new_tm == old_tm

            @ftp.getbinaryfile("#{ftp_dir}/#{file}", "#{local_file}", 1024)

            # Hash the old and new files to see if they changed (oftentimes, files on the ftp server will have a new timestamp but be unchanged)
            old_md5 = File.exists?("#{local_file}.md5") ? File.read("#{local_file}.md5") : nil
            new_md5 = Digest::MD5.hexdigest(File.read(local_file))
            next if old_md5 == new_md5

            # Record the downloaded files for processing by the importer
            @crawler.new_files << [local_file, new_tm, new_md5]
            @crawler.updated_states[state] ||= 1
          end

        # Trap timeouts and a transient ftp error seen on higher traffic nights so they don't halt the crawler
        rescue Exception, Timeout::Error => e
          if !e.to_s.include?("The system cannot find the file")
            @crawler.logger.err "FTP ERR for #{ftp_dir}/#{file}: #{e.to_s}"
            FileUtils.rm_f("#{local_dir}/#{file}")
            disconnect
            connect
          end
        end
      end
    end

  end
end

--------------------------------------------------------------------------------
/ap/crawler.rb:
--------------------------------------------------------------------------------
module AP
  class Crawler

    STATES = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"]
    attr_accessor :dir, :datadir, :params, :env, :ap_config, :logger, :downloader, :importer, :replayer, :new_files, :updated_states

    def initialize(dir, params)
      @dir = dir
      @datadir = "#{@dir}/data"
      @params = params
      @ap_config = YAML::load(File.open("#{@dir}/config/ap.yml"))
      @env = @ap_config['environment'] && @ap_config['environment'].size > 0 ? @ap_config['environment'] : "development"

      # Set some defaults from config file
      @params[:interval] = @ap_config['interval'] if @params[:interval].nil?
      @params[:states] = @ap_config['states'] if @params[:states].nil?
      @params[:states] = (@params[:states] == 'all' ? STATES : (STATES & @params[:states].split(",")))

      # Some parameters are dependent on others
      @params[:replay] = true if @params[:replaydate] && @params[:replaydate].size > 0
      @params[:replaytimefrom] = (@params[:replaytimefrom] || @params[:replaytime] || 0).to_i
      @params[:replaytimeto] = (@params[:replaytimeto] || @params[:replaytime] || 999999).to_i
      @params[:initialize] = true if @params[:replay]
      @params[:once] = true if @params[:initialize] && !@params[:record] && !@params[:replay]
      @params[:clean] = true if @params[:record]
      @params[:initialize] = true if @params[:record]

      @logger = AP::Logger.new
      @downloader = AP::Downloader.new(self)
      @importer = AP::Importer.new(self)
      @replayer = AP::Replayer.new(self)
      @posthook = AP::Posthook.new(self) if defined?(AP::Posthook)
    end

    def crawl
      while true do
        tm_start = Time.now.to_i

        begin
          @new_files = []
          @updated_states = {}

          # Everything happens here
          @params[:replay] ? @replayer.replay : @downloader.download
          if @new_files.size > 0
            @importer.import
            @replayer.record if @params[:record]
          end

          # Run posthook if results changed or param is set
          if @posthook && (@new_files.size > 0 || @params[:posthook])
            @posthook.run
            @params[:posthook] = false
          end

          # Sleep for a bit after the first round of a replay so you can ctrl-Z and do whatever
          if @params[:initialize] && @params[:replay]
            @logger.log "Sleeping at initial state *************"
            sleep 5
          end

        rescue AbortException => e
          @logger.err e.to_s
          raise e
        rescue Exception => e
          # Reconnect to mysql if connection dropped, otherwise, log any errors and continue
          @importer.connect if e.to_s.include?('MySQL server has gone away')
          @logger.err e.to_s
        end

        @params[:clean] = false
        @params[:initialize] = false if @params[:record] || @params[:replay]
        break if @params[:once] || (@params[:replay] && @replayer.done)

        # Sleep for remaining time
        s = @params[:interval] - (Time.now.to_i - tm_start)
        @logger.log "Sleeping for #{s} seconds" if s > 0
        sleep(s < 0 ? 0 : s)
      end
    end

  end
end

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AP Elections Data Loader

Many news organizations use data from The Associated Press to power their election results reporting and real-time interactive maps. The code in this repository has been used by The Huffington Post since the 2012 Iowa caucuses to build results maps for elections including the [Republican primaries](http://elections.huffingtonpost.com/2012/primaries), the [general election](http://elections.huffingtonpost.com/2012/results) and the [Wisconsin recall](http://elections.huffingtonpost.com/2012/wisconsin-recall-results) in 2012, as well as the special elections in [South Carolina](http://elections.huffingtonpost.com/2013/mark-sanford-vs-elizabeth-colbert-busch-sc1) and [Massachusetts](http://elections.huffingtonpost.com/2013/massachusetts-senate-results) in 2013.

This repository is not affiliated with The Associated Press. You must have a contract with the AP and an account on its FTP server to use this code.

This repository has a single purpose: to get results off the AP's FTP server and into MySQL as fast as possible. It does not contain methods to query those results, and does not make assumptions about the front-end used to display the loaded data.


## Getting started

1. Install the necessary gems:

        bundle install

2. Create local copies of the example config files:

        cp config/ap.yml.example config/ap.yml
        cp config/database.yml.example config/database.yml

3. Enter your AP credentials into `config/ap.yml`, your database credentials into `config/database.yml`, and ensure the database referenced in `config/database.yml` exists locally.

4. Import the AP's current Massachusetts data:

        ruby crawl.rb --initialize --states=MA

The results data from the AP FTP server is now loaded into the `ap_races`, `ap_results` and `ap_candidates` tables in MySQL. On subsequent imports for the current election in Massachusetts, you do not need to include the `initialize` option. The full list of options is described below.
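
To verify an import, you can query the loaded tables directly. The following snippet is a minimal sketch rather than part of this repository: it assumes the `development` block of `config/database.yml` and uses the same `mysql2` gem the importer does.

    require 'mysql2'
    require 'yaml'

    config = YAML.load_file('config/database.yml')['development']
    db = Mysql2::Client.new(:host => config['host'], :username => config['username'],
                            :password => config['password'], :database => config['database'])

    # Print the ten largest vote counts among the Massachusetts results just imported
    db.query("
      select r.office_name, c.first_name, c.last_name, res.vote_count, res.winner
      from ap_results res
      inner join ap_races r on res.ap_race_id = r.id
      inner join ap_candidates c on res.ap_candidate_id = c.id
      where r.state_postal = 'MA'
      order by res.vote_count desc
      limit 10
    ").each { |row| puts row.inspect }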

## Replays

The AP conducts tests of its live results reporting in the weeks leading up to an election. With the `record` and `replay` parameters, you can record these tests and replay them at a later time, which is useful for development. Recordings can easily be stored on s3, which means you can make them accessible to other developers.

To record an AP test, start recording before the test begins, and stop it after the test is over:

    ruby crawl.rb --record

You can now replay that test at any time:

    ruby crawl.rb --replay

To store a recording on s3, create an `s3.yml` config file from the example file provided, fill in your account information, and upload the recording (the script defaults to the most recent one; pass `--date` to choose another):

    ruby upload_replay.rb

Once uploaded, you can run that replay from any machine that has a corresponding `s3.yml`:

    ruby crawl.rb --replay

By default, the newest replay will always be run, but you can change that with the `replaydate` option.

## Posthooks

Posthooks allow you to create code that is run every time new results are imported. For example, at the Huffington Post, we often bake out static pages each time results are updated.

To add a posthook, copy the example file:

    cp posthook/posthook.rb.example posthook/posthook.rb

Each time results have been updated, the `run` method in your posthook will be called. You can add any code you need to that file, and add libraries or other external dependencies to the posthook directory. (A minimal sketch of a posthook appears at the end of this README.)

## All Options

The following options are available to `crawl.rb`. Any option listed without examples is boolean and defaults to false. Times are of the form HHMMSS.

- `states`: Comma-separated states to download
  - examples: `MA`, `MA,CA`, `all`
- `initialize`: Create initial set of results records
- `once`: Only download and import data once
- `clean`: Clean the data directories for specified states before downloading
- `interval`: Interval in seconds at which AP data will be downloaded
  - examples: `300`, `600`
- `posthook`: Run posthook after first iteration, even if results didn't change
- `record`: Record this run
- `replay`: Replay the most recent run
- `replaydate`: Specify date of replay to run
  - examples: `20130521`, `20130523`
- `replaytime`: Set the results to their state at the specified time
  - examples: `190000`, `203000`
- `replaytimefrom`: Run the replay from the specified time onward
  - examples: `190000`
- `replaytimeto`: Run the replay up to the specified time
  - examples: `203000`
- `help`: Show help dialog

## Authors

- Jay Boice, jay.boice@huffingtonpost.com
- Aaron Bycoffe, bycoffe@huffingtonpost.com

## Copyright

Copyright © 2013 The Huffington Post. See LICENSE for details.
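
## Example posthook

The sketch below is a minimal posthook, not the one The Huffington Post runs in production. It relies only on accessors the crawler actually exposes (`logger`, `updated_states` and `dir`); the `bake.sh` script is a hypothetical stand-in for whatever publishing step you want to run:

    module AP
      class Posthook

        def initialize(crawler)
          @crawler = crawler
        end

        # Called after each import that produced new results, or once
        # unconditionally when the posthook option is set
        def run
          @crawler.updated_states.keys.each do |state_abbr|
            @crawler.logger.log "Rebuilding pages for #{state_abbr}"
            # bake.sh is hypothetical -- replace with your own publishing step
            system "#{@crawler.dir}/bake.sh #{state_abbr}"
          end
        end

      end
    end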

--------------------------------------------------------------------------------
/ap/replayer.rb:
--------------------------------------------------------------------------------
require 'aws/s3'
require 'yaml'

module AP
  class Replayer

    attr_accessor :done, :timekeys, :timekey_idx

    def initialize(crawler)
      @crawler = crawler
      @done = false
      @timekey_idx = 0
    end

    def replay
      raise AbortException, "Can't run replays in production environment" if ['production', 'internal'].include?(@crawler.env)

      get_replay if @timekey_idx == 0
      timekey = @timekeys[@timekey_idx]
      @crawler.logger.log "Started replaying #{timekey}"

      archive_dir = "#{@crawler.datadir}/#{@crawler.params[:replaydate]}/#{timekey}"
      new_states = Dir.glob("#{archive_dir}/*").map{|d| d.split('/').last}.uniq
      new_states = new_states & @crawler.params[:states] if @crawler.params[:states]

      new_states.each do |state_abbr|
        state_dir = "#{@crawler.datadir}/#{state_abbr}"
        system "mkdir -p #{state_dir}"
        state_archive_dir = "#{archive_dir}/#{state_abbr}"
        files = ["#{state_abbr}_Results.txt", "#{state_abbr}_Race.txt", "#{state_abbr}_Candidate.txt"]
        files.each do |file|
          archive_file = "#{state_archive_dir}/#{file.split('/').last}"
          next unless File.exists?(archive_file)
          local_file = "#{state_dir}/#{file.split('/').last}"
          system("cp #{archive_file} #{local_file}")
          @crawler.new_files << [local_file, nil, nil]
        end
        @crawler.updated_states[state_abbr] ||= 1
      end

      @timekey_idx += 1
      @done = true if @timekey_idx >= @timekeys.size
      @crawler.logger.log "Finished replaying"
    end

    def record
      @crawler.logger.log "Started recording"
      dt1 = Time.now.strftime('%Y%m%d')
      dt2 = Time.now.strftime('%H%M%S')
      @crawler.updated_states.keys.each do |state_abbr|
        record_state(state_abbr, @crawler.new_files.select{|file| file.first.index("#{state_abbr}_")}, dt1, dt2)
      end
      @crawler.logger.log "Finished recording"
    end

    private

    def get_replay
      download_latest_from_s3 if File.exists?("#{@crawler.dir}/config/s3.yml")
      @crawler.params[:replaydate] = Dir.glob("#{@crawler.datadir}/20*").reject{|f| f.index('.tar.gz')}.map{|f| f.split('/').last}.sort.last unless @crawler.params[:replaydate]
      if @crawler.params[:replaydate].nil?
        raise AbortException, "No replay was found locally or on s3, exiting"
      end
      if !File.exists?("#{@crawler.datadir}/#{@crawler.params[:replaydate]}/")
        raise AbortException, "A replay for #{@crawler.params[:replaydate]} was not found"
      end
      @timekeys = Dir.glob("#{@crawler.datadir}/#{@crawler.params[:replaydate]}/*").map{|d| d.split('/').last}.uniq.sort
      @timekeys.select! { |x| x.to_i >= @crawler.params[:replaytimefrom] && x.to_i <= @crawler.params[:replaytimeto] }
    end

    def download_latest_from_s3
      @s3_config = YAML.load_file("#{@crawler.dir}/config/s3.yml")
      begin
        AWS::S3::Base.establish_connection!(:access_key_id => @s3_config['access_key_id'], :secret_access_key => @s3_config['secret_access_key'])
        bucket = AWS::S3::Bucket.find(@s3_config['bucket'])
        s3_files = bucket.objects(:prefix => "#{@s3_config['directory']}/").map{|o| o.key.split('/')[1, 1].first}
      rescue Exception => e
        raise AbortException, e.to_s
      end
      if s3_files.size == 0
        raise AbortException, "No replays were found on s3 in the bucket and directory specified"
      end
      s3_date = @crawler.params[:replaydate] || s3_files.sort.last.split('.').first

      local_gzip = "#{@crawler.datadir}/#{s3_date}.tar.gz"
      unless File.exist?(local_gzip)
        puts "Downloading replay from #{s3_date}..."
        s3_object = bucket.objects(:prefix => "#{@s3_config['directory']}/#{s3_date}.tar.gz").first
        if s3_object.nil?
          raise AbortException, "A replay from #{s3_date} wasn't found on s3"
        end
        # Write in binary mode since the archive is a gzipped tarball
        File.open(local_gzip, 'wb') {|f| f.write(s3_object.value)}
        system "tar -zxvf #{local_gzip} -C #{@crawler.datadir}/"
      end
    end

    def record_state(state_abbr, files, dt1, dt2)
      archive_dir = "#{@crawler.datadir}/#{dt1}/#{dt2}/#{state_abbr}/"
      system "mkdir -p #{archive_dir}"
      files.each do |file|
        archive_file = "#{archive_dir}#{file.first.split('/').last}"
        system "cp #{file.first} #{archive_file}"
      end
    end

  end
end

--------------------------------------------------------------------------------
/data/create_tables.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS `ap_races` (
  `test_flag` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `id` bigint(20) NOT NULL DEFAULT '0',
  `race_number` int(11) DEFAULT NULL,
  `election_date` datetime DEFAULT NULL,
  `state_postal` varchar(2) COLLATE utf8_unicode_ci DEFAULT NULL,
  `county_number` int(11) DEFAULT NULL,
  `fips_code` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `county_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `office_id` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `race_type_id` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `seat_number` int(11) DEFAULT NULL,
  `office_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `seat_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `race_type_party` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `race_type` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `office_description` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `number_of_winners` int(11) DEFAULT NULL,
  `number_in_runoff` int(11) DEFAULT NULL,
  `precincts_reporting` int(11) DEFAULT NULL,
  `total_precincts` int(11) DEFAULT NULL,
  `last_updated` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;

CREATE TABLE IF NOT EXISTS `stage_ap_races` (
  `test_flag` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `race_county_id` bigint(20) DEFAULT NULL,
  `race_number` int(11) DEFAULT NULL,
  `election_date` datetime DEFAULT NULL,
  `state_postal` varchar(2) COLLATE utf8_unicode_ci DEFAULT NULL,
  `county_number` int(11) DEFAULT NULL,
  `fips_code` int(11) DEFAULT NULL,
  `county_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `office_id` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `race_type_id` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `seat_number` int(11) DEFAULT NULL,
  `office_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `seat_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `race_type_party` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `race_type` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `office_description` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `number_of_winners` int(11) DEFAULT NULL,
  `number_in_runoff` int(11) DEFAULT NULL,
  `precincts_reporting` int(11) DEFAULT NULL,
  `total_precincts` int(11) DEFAULT NULL,
  `ap_race_id` bigint(20) DEFAULT NULL,
  KEY `index_stage_ap_races_on_ap_race_id` (`ap_race_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;

CREATE TABLE IF NOT EXISTS `ap_results` (
  `test_flag` varchar(1) COLLATE utf8_unicode_ci NOT NULL,
  `ap_race_id` bigint(20) NOT NULL DEFAULT '0',
  `ap_candidate_id` int(11) NOT NULL DEFAULT '0',
  `party` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `incumbent` tinyint(1) DEFAULT NULL,
  `vote_count` int(11) DEFAULT NULL,
  `winner` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `natl_order` int(11) DEFAULT NULL,
  `winner_override` int(11) DEFAULT NULL,
  PRIMARY KEY (`ap_candidate_id`,`ap_race_id`),
  KEY `index_ap_results_on_ap_race_id` (`ap_race_id`),
  KEY `index_ap_results_on_ap_candidate_id` (`ap_candidate_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;

CREATE TABLE IF NOT EXISTS `stage_ap_results` (
  `test_flag` varchar(1) COLLATE utf8_unicode_ci NOT NULL,
  `race_county_id` bigint(20) DEFAULT NULL,
  `candidate_id` int(11) DEFAULT NULL,
  `party` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `incumbent` tinyint(1) DEFAULT NULL,
  `vote_count` int(11) DEFAULT NULL,
  `winner` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `natl_order` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;

CREATE TABLE IF NOT EXISTS `ap_candidates` (
  `test_flag` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `id` int(11) NOT NULL DEFAULT '0',
  `candidate_number` int(11) DEFAULT NULL,
  `first_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `middle_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `last_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `junior` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `use_junior` tinyint(1) DEFAULT NULL,
  `politician_id` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;

CREATE TABLE IF NOT EXISTS `stage_ap_candidates` (
  `test_flag` varchar(1) COLLATE utf8_unicode_ci DEFAULT NULL,
  `candidate_id` int(11) DEFAULT NULL,
  `candidate_number` int(11) DEFAULT NULL,
  `first_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `middle_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `last_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `junior` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
  `use_junior` tinyint(1) DEFAULT NULL,
  `politician_id` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
--------------------------------------------------------------------------------
/ap/importer.rb:
--------------------------------------------------------------------------------
require 'mysql2'
require 'yaml'

module AP
  class Importer

    def initialize(crawler)
      @crawler = crawler
      @db_config = YAML::load(File.open("#{@crawler.dir}/config/database.yml"))[@crawler.env]
      connect

      # Add a test_flag filter based on the environment -- important
      @test_flag_where = "test_flag #{['production', 'internal'].include?(@crawler.env) ? "= 'l'" : "in ('l', 't')"}"

      # Create tables if they don't exist when initializing
      create_tables if @crawler.params[:initialize]
    end

    def import
      @crawler.logger.log "Started importing"
      @crawler.logger.log "New data in #{@crawler.new_files.map{|file| file.first.split('/').last}.join(', ')}" if @crawler.new_files.size > 0

      @crawler.updated_states.keys.each do |state_abbr|
        @crawler.logger.log "Importing #{state_abbr}"
        stage_state(state_abbr)
        @crawler.params[:initialize] ? initialize_state(state_abbr) : merge_state(state_abbr)
      end

      # Wait to cache new files until they're fully merged so the crawler can be killed between downloading and importing
      @crawler.new_files.each do |file, tm, md5|
        File.open("#{file}.mtime", 'w') {|f| f.write(tm)}
        File.open("#{file}.md5", 'w') {|f| f.write(md5)}
      end

      @crawler.logger.log "Finished importing"
    end

    # Called with an explicit receiver from the crawler to reconnect when the
    # MySQL connection drops, so it must stay public
    def connect
      @db = Mysql2::Client.new(:host => @db_config["host"], :username => @db_config["username"], :password => @db_config["password"], :database => @db_config["database"])
    end

    private

    # Load the newly downloaded files into a staging table
    def stage_state(state_abbr)
      first_file = @crawler.new_files.select{|file| file.first.index("#{state_abbr}_")}.first.first
      state_path = first_file.split('/')[0, first_file.split('/').size - 1].join('/')

      files = [["_Race.txt", "ap_races"], ["_Results.txt", "ap_results"]]
      files += [["_Candidate.txt", "ap_candidates"]] if @crawler.params[:initialize]

      files.each do |f|
        q "truncate stage_#{f.last}"
        next unless File.exists?("#{state_path}/#{state_abbr}#{f.first}")
"#{state_path}/#{state_abbr}#{f.first}" 51 | load_data = "'#{state_path}/#{state_abbr}#{f.first}' into table stage_#{f.last} fields terminated by ';'" 52 | begin 53 | q "load data local infile #{load_data}" 54 | rescue Exception 55 | q "load data infile #{load_data}" 56 | end 57 | end 58 | 59 | q "update stage_ap_races set ap_race_id = concat(date_format(election_date, '%y%m'), race_county_id)" 60 | end 61 | 62 | # Create new records in production (non-staging) table if necessary 63 | def initialize_state(state_abbr) 64 | election_date = q("select election_date from stage_ap_races limit 1").first["election_date"].strftime("%Y-%m-%d") 65 | q "start transaction" 66 | 67 | q <<-eos 68 | delete ap_candidates from ap_candidates 69 | inner join ap_results on ap_results.ap_candidate_id = ap_candidates.id 70 | inner join ap_races on ap_results.ap_race_id = ap_races.id 71 | where ap_races.state_postal = '#{state_abbr}' and 72 | ap_races.election_date = '#{election_date}' 73 | eos 74 | 75 | q <<-eos 76 | insert into ap_candidates 77 | select * from stage_ap_candidates 78 | where stage_ap_candidates.#{@test_flag_where} 79 | eos 80 | 81 | q <<-eos 82 | delete ap_results from ap_results 83 | inner join ap_races on ap_results.ap_race_id = ap_races.id 84 | where ap_races.state_postal = '#{state_abbr}' and 85 | ap_races.election_date = '#{election_date}' 86 | eos 87 | 88 | q <<-eos 89 | insert into ap_results (test_flag, ap_race_id, ap_candidate_id, party, incumbent, vote_count, winner, natl_order) 90 | select stage_ap_results.test_flag, stage_ap_races.ap_race_id, candidate_id, party, incumbent, vote_count, winner, natl_order 91 | from stage_ap_results 92 | inner join stage_ap_races on stage_ap_results.race_county_id = stage_ap_races.race_county_id 93 | where stage_ap_races.#{@test_flag_where} and 94 | stage_ap_results.#{@test_flag_where} 95 | eos 96 | 97 | q <<-eos 98 | delete ap_races from ap_races 99 | where ap_races.state_postal = '#{state_abbr}' and 100 | ap_races.election_date = '#{election_date}' 101 | eos 102 | 103 | q <<-eos 104 | insert into ap_races (test_flag, id, race_number, election_date, state_postal, county_number, fips_code, county_name, office_id, race_type_id, seat_number, office_name, seat_name, race_type_party, race_type, office_description, number_of_winners, number_in_runoff, precincts_reporting, total_precincts, last_updated) 105 | select test_flag, ap_race_id, race_number, election_date, state_postal, county_number, fips_code, county_name, office_id, race_type_id, seat_number, office_name, seat_name, race_type_party, race_type, office_description, number_of_winners, number_in_runoff, precincts_reporting, total_precincts, now() 106 | from stage_ap_races 107 | where stage_ap_races.#{@test_flag_where} 108 | eos 109 | 110 | q "commit" 111 | end 112 | 113 | # Update records in production table based on staging table 114 | def merge_state(state_abbr) 115 | q "start transaction" 116 | 117 | q <<-eos 118 | update ap_races 119 | inner join stage_ap_races on ap_races.id = stage_ap_races.ap_race_id 120 | set 121 | ap_races.test_flag = stage_ap_races.test_flag, 122 | ap_races.race_number = stage_ap_races.race_number, 123 | ap_races.election_date = stage_ap_races.election_date, 124 | ap_races.state_postal = stage_ap_races.state_postal, 125 | ap_races.county_number = stage_ap_races.county_number, 126 | ap_races.fips_code = stage_ap_races.fips_code, 127 | ap_races.county_name = stage_ap_races.county_name, 128 | ap_races.office_id = stage_ap_races.office_id, 129 | ap_races.race_type_id = 
          ap_races.seat_number = stage_ap_races.seat_number,
          ap_races.office_name = stage_ap_races.office_name,
          ap_races.seat_name = stage_ap_races.seat_name,
          ap_races.race_type_party = stage_ap_races.race_type_party,
          ap_races.race_type = stage_ap_races.race_type,
          ap_races.office_description = stage_ap_races.office_description,
          ap_races.number_of_winners = stage_ap_races.number_of_winners,
          ap_races.number_in_runoff = stage_ap_races.number_in_runoff,
          ap_races.precincts_reporting = stage_ap_races.precincts_reporting,
          ap_races.total_precincts = stage_ap_races.total_precincts,
          ap_races.last_updated = now()
        where stage_ap_races.#{@test_flag_where};
      eos

      q <<-eos
        update ap_results
        inner join stage_ap_races on ap_results.ap_race_id = stage_ap_races.ap_race_id
        inner join stage_ap_results on stage_ap_races.race_county_id = stage_ap_results.race_county_id and ap_results.ap_candidate_id = stage_ap_results.candidate_id
        set
          ap_results.test_flag = stage_ap_results.test_flag,
          ap_results.party = stage_ap_results.party,
          ap_results.incumbent = stage_ap_results.incumbent,
          ap_results.vote_count = stage_ap_results.vote_count,
          ap_results.winner = stage_ap_results.winner,
          ap_results.natl_order = stage_ap_results.natl_order
        where stage_ap_results.#{@test_flag_where};
      eos

      q "commit"
    end

    def create_tables
      system "mysql -h #{@db_config["host"] || "localhost"} -u #{@db_config["username"]} --password=#{@db_config["password"]} #{@db_config["database"]} < #{@crawler.dir}/data/create_tables.sql"
    end

    def q(sql)
      #puts sql
      @db.query(sql)
    end

  end
end

--------------------------------------------------------------------------------