├── log └── .gitkeep ├── .ruby-version ├── .ruby-gemset ├── Procfile ├── public ├── javascripts │ ├── application.js │ └── jquery-ujs.js └── favicon.ico ├── config ├── unicorn.rb ├── puma.rb ├── env.rb ├── boot.rb └── apps.rb ├── spec ├── fixtures │ ├── geo_no_files │ │ └── data.yaml │ ├── minimal │ │ └── data.yaml │ ├── cities_with_yml │ │ ├── more.csv │ │ ├── data.yml │ │ ├── cities51-100.csv │ │ └── cities50.csv │ ├── import_with_options │ │ ├── more_cities.csv │ │ ├── cities4.csv │ │ └── data.yaml │ ├── cities_without_yml │ │ ├── more.csv │ │ ├── cities51-100.csv │ │ └── cities50.csv │ ├── import_with_dictionary │ │ ├── more.csv │ │ ├── data.yaml │ │ ├── cities51-100.csv │ │ └── cities50.csv │ ├── invalid_utf8.csv │ ├── bom │ │ ├── bom.csv │ │ └── data.yaml │ ├── calculated_columns │ │ ├── schools.csv │ │ └── data.yaml │ ├── types │ │ ├── places.csv │ │ └── data.yaml │ ├── import_with_errors │ │ ├── cities4.csv │ │ └── data.yaml │ ├── import_with_null_value │ │ ├── null_values.csv │ │ └── data.yaml │ ├── school_names │ │ ├── data.yaml │ │ └── school_names.csv │ ├── geo │ │ ├── places.csv │ │ └── data.yaml │ ├── numeric_data │ │ └── data.yaml │ ├── nested_files │ │ ├── school2011.csv │ │ ├── school2012.csv │ │ ├── data.yaml │ │ ├── school2013.csv │ │ └── school-data.csv │ ├── nested2 │ │ ├── data.yaml │ │ └── school2013.csv │ ├── data.rb │ ├── schools │ │ ├── schools.csv │ │ └── data.yaml │ └── sample-data │ │ └── data.yaml ├── lib │ ├── expression │ │ ├── variables_spec.rb │ │ ├── eval_spec.rb │ │ └── parser_spec.rb │ ├── data_magic │ │ ├── index │ │ │ ├── importer_spec.rb │ │ │ ├── event_logger_spec.rb │ │ │ ├── document_spec.rb │ │ │ └── repository_spec.rb │ │ ├── example_spec.rb │ │ ├── import_csv_spec.rb │ │ ├── name_type_spec.rb │ │ ├── calculated_columns_spec.rb │ │ ├── create_index_spec.rb │ │ ├── import_with_nested_files_spec.rb │ │ ├── import_without_data_yaml_spec.rb │ │ ├── config_field_types_spec.rb │ │ ├── search_name_spec.rb │ │ └── config_spec.rb 
│ ├── zipcode_spec.rb │ ├── data_magic_spec.rb │ ├── expression_spec.rb │ └── nested_hash_spec.rb ├── spec.rake ├── tasks │ └── import_spec.rb ├── spec_helper.rb └── features │ └── web_spec.rb ├── doc ├── csv-download.png └── data-overview.png ├── Rakefile ├── .components ├── script ├── bomstrip.sh ├── makeutf8.sh ├── s3push ├── s3pull ├── s3config.rb └── bootstrap ├── config.ru ├── manifest-dev.yml ├── manifest-staging.yml ├── manifest-production.yml ├── manifest-ex.yml ├── manifest-indexing.yml ├── app ├── views │ ├── layouts │ │ └── application.erb │ ├── home.liquid │ └── category.liquid ├── index_app.rb ├── app.rb ├── stylesheets │ └── application.sass └── controllers.rb ├── .rubocop.yml ├── bin └── open-data-maker ├── lib ├── expression │ ├── variables.rb │ ├── eval.rb │ ├── expression.rb │ └── parser.rb ├── data_magic │ ├── category.rb │ ├── example.rb │ ├── index │ │ ├── builder_data.rb │ │ ├── event_logger.rb │ │ ├── document.rb │ │ ├── super_client.rb │ │ ├── output.rb │ │ ├── repository.rb │ │ ├── row_importer.rb │ │ ├── importer.rb │ │ └── document_builder.rb │ ├── index.rb │ ├── error_checker.rb │ └── query_builder.rb ├── sass_initializer.rb ├── zipcode │ └── zipcode.rb └── nested_hash.rb ├── .gitignore ├── tasks ├── es.rake └── import.rake ├── circle.yml ├── NOTES.md ├── Gemfile ├── LICENSE.md ├── DICTIONARY.md ├── notes.txt ├── sample-data └── data.yaml ├── INSTALL.md ├── README.md └── Gemfile.lock /log/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 2.2.4 2 | -------------------------------------------------------------------------------- /.ruby-gemset: -------------------------------------------------------------------------------- 1 | open-data-maker 2 | 
-------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: bundle exec puma -C config/puma.rb 2 | -------------------------------------------------------------------------------- /public/javascripts/application.js: -------------------------------------------------------------------------------- 1 | // Put your application scripts here -------------------------------------------------------------------------------- /config/unicorn.rb: -------------------------------------------------------------------------------- 1 | worker_processes 5 2 | timeout 30 3 | preload_app true 4 | -------------------------------------------------------------------------------- /spec/fixtures/geo_no_files/data.yaml: -------------------------------------------------------------------------------- 1 | # data.yaml for geo tests 2 | index: place-data 3 | -------------------------------------------------------------------------------- /spec/fixtures/minimal/data.yaml: -------------------------------------------------------------------------------- 1 | # smallest possible data.yaml 2 | index: my-index 3 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/18F/open-data-maker/HEAD/public/favicon.ico -------------------------------------------------------------------------------- /doc/csv-download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/18F/open-data-maker/HEAD/doc/csv-download.png -------------------------------------------------------------------------------- /doc/data-overview.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/18F/open-data-maker/HEAD/doc/data-overview.png -------------------------------------------------------------------------------- /spec/fixtures/cities_with_yml/more.csv: -------------------------------------------------------------------------------- 1 | state,city,lat,lon 2 | CA,Secret City,37.727239,-123.032229 3 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_options/more_cities.csv: -------------------------------------------------------------------------------- 1 | USPS,GEOID,ANSICODE,NAME,POP10 2 | XX,0,0,YY,0 3 | -------------------------------------------------------------------------------- /spec/fixtures/cities_without_yml/more.csv: -------------------------------------------------------------------------------- 1 | state,city,lat,lon 2 | CA,Secret City,37.727239,-123.032229 3 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_dictionary/more.csv: -------------------------------------------------------------------------------- 1 | state,city,lat,lon 2 | CA,Secret City,37.727239,-123.032229 3 | -------------------------------------------------------------------------------- /spec/fixtures/invalid_utf8.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/18F/open-data-maker/HEAD/spec/fixtures/invalid_utf8.csv -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler/setup' 2 | require 'padrino-core/cli/rake' 3 | 4 | 5 | task :default => :spec 6 | 7 | PadrinoTasks.init 8 | -------------------------------------------------------------------------------- /spec/fixtures/bom/bom.csv: -------------------------------------------------------------------------------- 1 | UNITID,VAL 2 | 100654,00100200 3 | 
100663,00105200 4 | 100690,02503400 5 | 100706,00105500 6 | 100724,00100500 7 | -------------------------------------------------------------------------------- /spec/fixtures/calculated_columns/schools.csv: -------------------------------------------------------------------------------- 1 | UNITID,INSTNM,INT1,INT2,INT3,INT4 2 | 1,Big School,0,0,2,0 3 | 2,Small School,0,0,0,0 4 | 3,Middle School,0,1,1,0 5 | -------------------------------------------------------------------------------- /spec/fixtures/bom/data.yaml: -------------------------------------------------------------------------------- 1 | version: byte-order-mark 2 | index: test-data 3 | api: test 4 | dictionary: 5 | id: UNITID 6 | value: VAL 7 | 8 | files: 9 | - name: bom.csv 10 | -------------------------------------------------------------------------------- /spec/fixtures/types/places.csv: -------------------------------------------------------------------------------- 1 | id,state,name,lat,lon 2 | ca sf,CA,San Francisco,37.727239,-123.032229 3 | ny ny,NY,New York,40.664274,-73.938500 4 | la no,LA,New Orleans,30.068636,-89.939007 5 | -------------------------------------------------------------------------------- /.components: -------------------------------------------------------------------------------- 1 | --- 2 | :orm: none 3 | :test: rspec 4 | :mock: none 5 | :script: jquery 6 | :renderer: liquid 7 | :stylesheet: sass 8 | :namespace: OpenDataMaker 9 | :migration_format: number 10 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_errors/cities4.csv: -------------------------------------------------------------------------------- 1 | USPS,GEOID,ANSICODE,NAME,POP10 2 | NY,3651000,2395220,New York,8175133 3 | CA,644000,2410877,Los Angeles,3792621 4 | IL,1714000,428803,Chicago,2695598 5 | TX,4835000,2410796,Houston,2099451 -------------------------------------------------------------------------------- 
/spec/fixtures/import_with_options/cities4.csv: -------------------------------------------------------------------------------- 1 | USPS,GEOID,ANSICODE,NAME,POP10 2 | NY,3651000,2395220,New York,8175133 3 | CA,644000,2410877,Los Angeles,3792621 4 | IL,1714000,428803,Chicago,2695598 5 | TX,4835000,2410796,Houston,2099451 -------------------------------------------------------------------------------- /spec/fixtures/import_with_null_value/null_values.csv: -------------------------------------------------------------------------------- 1 | USPS,GEOID,ANSICODE,NAME,POP10 2 | NY,abc123,2395220,New York,8175133 3 | CA,644000,2410877,Los Angeles,3792621 4 | IL,1714000,428803,Chicago,2695598 5 | TX,4835000,2410796,Houston,2099451 6 | -------------------------------------------------------------------------------- /script/bomstrip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | mkdir -p new 3 | 4 | for filename in ./*.csv; do 5 | awk 'NR==1{sub(/^\xef\xbb\xbf/,"")}1' "$filename" > new/$filename 6 | done 7 | 8 | #find . -print0 -type f | awk 'NR==1{sub(/^\xef\xbb\xbf/,"")}1' {} > new/{} 9 | -------------------------------------------------------------------------------- /config.ru: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rackup 2 | # encoding: utf-8 3 | 4 | # This file can be used to start Padrino, 5 | # just execute it from the command line. 
6 | 7 | require File.expand_path("../config/boot.rb", __FILE__) 8 | 9 | run Padrino.application 10 | -------------------------------------------------------------------------------- /spec/fixtures/school_names/data.yaml: -------------------------------------------------------------------------------- 1 | version: 0 2 | index: name-data 3 | api: names 4 | dictionary: 5 | id: ID 6 | school.name: 7 | source: NAME 8 | type: autocomplete 9 | school.state: STATE 10 | 11 | files: 12 | - name: school_names.csv 13 | -------------------------------------------------------------------------------- /script/makeutf8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # convert to utf8 and strip Byte Order Mark (BOM) if present 3 | mkdir -p utf8 4 | 5 | for file in *.csv; do 6 | echo "$file" 7 | iconv -f ascii -t utf-8 "$file" | awk 'NR==1{sub(/^\xef\xbb\xbf/,"")}1' > "./utf8/${file%.txt}" 8 | done 9 | -------------------------------------------------------------------------------- /manifest-dev.yml: -------------------------------------------------------------------------------- 1 | --- 2 | applications: 3 | - name: ccapi-dev 4 | command: bundle exec puma -C ./config/puma.rb 5 | instances: 1 6 | memory: 2G 7 | services: 8 | - bservice 9 | - eservice 10 | env: 11 | MAX_THREADS: 5 12 | WEB_CONCURRENCY: 3 13 | -------------------------------------------------------------------------------- /manifest-staging.yml: -------------------------------------------------------------------------------- 1 | --- 2 | applications: 3 | - name: ccapi-staging 4 | command: bundle exec puma -C ./config/puma.rb 5 | instances: 3 6 | memory: 2G 7 | services: 8 | - bservice 9 | - eservice 10 | env: 11 | MAX_THREADS: 5 12 | WEB_CONCURRENCY: 3 13 | -------------------------------------------------------------------------------- /manifest-production.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 
applications: 3 | - name: ccapi-production 4 | command: bundle exec puma -C ./config/puma.rb 5 | instances: 3 6 | memory: 2G 7 | services: 8 | - bservice 9 | - eservice 10 | env: 11 | MAX_THREADS: 5 12 | WEB_CONCURRENCY: 3 13 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_null_value/data.yaml: -------------------------------------------------------------------------------- 1 | index: city-data 2 | api: cities 3 | unique: ['name'] 4 | null_value: 'abc123' 5 | options: 6 | columns: all 7 | 8 | dictionary: 9 | state: USPS 10 | population: POP10 11 | name: NAME 12 | 13 | files: 14 | - name: null_values.csv 15 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_errors/data.yaml: -------------------------------------------------------------------------------- 1 | version: fixture-type-error 2 | index: expect-errors 3 | api: nothing 4 | 5 | dictionary: 6 | state: USPS 7 | name: NAME 8 | population: 9 | source: POP10 10 | type: broken 11 | 12 | files: 13 | - name: cities4.csv 14 | add: 15 | year: 2010 16 | -------------------------------------------------------------------------------- /manifest-ex.yml: -------------------------------------------------------------------------------- 1 | --- 2 | applications: 3 | - name: ccapi-ex 4 | command: bundle exec puma -C ./config/puma.rb 5 | instances: 1 6 | memory: 2G 7 | services: 8 | - bservice 9 | - eservice 10 | env: 11 | MAX_THREADS: 5 12 | WEB_CONCURRENCY: 1 13 | INDEX_APP: enable 14 | NPROCS: 2 15 | -------------------------------------------------------------------------------- /config/puma.rb: -------------------------------------------------------------------------------- 1 | workers Integer(ENV['WEB_CONCURRENCY'] || 2) 2 | threads_count = Integer(ENV['MAX_THREADS'] || 5) 3 | threads threads_count, threads_count 4 | worker_timeout 30 5 | 6 | preload_app! 
7 | 8 | rackup DefaultRackup 9 | port ENV['PORT'] || 3000 10 | environment ENV['RACK_ENV'] || 'development' 11 | -------------------------------------------------------------------------------- /manifest-indexing.yml: -------------------------------------------------------------------------------- 1 | --- 2 | applications: 3 | - name: ccapi-indexing 4 | command: bundle exec puma -C ./config/puma.rb 5 | instances: 1 6 | memory: 2G 7 | services: 8 | - bservice 9 | - eservice 10 | env: 11 | MAX_THREADS: 5 12 | WEB_CONCURRENCY: 1 13 | INDEX_APP: enable 14 | NPROCS: 2 15 | -------------------------------------------------------------------------------- /app/views/layouts/application.erb: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | <%== yield %> 10 | 11 | 12 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | AllCops: 2 | Exclude: 3 | - 'bin/**/*' 4 | - 'db/**/*' 5 | Metrics/LineLength: 6 | Enabled: false 7 | Style/CommentAnnotation: 8 | Enabled: false 9 | Style/Documentation: 10 | Enabled: false 11 | Style/DotPosition: 12 | Enabled: false 13 | Style/RedundantSelf: 14 | Enabled: false 15 | Style/StringLiterals: 16 | Enabled: false 17 | -------------------------------------------------------------------------------- /spec/fixtures/school_names/school_names.csv: -------------------------------------------------------------------------------- 1 | ID,STATE,NAME 2 | 1,AL,Stillman College 3 | 2,NY,New York University 4 | 3,AZ,Arizona State University 5 | 4,CA,University of California-Berkeley 6 | 5,MA,Berklee College of Music 7 | 6,NY,Berk Trade and Business School 8 | 7,AZ,University of Phoenix-Online Campus 9 | 8,AZ,University of Phoenix-Phoenix Campus 10 | 9,AZ,Phoenix College 11 | -------------------------------------------------------------------------------- 
/spec/fixtures/geo/places.csv: -------------------------------------------------------------------------------- 1 | state,city,lat,lon 2 | CA,"San Francisco",37.727239,-123.032229 3 | NY,"New York",40.664274,-73.938500 4 | CA,"Los Angeles",34.019394,-118.410825 5 | IL,Chicago,41.837551,-87.681844 6 | TX,Houston,29.780472,-95.386342 7 | PA,Philadelphia,40.009376,-75.133346 8 | CA,"San Jose",37.296867,-121.819306 9 | MA,Boston,42.331960,-71.020173 10 | WA,Seattle,47.620499,-122.350876 11 | -------------------------------------------------------------------------------- /spec/fixtures/numeric_data/data.yaml: -------------------------------------------------------------------------------- 1 | # cities100.txt 2 | # Test YAML file 3 | index: numeric-data 4 | api: cities 5 | 6 | dictionary: 7 | name: 8 | source: name 9 | type: string 10 | address: 11 | source: address 12 | type: string 13 | city: 14 | source: city 15 | type: string 16 | age: 17 | source: age 18 | type: integer 19 | height: 20 | source: height 21 | type: float 22 | -------------------------------------------------------------------------------- /bin/open-data-maker: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | Dir.chdir(File.dirname(__FILE__)+'/..') 4 | 5 | # Start the app with Padrino::Server 6 | require 'rubygems' 7 | require 'bundler/setup' 8 | require 'padrino-core/cli/launcher' 9 | 10 | ARGV.unshift('start') if ARGV.first.nil? 
|| ARGV.first.start_with?('-') 11 | Padrino::Cli::Launcher.start ARGV 12 | 13 | # Start the app with Rack::Server 14 | #require "rack" 15 | #Rack::Server.start 16 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_options/data.yaml: -------------------------------------------------------------------------------- 1 | version: fixture-import-options 2 | index: city-data 3 | api: cities 4 | options: 5 | columns: all 6 | limit_files: 1 7 | limit_rows: 3 8 | 9 | dictionary: 10 | state: USPS 11 | name: NAME 12 | population: POP10 13 | 14 | files: 15 | - name: cities4.csv 16 | add: 17 | year: 2010 18 | - name: more_cities.csv # this shouldn't get imported 19 | add: 20 | year: 1000 21 | -------------------------------------------------------------------------------- /lib/expression/variables.rb: -------------------------------------------------------------------------------- 1 | require 'parslet' 2 | 3 | class Expression 4 | class Variables < Parslet::Transform 5 | rule(:var => simple(:var)) { 6 | [String(var)] 7 | } 8 | rule(:or => { :left => subtree(:left), :right => subtree(:right) }) do 9 | (left + right) 10 | end 11 | 12 | rule(:and => { :left => subtree(:left), :right => subtree(:right) }) do 13 | (left + right) 14 | end 15 | 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | log/**/* 3 | log/*.log 4 | tmp/**/* 5 | vendor/gems/* 6 | !vendor/gems/cache/ 7 | .sass-cache/* 8 | db/*.db 9 | .*.sw* 10 | .env 11 | .*.env 12 | .cfignore 13 | .vagrant 14 | .idea/ 15 | *profile* 16 | 17 | public/stylesheets/application.css* 18 | 19 | # expect people to put their own data in /data 20 | data 21 | 22 | # another commonly used data directory 23 | real-data 24 | 25 | # contains Google API tokens 26 | client_secret.json 27 | 
-------------------------------------------------------------------------------- /script/s3push: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -v 2 | 3 | require_relative 's3config.rb' 4 | 5 | @s3 = ::Aws::S3::Client.new 6 | 7 | dirname = 'real-data' 8 | bucket_name = ENV['s3_bucket'] 9 | datayamlpath = File.expand_path("../../#{dirname}/#{bucket_name}.yaml", __FILE__) 10 | 11 | puts "copying #{datayamlpath}" 12 | puts "to S3 #{bucket_name}" 13 | File.open(datayamlpath, 'r') do |file| 14 | @s3.put_object(bucket: bucket_name, key: 'data.yaml', body: file) 15 | end 16 | -------------------------------------------------------------------------------- /tasks/es.rake: -------------------------------------------------------------------------------- 1 | require 'data_magic' 2 | 3 | namespace :es do 4 | desc "delete elasticsearch index (_all for all)" 5 | task :delete, [:index_name] => :environment do |t, args| 6 | DataMagic.client.indices.delete(index: args[:index_name]) 7 | end 8 | 9 | desc "list elasticsearch indices" 10 | task :list => :environment do |t, args| 11 | result = DataMagic.client.indices.get(index: '_all').keys 12 | puts result.join("\n") 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /script/s3pull: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -v 2 | 3 | require_relative 's3config.rb' 4 | 5 | @s3 = ::Aws::S3::Client.new 6 | 7 | bucket = ENV['s3_bucket'] 8 | 9 | dirname = 'real-data' 10 | unless File.directory?(dirname) 11 | FileUtils.mkdir_p(dirname) 12 | end 13 | datayamlpath = File.expand_path("../../#{dirname}/#{bucket}.yaml", __FILE__) 14 | 15 | File.open(datayamlpath, 'w') do |file| 16 | response = @s3.get_object(bucket: bucket, key: 'data.yaml') 17 | file << response.body.read 18 | end 19 | -------------------------------------------------------------------------------- 
/app/index_app.rb: -------------------------------------------------------------------------------- 1 | require 'csv' 2 | 3 | module OpenDataMaker 4 | 5 | class IndexApp < Padrino::Application 6 | register SassInitializer 7 | register Padrino::Helpers 8 | 9 | enable :sessions 10 | 11 | get '/' do 12 | DataMagic.config.scoped_index_name 13 | end 14 | 15 | get '/init' do 16 | DataMagic.init(load_now: true) 17 | "ok" 18 | end 19 | 20 | get '/reindex' do 21 | DataMagic.reindex 22 | "reindexing..." 23 | end 24 | end 25 | 26 | end 27 | -------------------------------------------------------------------------------- /lib/data_magic/category.rb: -------------------------------------------------------------------------------- 1 | Category = Struct.new(:category_id) do 2 | def assemble 3 | category_entry = DataMagic.config.data['categories'][category_id] 4 | dictionary = DataMagic.config.dictionary 5 | field_details = {} 6 | category_entry['fields'].each do |field_name| 7 | field_details[field_name] = dictionary[field_name] || { "description"=>"" } 8 | end 9 | field_details = { "field_details" => field_details } 10 | assemble = category_entry.merge(field_details) 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /lib/expression/eval.rb: -------------------------------------------------------------------------------- 1 | 2 | class Expression 3 | class Eval < Parslet::Transform 4 | rule(:var => simple(:var)) { 5 | variables[String(var)] 6 | } 7 | 8 | # in Ruby 0 is 'truthy' but that's not what most people expect 9 | rule(:or => { :left => subtree(:left), :right => subtree(:right) }) do 10 | left == 0 ? right : (left or right) 11 | end 12 | 13 | rule(:and => { :left => subtree(:left), :right => subtree(:right) }) do 14 | left == 0 ? 
left : (left and right) 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/sass_initializer.rb: -------------------------------------------------------------------------------- 1 | module SassInitializer 2 | def self.registered(app) 3 | # Enables support for SASS template reloading in rack applications. 4 | # See http://nex-3.com/posts/88-sass-supports-rack for more details. 5 | # Store SASS files (by default) within 'app/stylesheets'. 6 | require 'sass/plugin/rack' 7 | Sass::Plugin.options[:template_location] = Padrino.root("app/stylesheets") 8 | Sass::Plugin.options[:css_location] = Padrino.root("public/stylesheets") 9 | app.use Sass::Plugin::Rack 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/data_magic/example.rb: -------------------------------------------------------------------------------- 1 | class Example < Hashie::Mash 2 | include Hashie::Extensions::Coercion 3 | include Hashie::Extensions::MergeInitializer 4 | coerce_key :name, String 5 | coerce_key :description, String 6 | coerce_key :params, String 7 | coerce_key :endpoint, String 8 | coerce_key :link, String 9 | def initialize(hash = {}) 10 | super 11 | # we want to use this in a liquid template 12 | # so all attributes needs to be plain data, not code 13 | self[:link] = "/v1/#{endpoint}?#{params}" if self[:link].nil? 
14 | end 15 | 16 | end 17 | -------------------------------------------------------------------------------- /spec/fixtures/geo/data.yaml: -------------------------------------------------------------------------------- 1 | 2 | 3 | # cities100.txt 4 | # National Places Gazetteer Files, from US Census 2010 5 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html 6 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t' source.txt) > result.txt 7 | # head -n 101 results.txt > cities100.txt 8 | # then convertes to csv and removed " city" from after each city name 9 | dictionary: 10 | city: city 11 | location.lat: lat 12 | location.lon: lon 13 | 14 | index: place-data 15 | api: places 16 | files: 17 | - name: places.csv 18 | -------------------------------------------------------------------------------- /lib/data_magic/index/builder_data.rb: -------------------------------------------------------------------------------- 1 | module DataMagic 2 | module Index 3 | class BuilderData 4 | attr_reader :data, :options 5 | 6 | def initialize(data, options) 7 | @options = options 8 | @data = data 9 | end 10 | 11 | def additional_fields 12 | options[:mapping] || {} 13 | end 14 | 15 | def new_field_names 16 | field_names = options[:fields] || {} 17 | field_names.merge(additional_fields) 18 | end 19 | 20 | def additional_data 21 | options[:add_data] 22 | end 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /spec/lib/expression/variables_spec.rb: -------------------------------------------------------------------------------- 1 | require 'expression/parser' 2 | require 'expression/variables' 3 | 4 | describe Expression::Variables do 5 | 6 | let(:parser) { Expression::Parser.new } 7 | let(:variables) { Expression::Variables.new } 8 | it "gets one variable name" do 9 | expect(variables.apply(parser.parse('one'))).to eq(['one']) 10 | end 11 | it "preserves case " do 12 | 
expect(variables.apply(parser.parse('ONe'))).to eq(['ONe']) 13 | end 14 | it "multiple variables" do 15 | expect(variables.apply(parser.parse('fox or cow or goat'))).to eq(%w[fox cow goat]) 16 | end 17 | 18 | end 19 | -------------------------------------------------------------------------------- /spec/spec.rake: -------------------------------------------------------------------------------- 1 | begin 2 | require 'rspec/core/rake_task' 3 | 4 | spec_tasks = Dir['spec/*/'].each_with_object([]) do |d, result| 5 | result << File.basename(d) unless Dir["#{d}*"].empty? 6 | end 7 | 8 | spec_tasks.each do |folder| 9 | desc "Run the spec suite in #{folder}" 10 | RSpec::Core::RakeTask.new("spec:#{folder}") do |t| 11 | t.pattern = "./spec/#{folder}/**/*_spec.rb" 12 | t.rspec_opts = "--color" 13 | end 14 | end 15 | 16 | desc "Run complete application spec suite" 17 | RSpec::Core::RakeTask.new(:spec) 18 | rescue LoadError 19 | puts "RSpec is not part of this bundle, skip specs." 20 | end 21 | -------------------------------------------------------------------------------- /config/env.rb: -------------------------------------------------------------------------------- 1 | # define core environment that we need in tests and for the app 2 | 3 | # Defines our constants 4 | ENV['RACK_ENV'] ||= 'development' 5 | RACK_ENV = ENV['RACK_ENV'] unless defined?(RACK_ENV) 6 | PADRINO_ROOT = File.expand_path('../..', __FILE__) unless defined?(PADRINO_ROOT) 7 | 8 | # Load our dependencies 9 | require 'rubygems' unless defined?(Gem) 10 | require 'bundler/setup' 11 | require 'newrelic_rpm' 12 | Bundler.require(:default, RACK_ENV) 13 | 14 | # do this early so we can log during startup 15 | require './lib/data_magic/config.rb' 16 | DataMagic::Config.logger=Logger.new(STDOUT) if ENV['VCAP_APPLICATION'] # Cloud Foundry 17 | -------------------------------------------------------------------------------- /spec/lib/data_magic/index/importer_spec.rb: 
-------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe "DataMagic::Index::Importer" do 5 | before do 6 | ENV['DATA_PATH'] = './spec/fixtures/minimal' 7 | DataMagic.init(load_now: false) 8 | end 9 | after do 10 | DataMagic.destroy 11 | end 12 | 13 | it "indexes in parallel based on NPROCS" do 14 | stub_const('ENV', { 'NPROCS' => '2' }) 15 | 16 | data_str = <<-eos 17 | a,b 18 | 1,2 19 | 3,4 20 | eos 21 | data = StringIO.new(data_str) 22 | num_rows, fields = DataMagic.import_csv(data) 23 | expect(num_rows).to be(2) 24 | expect(fields).to eq(['a', 'b']) 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /spec/lib/zipcode_spec.rb: -------------------------------------------------------------------------------- 1 | require 'zipcode/zipcode' 2 | 3 | describe Zipcode do 4 | it "gives a location based on zipcode" do 5 | location = Zipcode.latlon('94132') 6 | expect(location).to eq(lat: 37.7211, lon: -122.4754) 7 | end 8 | it "supports zipcode given as a number" do 9 | location = Zipcode.latlon(94132) 10 | expect(location).to eq(lat: 37.7211, lon: -122.4754) 11 | end 12 | 13 | describe '#valid' do 14 | it "returns true if the zipcode is valid" do 15 | expect(Zipcode.valid? 94132).to eq(true) 16 | end 17 | it "returns false if the zipcode is invalid" do 18 | expect(Zipcode.valid? 
00002).to eq(false) 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /spec/fixtures/types/data.yaml: -------------------------------------------------------------------------------- 1 | 2 | version: 0 3 | # cities100.txt 4 | # National Places Gazetteer Files, from US Census 2010 5 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html 6 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t' source.txt) > result.txt 7 | # head -n 101 results.txt > cities100.txt 8 | # then convertes to csv and removed " city" from after each city name 9 | dictionary: 10 | id: 11 | source: id 12 | type: literal 13 | city.name: 14 | source: name 15 | type: name 16 | city.state: state 17 | location.lat: lat 18 | location.lon: lon 19 | 20 | index: place-data 21 | api: places 22 | files: 23 | - name: places.csv 24 | -------------------------------------------------------------------------------- /tasks/import.rake: -------------------------------------------------------------------------------- 1 | require 'data_magic' 2 | require 'ruby-prof' 3 | 4 | desc "import files from DATA_PATH, rake import[profile=true] for profile output" 5 | task :import, [:profile] => :environment do |t, args| 6 | options = {} 7 | start_time = Time.now 8 | RubyProf.start if args[:profile] 9 | 10 | DataMagic.import_with_dictionary(options) 11 | 12 | if args[:profile] 13 | result = RubyProf.stop 14 | end_time = Time.now 15 | puts "indexing complete: #{distance_of_time_in_words(end_time, start_time)}" 16 | puts "duration: #{end_time - start_time}" 17 | 18 | printer = RubyProf::MultiPrinter.new(result); 19 | printer.print(path: ".", profile: "profile", min_percent: 2) 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /spec/fixtures/cities_with_yml/data.yml: -------------------------------------------------------------------------------- 1 | # cities100.txt 2 | # 
National Places Gazetteer Files, from US Census 2010 3 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html 4 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t' source.txt) > result.txt 5 | # head -n 101 results.txt > cities100.txt 6 | # then convertes to csv and removed " city" from after each city name 7 | version: fixture-import-all 8 | index: city-data 9 | api: cities 10 | global_mapping: 11 | USPS: state 12 | NAME: name 13 | POP10: population 14 | INTPTLAT: latitude 15 | INTPTLONG: longitude 16 | 17 | files: 18 | - name: cities50.csv 19 | add: 20 | category: 'top50' 21 | - name: cities51-100.csv 22 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_dictionary/data.yaml: -------------------------------------------------------------------------------- 1 | # cities100.txt 2 | # National Places Gazetteer Files, from US Census 2010 3 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html 4 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t' source.txt) > result.txt 5 | # head -n 101 results.txt > cities100.txt 6 | # then convertes to csv and removed " city" from after each city name 7 | version: fixture-import-all 8 | index: city-data 9 | api: cities 10 | dictionary: 11 | state: USPS 12 | name: NAME 13 | population: POP10 14 | latitude: INTPTLAT 15 | longitude: INTPTLONG 16 | 17 | files: 18 | - name: cities50.csv 19 | add: 20 | category: 'top50' 21 | - name: cities51-100.csv 22 | -------------------------------------------------------------------------------- /lib/expression/expression.rb: -------------------------------------------------------------------------------- 1 | require_relative 'parser' 2 | require_relative 'eval' 3 | require_relative 'variables' 4 | require 'hashie' 5 | 6 | class Expression 7 | attr_accessor :name # purely for reporting Errors 8 | attr_reader :variables 9 | 10 | def initialize(expr, name = 
'unknown') 11 | @tree = Parser.new.parse(expr) 12 | @variables = Variables.new.apply(@tree) 13 | end 14 | 15 | def evaluate(vars) 16 | Hashie.stringify_keys! vars 17 | Eval.new.apply(@tree, variables: vars) 18 | end 19 | 20 | def self.find_or_create(expr, name = 'unknown') 21 | @cached_expression ||= {} 22 | @cached_expression[expr] ||= Expression.new(expr, name) 23 | @cached_expression[expr] 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /spec/lib/data_magic/example_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Example do 4 | let(:hash) do 5 | { name: 'foo', 6 | description: 'interesting thing', 7 | params: 'a=1&b=something', 8 | endpoint: 'api' } 9 | end 10 | subject(:e) { Example.new(hash) } 11 | 12 | it "has a name" do 13 | expect(e.name).to eq(hash[:name]) 14 | end 15 | it "has a description" do 16 | expect(e.description).to eq(hash[:description]) 17 | end 18 | it "has a params" do 19 | expect(e.params).to eq(hash[:params]) 20 | end 21 | it "has an endpoint" do 22 | expect(e.endpoint).to eq(hash[:endpoint]) 23 | end 24 | 25 | it "has a link" do 26 | expect(e.link).to eq("/v1/#{e.endpoint}?#{e.params}") 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /spec/fixtures/nested_files/school2011.csv: -------------------------------------------------------------------------------- 1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6 2 | 1,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1025,4048,0.92 3 | 2,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,866,45556,0.34 4 | 3,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,453,4675,0.71 5 | 4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,572,15466,0.34 6 | 5,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1111,11266,0.86 7 | 
6,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,818,23357,0.58 8 | 7,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1392,32584,0.39 9 | 8,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,718,252,0.26 10 | 9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1297,36088,0.63 11 | 10,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,635,3259,0.70 12 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | cache_directories: 3 | - elasticsearch-1.7.1 4 | pre: 5 | - curl -v -L -o cf-cli_amd64.deb 'https://cli.run.pivotal.io/stable?release=debian64&source=github' 6 | - sudo dpkg -i cf-cli_amd64.deb 7 | - cf -v 8 | post: 9 | - if [[ ! -e elasticsearch-1.7.1 ]]; then wget https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch-1.7.1.tar.gz && tar -xvf elasticsearch-1.7.1.tar.gz; fi 10 | - elasticsearch-1.7.1/bin/elasticsearch: {background: true} 11 | 12 | test: 13 | post: 14 | - cf api https://api.cloud.gov 15 | - cf auth $CF_USER $CF_PASSWORD 16 | - cf target -o ed -s dev 17 | - cf a 18 | 19 | deployment: 20 | development: 21 | branch: dev 22 | commands: 23 | - cf push -f manifest-dev.yml 24 | -------------------------------------------------------------------------------- /lib/data_magic/index/event_logger.rb: -------------------------------------------------------------------------------- 1 | module DataMagic 2 | module Index 3 | class EventLogger 4 | def trigger(event, *args) 5 | self.send(event, *args) 6 | end 7 | 8 | ['debug', 'info', 'warn', 'error'].each do |level| 9 | class_eval <<-RUBY, __FILE__, __LINE__ + 1 10 | def #{level}(message, object=nil, limit=nil) 11 | logger.#{level}(full_message(message, object, limit)) 12 | end 13 | RUBY 14 | end 15 | 16 | def full_message(prefix, object, limit) 17 | return prefix unless object 18 | message = "#{prefix}: " 19 | if limit 20 | message << object.inspect[0..limit] 21 | else 22 | message << object.inspect 23 | 
end 24 | message 25 | end 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /spec/fixtures/nested_files/school2012.csv: -------------------------------------------------------------------------------- 1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6 2 | 1,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,461,35231,0.01 3 | 2,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,986,34095,0.71 4 | 3,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1094,42579,0.39 5 | 4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,854,37589,0.15 6 | 5,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,650,13611,0.04 7 | 6,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,797,36924,0.64 8 | 7,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,994,31799,0.60 9 | 8,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1420,30063,0.97 10 | 9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1292,42150,0.83 11 | 10,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,605,2608,0.92 12 | 11,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,2608,0.92 13 | -------------------------------------------------------------------------------- /spec/tasks/import_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'bundler/setup' 3 | require 'padrino-core/cli/rake' 4 | 5 | describe 'elastic search index management rake task' do 6 | before do 7 | PadrinoTasks.init 8 | DataMagic.init(load_now: true) 9 | end 10 | 11 | after do 12 | DataMagic.destroy 13 | end 14 | 15 | context "imports" do 16 | it "default sample-data" do 17 | ENV['DATA_PATH'] = nil 18 | expect { Rake::Task['import'].invoke }.not_to raise_exception 19 | end 20 | 21 | it "correct configuration" do 22 | dir_path = './spec/fixtures/import_with_dictionary' 23 | ENV['DATA_PATH'] = dir_path 24 | expect { Rake::Task['import'].invoke }.not_to raise_exception 25 | expect(DataMagic.config.api_endpoint_names).to eq(['cities']) 26 | 
end 27 | 28 | end 29 | 30 | end 31 | -------------------------------------------------------------------------------- /spec/fixtures/calculated_columns/data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: Aug6-2015-08-10-23:48-0600 3 | api: fakeschool 4 | index: fakeschool-data 5 | unique: 6 | - id 7 | options: 8 | limit_files: 1 9 | limit_rows: 100 10 | 11 | dictionary: 12 | id: 13 | source: UNITID 14 | type: integer 15 | description: Unit ID for institution 16 | school.name: 17 | source: INSTNM 18 | description: Institution name 19 | integer1: 20 | source: INT1 21 | type: integer 22 | integer2: 23 | source: INT2 24 | type: integer 25 | integer3: 26 | source: INT3 27 | type: integer 28 | integer4: 29 | source: INT4 30 | type: integer 31 | summarybool: 32 | calculate: INT1 or INT2 or INT3 or INT4 33 | type: boolean 34 | description: are any of the unparsed booleans true? 35 | 36 | files: 37 | - name: schools.csv 38 | -------------------------------------------------------------------------------- /spec/lib/data_magic/import_csv_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe "DataMagic #import_csv" do 5 | before do 6 | ENV['DATA_PATH'] = './spec/fixtures/minimal' 7 | DataMagic.init(load_now: false) 8 | end 9 | after do 10 | DataMagic.destroy 11 | #expect(DataMagic.client.indices.get(index: '_all')).to be_empty 12 | end 13 | 14 | it "throws errors for bad format" do 15 | data = StringIO.new("not csv format") 16 | expect{DataMagic.import_csv(data)}.to raise_error(DataMagic::InvalidData) 17 | end 18 | 19 | it "reads file and reports number of rows and headers" do 20 | data_str = <<-eos 21 | a,b 22 | 1,2 23 | 3,4 24 | eos 25 | data = StringIO.new(data_str) 26 | num_rows, fields = DataMagic.import_csv(data) 27 | expect(num_rows).to be(2) 28 | expect(fields).to eq(['a', 'b']) 29 | end 30 | 31 | end 32 | 
-------------------------------------------------------------------------------- /lib/data_magic/index/document.rb: -------------------------------------------------------------------------------- 1 | module DataMagic 2 | module Index 3 | class Document 4 | attr_reader :data, :id 5 | 6 | def initialize(data) 7 | @data = data 8 | @id = calculate_id 9 | end 10 | 11 | def remove_ids 12 | config.data['unique'].each { |key| data.delete key } 13 | end 14 | 15 | def headers 16 | data.keys.map(&:to_s) # does this only return top level fields? 17 | end 18 | 19 | def preview(n=500) 20 | data.inspect[0..n] 21 | end 22 | 23 | def id_empty? 24 | id && id.empty? 25 | end 26 | 27 | private 28 | 29 | def calculate_id 30 | return nil if config.data['unique'].length == 0 31 | config.data['unique'].map { |field| data[field] }.join(':') 32 | end 33 | 34 | def config 35 | DataMagic.config 36 | end 37 | end 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /app/app.rb: -------------------------------------------------------------------------------- 1 | require 'csv' 2 | 3 | module OpenDataMaker 4 | class App < Padrino::Application 5 | register SassInitializer 6 | register Padrino::Helpers 7 | 8 | # This app is stateless and session cookies prevent caching of API responses 9 | disable :sessions 10 | 11 | # This app has no sensitive bits and csrf protection requires sessions 12 | disable :protect_from_csrf 13 | 14 | if ENV['DATA_AUTH'] and not ENV['DATA_AUTH'].empty? 
15 | auth = ENV['DATA_AUTH'] 16 | authorized_user, authorized_pass = auth.split(',') 17 | use Rack::Auth::Basic, "Restricted Area" do |username, password| 18 | username == authorized_user and password == authorized_pass 19 | end 20 | end 21 | 22 | ## app setup 23 | if ENV['RACK_ENV'] == 'test' 24 | DataMagic.init(load_now: true) 25 | else 26 | DataMagic.init(load_now: false) # don't index data 27 | end 28 | 29 | end 30 | 31 | end 32 | -------------------------------------------------------------------------------- /lib/data_magic/index/super_client.rb: -------------------------------------------------------------------------------- 1 | require 'forwardable' 2 | 3 | module DataMagic 4 | module Index 5 | class SuperClient 6 | attr_reader :client, :options 7 | 8 | def initialize(client, options) 9 | @client = client 10 | @options = options 11 | end 12 | 13 | def create_index 14 | DataMagic.create_index unless config.index_exists? 15 | end 16 | 17 | def refresh_index 18 | client.indices.refresh index: index_name 19 | end 20 | 21 | def creating? 22 | options[:nest] == nil 23 | end 24 | 25 | def allow_skips? 
26 | options[:nest][:parent_missing] == 'skip' 27 | end 28 | 29 | def index_name 30 | config.scoped_index_name 31 | end 32 | 33 | def config 34 | DataMagic.config 35 | end 36 | 37 | extend Forwardable 38 | 39 | def_delegators :client, :index, :update 40 | end 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /config/boot.rb: -------------------------------------------------------------------------------- 1 | require_relative 'env.rb' 2 | 3 | ## 4 | # ## Enable devel logging 5 | # 6 | # Padrino::Logger::Config[:development][:log_level] = :devel 7 | # Padrino::Logger::Config[:development][:log_static] = true 8 | # 9 | # ## Configure your I18n 10 | # 11 | # I18n.default_locale = :en 12 | # I18n.enforce_available_locales = false 13 | # 14 | # ## Configure your HTML5 data helpers 15 | # 16 | # Padrino::Helpers::TagHelpers::DATA_ATTRIBUTES.push(:dialog) 17 | # text_field :foo, :dialog => true 18 | # Generates: 19 | # 20 | # ## Add helpers to mailer 21 | # 22 | # Mail::Message.class_eval do 23 | # include Padrino::Helpers::NumberHelpers 24 | # include Padrino::Helpers::TranslationHelpers 25 | # end 26 | 27 | ## 28 | # Add your before (RE)load hooks here 29 | # 30 | Padrino.before_load do 31 | end 32 | 33 | ## 34 | # Add your after (RE)load hooks here 35 | # 36 | Padrino.after_load do 37 | end 38 | 39 | Padrino.load! 
40 | -------------------------------------------------------------------------------- /spec/fixtures/nested_files/data.yaml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | api: school 3 | index: fake-nested 4 | unique: [id] 5 | 6 | dictionary: 7 | id: UNITID 8 | name: 9 | source: INSTNM 10 | type: literal 11 | city: CITY_MAIN 12 | state: STABBR_MAIN 13 | zipcode: ZIP_MAIN 14 | sat_average: SAT_AVG 15 | location.lat: LATITUDE_MAIN 16 | location.lon: LONGITUDE_MAIN 17 | 18 | earnings.6_yrs_after_entry.median: 19 | source: earn_2002_p10 20 | description: Median earnings of students 21 | type: integer 22 | 23 | earnings.6_yrs_after_entry.percent_gt_25k: 24 | source: gt_25k_2006_p6 25 | description: Share of students earning over $25,000/year 26 | type: float 27 | 28 | files: 29 | - name: school-data.csv 30 | only: [id, name, city, state] 31 | - name: school2013.csv 32 | nest: 33 | key: 2013 34 | contents: [earnings, sat_average] 35 | - name: school2012.csv 36 | nest: 37 | key: 2012 38 | contents: [earnings, sat_average] 39 | -------------------------------------------------------------------------------- /spec/fixtures/nested2/data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: Aug6-2015-08-10-23:48-0600 3 | api: fakeschool 4 | index: fakeschool-data 5 | unique: 6 | - id 7 | options: 8 | # columns: all 9 | limit_files: 1 10 | limit_rows: 100 11 | search: dictionary_only 12 | 13 | dictionary: 14 | id: 15 | source: UNITID 16 | type: integer 17 | description: Unit ID for institution 18 | ope8_id: 19 | source: OPEID 20 | type: integer 21 | description: 8-digit OPE ID for institution 22 | ope6_id: 23 | source: opeid6 24 | type: integer 25 | description: 6-digit OPE ID for institution 26 | school.name: 27 | source: INSTNM 28 | type: literal 29 | description: Institution name 30 | school.city: 31 | source: CITY_MAIN 32 | description: City 33 | school.state: 34 | 
source: STABBR_MAIN 35 | description: State postcode 36 | school.zip: 37 | source: ZIP_MAIN 38 | type: integer 39 | description: ZIP code 40 | 41 | files: 42 | - name: school2013.csv 43 | -------------------------------------------------------------------------------- /spec/lib/data_magic_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | require 'fixtures/data.rb' 4 | 5 | describe DataMagic do 6 | it "cleans up after itself" do 7 | DataMagic.init(load_now: true) 8 | DataMagic.destroy 9 | DataMagic.logger.info "just destroyed" 10 | #expect(DataMagic.client.indices.get(index: '_all')).to be_empty 11 | end 12 | 13 | describe '.es_field_types' do 14 | it 'returns the given fields with their specified type' do 15 | expect(described_class.es_field_types({ 'state' => 'string', land_area: 'string' })) 16 | .to eq("state" => { :type => "string" }, 17 | :land_area => { :type => "string" }) 18 | end 19 | 20 | context 'with custom type "literal"' do 21 | it 'returns string type with :index of "not_analyzed"' do 22 | expect(described_class.es_field_types({ 'state' => 'string', 'name' => 'literal' })) 23 | .to eq({"state"=>{:type=>"string"}, "name"=>{:type=>"string", :index=>"not_analyzed"}}) 24 | end 25 | end 26 | 27 | end 28 | 29 | end 30 | -------------------------------------------------------------------------------- /NOTES.md: -------------------------------------------------------------------------------- 1 | 2 | ## Data 3 | 4 | Details about the data are specified by DATA_PATH/data.yaml. 5 | Where DATA_PATH is an environment variable, which may be: 6 | 7 | * `s3://username:password@bucket_name/path` 8 | * `s3://bucket_name/path` 9 | * `s3://bucket_name` 10 | * a local path like: `./data` 11 | 12 | 13 | This file is loaded the first time it is needed and then stored in memory. 
The contents of `data.yaml` are stored as JSON in Elasticsearch in a single document of type `config` with id `1`. 14 | 15 | The version field of this document is checked at startup. If the new config has a new version, then we delete the whole index and re-index all of the files referred to in the `data.yaml` files section. 16 | 17 | If no data.yml or data.yaml file is found, then all CSV files in `DATA_PATH` will be loaded, and all fields in their headers will be used. 18 | 19 | ## Debugging 20 | 21 | `ES_DEBUG` environment variable will turn on verbose tracer in the Elasticsearch client 22 | 23 | optional performance profiling for rake import: `rake import[profile=true]` 24 | -------------------------------------------------------------------------------- /spec/fixtures/nested2/school2013.csv: -------------------------------------------------------------------------------- 1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6 2 | 1,Normal,AL,1,35762,5,34.7834,-86.5685,Reichert University,1195,26318,0.53 3 | 2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61 4 | 3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50 5 | 4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1836,0.09 6 | 5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82 7 | 6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens Institute,1057,49203,0.06 8 | 7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50 9 | 8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59 10 | 9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19 11 | 10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59 12 | 
-------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | ENV['DATA_PATH'] = nil 2 | ENV['RACK_ENV'] ||= 'test' 3 | RACK_ENV = ENV['RACK_ENV'] unless defined?(RACK_ENV) 4 | 5 | #require File.expand_path(File.dirname(__FILE__) + "/../config/boot") 6 | require_relative '../config/env.rb' 7 | Dir[File.expand_path(File.dirname(__FILE__) + "/../app/helpers/**/*.rb")].each(&method(:require)) 8 | 9 | RSpec.configure do |config| 10 | config.include Rack::Test::Methods 11 | 12 | config.before(:type => :feature) do 13 | # load the Padrino web app defined in app/app.rb 14 | require_relative '../config/boot' 15 | end 16 | config.before do 17 | ENV['DATA_PATH'] = nil 18 | end 19 | end 20 | 21 | # You can use this method to custom specify a Rack app 22 | # you want rack-test to invoke: 23 | # 24 | # app OpenDataMaker::App 25 | # app OpenDataMaker::App.tap { |a| } 26 | # app(OpenDataMaker::App) do 27 | # set :foo, :bar 28 | # end 29 | # 30 | def app(app = nil, &blk) 31 | @app ||= block_given? ? 
app.instance_eval(&blk) : app 32 | @app ||= Padrino.application 33 | end 34 | -------------------------------------------------------------------------------- /spec/fixtures/nested_files/school2013.csv: -------------------------------------------------------------------------------- 1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6 2 | 1,Normal,AL,1,35762,5,34.7834,-86.5685,Reichert University,1195,26318,0.53 3 | 2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61 4 | 3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50 5 | 4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1836,0.09 6 | 5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82 7 | 6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens Institute,1057,49203,0.06 8 | 7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50 9 | 8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59 10 | 9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19 11 | 10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59 12 | -------------------------------------------------------------------------------- /lib/zipcode/zipcode.rb: -------------------------------------------------------------------------------- 1 | # Zipcode latitude and longitude data in us_zipcodes.txt 2 | # provided by [GeoNames](http://www.geonames.org/) 3 | # under under a Creative Commons Attribution 3.0 License: 4 | # http://creativecommons.org/licenses/by/3.0/ 5 | 6 | # this code is in public domain (CC0 1.0) 7 | # https://github.com/18F/open-data-maker/blob/dev/LICENSE.md 8 | 9 | require 'csv' 10 | 11 | class Zipcode 12 | @@zipcode_hash = nil 13 | 14 | def Zipcode.latlon(zipcode) 15 | zipcode = 
zipcode.to_s 16 | @@zipcode_hash ||= converted_zipcodes 17 | @@zipcode_hash[zipcode] 18 | end 19 | 20 | def Zipcode.valid?(zipcode) 21 | !!self.latlon(zipcode) 22 | end 23 | 24 | private 25 | def self.converted_zipcodes 26 | parsed_file = CSV.read(File.expand_path("../us_zipcodes.txt", __FILE__), { :col_sep => "\t" }) 27 | zipcode_hash = {} 28 | parsed_file.each do |row| 29 | zipcode = row[1] 30 | lat = row[9].to_f 31 | lon = row[10].to_f 32 | zipcode_hash[zipcode] = {'lat': lat, 'lon': lon} 33 | end 34 | zipcode_hash 35 | end 36 | 37 | end 38 | -------------------------------------------------------------------------------- /script/s3config.rb: -------------------------------------------------------------------------------- 1 | # configure S3 with local credentials based on environment 2 | # usage (from ruby script or irb): 3 | # require 's3config.rb' 4 | # @s3 = ::Aws::S3::Client.new 5 | 6 | require 'dotenv' 7 | 8 | branch = `echo $(git symbolic-ref --short HEAD)`.chomp 9 | 10 | if ENV['APP_ENV'] 11 | APP_ENV = ENV['APP_ENV'] 12 | puts "using APP_ENV from environment #{APP_ENV}" 13 | else 14 | case branch 15 | when "master" 16 | APP_ENV = "production" 17 | when "staging" 18 | APP_ENV = "staging" 19 | else 20 | puts "not on master or staging branch lets use dev" 21 | APP_ENV = "dev" 22 | end 23 | end 24 | 25 | Dotenv.load( 26 | File.expand_path("../../.#{APP_ENV}.env", __FILE__), 27 | File.expand_path("../../.env", __FILE__)) 28 | 29 | require 'aws-sdk' 30 | puts "app env: #{APP_ENV}" 31 | puts "bucket name: #{ENV['s3_bucket']}" 32 | 33 | 34 | s3cred = {'access_key'=> ENV['s3_access_key'], 'secret_key' => ENV['s3_secret_key']} 35 | 36 | ::Aws.config[:credentials] = ::Aws::Credentials.new(s3cred['access_key'], s3cred['secret_key']) 37 | ::Aws.config[:region] = 'us-east-1' 38 | -------------------------------------------------------------------------------- /spec/fixtures/data.rb: -------------------------------------------------------------------------------- 1 | # 
Ages adjusted for Springfield residents to average to 42 2 | # Heights randomly set to generate a max of 142 3 | def address_data 4 | @address_data ||= StringIO.new <<-eos 5 | name,address,city,age,height 6 | Paul,15 Penny Lane,Liverpool,10,142 7 | Michelle,600 Pennsylvania Avenue,Washington,12,1 8 | Marilyn,1313 Mockingbird Lane,Springfield,14,2 9 | Sherlock,221B Baker Street,London,16,123 10 | Clark,66 Lois Lane,Smallville,18,141 11 | Bart,742 Evergreen Terrace,Springfield,70,142 12 | Paul,19 N Square,Boston,70,55.2 13 | Peter,66 Parker Lane,New York,74,11.5123 14 | eos 15 | @address_data.rewind 16 | @address_data 17 | end 18 | 19 | def geo_data 20 | @geo_data ||= StringIO.new <<-eos 21 | state,city,lat,lon 22 | CA,San Francisco,37.727239,-123.032229 23 | NY,"New York",40.664274,-73.938500 24 | CA,"Los Angeles",34.019394,-118.410825 25 | IL,Chicago,41.837551,-87.681844 26 | TX,Houston,29.780472,-95.386342 27 | PA,Philadelphia,40.009376,-75.133346 28 | CA,"San Jose",37.296867,-121.819306 29 | MA,Boston,42.331960,-71.020173 30 | WA,Seattle,47.620499,-122.350876 31 | eos 32 | @geo_data.rewind 33 | @geo_data 34 | end 35 | -------------------------------------------------------------------------------- /spec/fixtures/nested_files/school-data.csv: -------------------------------------------------------------------------------- 1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6 2 | 1,Normal,AL,1,35762,5,34.7834,-86.5685,Reichert University,1195,26318,0.53 3 | 2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61 4 | 3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50 5 | 4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1836,0.09 6 | 5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82 7 | 6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens 
Institute,1057,49203,0.06 8 | 7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50 9 | 8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59 10 | 9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19 11 | 10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59 12 | 11,Montgomery,NULL,1,36117,5,32.3643,-86.2957,Auburn University at Montgomery,940,49879,0.64 13 | -------------------------------------------------------------------------------- /spec/lib/expression/eval_spec.rb: -------------------------------------------------------------------------------- 1 | require 'expression/parser' 2 | require 'expression/eval' 3 | 4 | describe Expression::Eval do 5 | 6 | let(:parser) { Expression::Parser.new } 7 | let(:eval) { Expression::Eval.new } 8 | let(:values) {{ 'f' => 0, 't' => 1 }} 9 | 10 | it "simple 'or'" do 11 | expect( 12 | eval.apply(parser.parse('t or f'), variables: values) 13 | ).to eq(1) 14 | end 15 | 16 | describe "simple 'and'" do 17 | it "true and false" do 18 | expect( 19 | eval.apply(parser.parse('t and f'), variables: values) 20 | ).to eq(0) 21 | end 22 | 23 | it "false and true" do 24 | expect( 25 | eval.apply(parser.parse('f and t'), variables: values) 26 | ).to eq(0) 27 | end 28 | end 29 | 30 | it "multiple operands" do 31 | expect( 32 | eval.apply(parser.parse('f or f or t'), variables: values) 33 | ).to eq(1) 34 | end 35 | 36 | describe "parens" do 37 | it "nested 'or'" do 38 | expect( 39 | eval.apply(parser.parse('(f or t) and t'), variables: values) 40 | ).to eq(1) 41 | end 42 | 43 | it "nested 'and'" do 44 | expect( 45 | eval.apply(parser.parse('(f and t) or f'), variables: values) 46 | ).to eq(0) 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /spec/fixtures/schools/schools.csv: 
-------------------------------------------------------------------------------- 1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,PREDDEG,UGDS,MENONLY,WOMENONLY,C150_4_POOLED_SUPP,C150_L4_POOLED_SUPP,earn_2002_p10,gt_25k_2006_p6 2 | 1,Normal,AL,1,35762,5,34.7834,-86.5685,Indigo Peak School,639,1,183504,0,0,NULL,0.16,3800,0.61 3 | 2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Warm Thread Beauty College,1218,3,210739,0,0,0.62,NULL,13566,0.10 4 | 3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Arrogant Abyss University,613,1,116967,0,0,NULL,0,1177,0.84 5 | 4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Greasy Marsh Institute,590,1,81254,0,1,NULL,NULL,54146,0.49 6 | 5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Harber Institute of Technology,1355,1,256538,1,0,0,0.91,38553,0.32 7 | 6,Athens,AL,1,35611,5,34.8056,-86.9651,Unsightly Mountain School of Fine Art,1201,1,139899,0,0,NULL,0.87,55899,0.95 8 | 7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Auburn University College,740,3,165974,0,0,0.21,NULL,51608,0.73 9 | 8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Birmingham School,1084,1,224554,0,0,NULL,0.70,29545,0.67 10 | 9,Tanner,AL,1,35671,5,34.6543,-86.9491,Conn Institute of Technology,1171,4,87710,0,0,NULL,0.56,58307,0.63 11 | 10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Hollow Resonance Institute,1058,2,97265,0,0,NULL,0.59,17880,0.36 12 | -------------------------------------------------------------------------------- /spec/fixtures/schools/data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: Aug6-2015-08-10-23:48-0600 3 | api: fakeschool 4 | index: fakeschool-data 5 | unique: 6 | - id 7 | options: 8 | limit_files: 1 9 | limit_rows: 100 10 | 11 | dictionary: 12 | id: 13 | source: UNITID 14 | type: integer 15 | description: Unit ID for institution 16 | school.name: 17 | source: INSTNM 18 | description: Institution name 19 | school.city: 20 | 
source: CITY_MAIN 21 | description: City 22 | school.state: 23 | source: STABBR_MAIN 24 | description: State postcode 25 | school.zip: 26 | source: ZIP_MAIN 27 | type: integer 28 | description: ZIP code 29 | completion.rate.lt_four_year: 30 | source: C150_L4_POOLED_SUPP 31 | type: float 32 | description: 150% completion rate for less-than-four-year institutions, pooled in two-year rolling averages and suppressed for small n size 33 | completion.rate.four_year: 34 | source: C150_4_POOLED_SUPP 35 | type: float 36 | description: 150% completion rate for four-year institutions, pooled in two-year rolling averages and suppressed for small n size 37 | completion.rate.overall: 38 | calculate: C150_L4_POOLED_SUPP or C150_4_POOLED_SUPP 39 | type: float 40 | description: 150% completion rate for the institution, independent of degree 41 | 42 | files: 43 | - name: schools.csv 44 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | ruby '2.2.4' 3 | 4 | # Distribute your app as a gem 5 | # gemspec 6 | 7 | # Server requirements 8 | # gem 'thin' # or mongrel 9 | # gem 'trinidad', :platform => 'jruby' 10 | 11 | # Optional JSON codec (faster performance) 12 | # gem 'oj' 13 | 14 | # open-data-maker requirements 15 | gem 'elasticsearch' 16 | gem 'stretchy' 17 | gem 'hashie' 18 | gem 'cf-app-utils' 19 | #gem 'unicorn' 20 | gem 'puma' 21 | gem 'safe_yaml' 22 | gem 'aws-sdk', '~> 2' 23 | gem 'actionview' 24 | gem 'dotenv' 25 | gem 'oj' 26 | gem 'parslet' 27 | gem 'parallel' 28 | 29 | # Project requirements 30 | gem 'rake' 31 | 32 | # Component requirements 33 | gem 'sass' 34 | gem 'liquify' 35 | gem 'liquid', '= 3.0.3' 36 | gem 'erubis' 37 | 38 | # Test requirements 39 | group :test do 40 | gem 'rspec' 41 | gem 'rspec-mocks' 42 | gem 'rack-test', :require => 'rack/test' 43 | end 44 | 45 | group 'dev' do 46 | gem 'google_drive' 47 
| gem 'ruby-prof' 48 | 49 | end 50 | # Padrino Stable Gem 51 | gem 'padrino', '0.12.5' 52 | 53 | gem 'pry', :group => ['development', 'test'] 54 | gem 'pry-byebug', :group => ['development', 'test'] 55 | gem 'newrelic_rpm' 56 | 57 | # Or Padrino Edge 58 | # gem 'padrino', :github => 'padrino/padrino-framework' 59 | 60 | # Or Individual Gems 61 | # %w(core support gen helpers cache mailer admin).each do |g| 62 | # gem 'padrino-' + g, '0.12.5' 63 | # end 64 | -------------------------------------------------------------------------------- /lib/data_magic/index/output.rb: -------------------------------------------------------------------------------- 1 | module DataMagic 2 | module Index 3 | class Output 4 | attr_reader :row_count, :headers, :skipped 5 | 6 | def initialize 7 | @row_count = 0 8 | @skipped = [] 9 | end 10 | 11 | def set_headers(doc) 12 | return if headers 13 | @headers = doc.headers 14 | end 15 | 16 | def skipping(id) 17 | skipped << id 18 | end 19 | 20 | def increment(count = 1) 21 | @row_count += count 22 | end 23 | 24 | def validate! 25 | raise DataMagic::InvalidData, "zero rows" if empty? 26 | end 27 | 28 | def empty? 29 | row_count == 0 30 | end 31 | 32 | def log(doc) 33 | log_0(doc) if empty? 34 | log_marker if row_count % 500 == 0 35 | end 36 | 37 | def log_skips 38 | return if skipped.empty? 39 | logger.info "skipped (missing parent id): #{skipped.join(',')}" 40 | end 41 | 42 | def log_limit 43 | logger.info "done now, limiting rows to #{row_count}" 44 | end 45 | 46 | private 47 | 48 | def log_0(document) 49 | logger.debug "csv parsed" 50 | logger.info "row#{row_count} -> #{document.preview}" 51 | end 52 | 53 | def log_marker 54 | logger.info "indexing rows: #{row_count}..." 
55 | end 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /spec/features/web_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe 'app', type: 'feature' do 4 | before do 5 | DataMagic.destroy 6 | ENV['DATA_PATH'] = './spec/fixtures/sample-data' 7 | DataMagic.init(load_now: true) 8 | end 9 | 10 | after do 11 | DataMagic.destroy 12 | end 13 | 14 | it "should load the home page" do 15 | get '/' 16 | expect(last_response).to be_ok 17 | end 18 | 19 | it "should display links to endpoints" do 20 | get '/' 21 | expect(last_response.body).to include 'cities' 22 | end 23 | 24 | it "should display a list of categories" do 25 | get '/' 26 | expect(last_response.body).to include('Browse Data Details by Category') 27 | expect(last_response.body).to include('General') # category name 28 | expect(last_response.body).to include('general information about the city, including standard identifiers') 29 | end 30 | 31 | it "should load the correct category page" do 32 | get '/category/general' 33 | expect(last_response.body).to include('Data Details for the') 34 | expect(last_response.body).to include('category_entry = {"title":"General"') 35 | expect(last_response.body).to include('population') # a field name 36 | expect(last_response.body).to include('The name of the city') # a field description 37 | expect(last_response.body).to include('literal') # field type 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /script/bootstrap: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | fancy_echo() { 6 | local fmt="$1"; shift 7 | 8 | # shellcheck disable=SC2059 9 | printf "\n$fmt\n" "$@" 10 | } 11 | 12 | brew_install_or_upgrade() { 13 | if brew_is_installed "$1"; then 14 | if brew_is_upgradable "$1"; then 15 | fancy_echo 
"Upgrading %s ..." "$1" 16 | brew upgrade "$@" 17 | else 18 | fancy_echo "Already using the latest version of %s. Skipping ..." "$1" 19 | fi 20 | else 21 | fancy_echo "Installing %s ..." "$1" 22 | brew install "$@" 23 | fi 24 | } 25 | 26 | brew_is_installed() { 27 | brew list -1 | grep -Fqx "$1" 28 | } 29 | 30 | brew_is_upgradable() { 31 | ! brew outdated --quiet "$1" >/dev/null 32 | } 33 | 34 | brew_tap_is_installed() { 35 | brew tap | grep -Fqx "$1" 36 | } 37 | 38 | brew_tap() { 39 | if ! brew_tap_is_installed "$1"; then 40 | fancy_echo "Tapping $1..." 41 | brew tap "$1" 2> /dev/null 42 | fi 43 | } 44 | 45 | echo 'Installing dependencies...' 46 | 47 | if command -v brew >/dev/null; then 48 | brew update 49 | 50 | brew_tap 'homebrew/services' 51 | brew_tap 'homebrew/versions' 52 | brew_install_or_upgrade 'elasticsearch17' 53 | 54 | brew services restart elasticsearch17 55 | 56 | # elasticsearch takes several seconds to load 57 | sleep 10 58 | fi 59 | 60 | gem install bundler --conservative 61 | bundle check || bundle install 62 | 63 | echo "All done!" 64 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | As a work of the United States Government, this project is in the 2 | public domain within the United States. 3 | 4 | Additionally, we waive copyright and related rights in the work 5 | worldwide through the CC0 1.0 Universal public domain dedication. 6 | 7 | ## CC0 1.0 Universal Summary 8 | 9 | This is a human-readable summary of the 10 | [Legal Code (read the full text)](https://creativecommons.org/publicdomain/zero/1.0/legalcode). 11 | 12 | ### No Copyright 13 | 14 | The person who associated a work with this deed has dedicated the work to 15 | the public domain by waiving all of his or her rights to the work worldwide 16 | under copyright law, including all related and neighboring rights, to the 17 | extent allowed by law. 
18 | 19 | You can copy, modify, distribute and perform the work, even for commercial 20 | purposes, all without asking permission. 21 | 22 | ### Other Information 23 | 24 | In no way are the patent or trademark rights of any person affected by CC0, 25 | nor are the rights that other persons may have in the work or in how the 26 | work is used, such as publicity or privacy rights. 27 | 28 | Unless expressly stated otherwise, the person who associated a work with 29 | this deed makes no warranties about the work, and disclaims liability for 30 | all uses of the work, to the fullest extent permitted by applicable law. 31 | When using or citing the work, you should not imply endorsement by the 32 | author or the affirmer. 33 | 34 | -------------------------------------------------------------------------------- /DICTIONARY.md: -------------------------------------------------------------------------------- 1 | # Dictionary Format 2 | 3 | The data dictionary format may be (optionally) specified in the `data.yaml` file. If unspecified, all columns are imported as strings. 4 | 5 | ## Simple Data Types 6 | 7 | ``` 8 | dictionary: 9 | name: 10 | source: COLUMN_NAME 11 | type: integer 12 | description: explanation of where this data comes from and its meaning 13 | ``` 14 | 15 | In the above example: 16 | * `source:` is the name of the column in the csv. (This doesn't have to be all caps, we just find that to be common in government datasets.) 17 | * `type:` may be `integer`, `float`, `string` 18 | * `description:` text description suitable for developer documentation or information provided to data analysts 19 | 20 | ## Calculated columns 21 | 22 | Optionally, you can add "columns" by calculating fields at import based on multiple csv columns. 23 | 24 | ``` 25 | academics.program.degree.health: 26 | calculate: CIP51ASSOC or CIP51BACHL 27 | type: integer 28 | description: Associate or Bachelor's degree in Health 29 | ``` 30 | 31 | Multiple operations are supported. 
In the following example, if the columns `apples`, `oranges` and `plums` had a `0` value when there were none, and a `1` to represent if they were available, then these values could be combined with `or` to create a data field representing if any were true. 32 | 33 | ``` 34 | fruit: 35 | calculate: apples or oranges or plums 36 | type: integer 37 | description: is there any fruit available? 38 | ``` 39 | -------------------------------------------------------------------------------- /spec/lib/data_magic/name_type_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe "DataMagic name types" do 5 | 6 | before :example do 7 | DataMagic.destroy 8 | ENV['DATA_PATH'] = './spec/fixtures/types' 9 | DataMagic.init(load_now: true) 10 | end 11 | after :example do 12 | DataMagic.destroy 13 | end 14 | 15 | it "can search for one word" do 16 | response = DataMagic.search({'city.name' => 'New'}, fields:['city.name']) 17 | results = response['results'].sort {|a,b| a['city.name'] <=> b['city.name']} 18 | expect(results).to eq( 19 | [{"city.name"=>"New Orleans"}, {"city.name"=>"New York"}]) 20 | end 21 | 22 | it "can search for multiple words" do 23 | response = DataMagic.search({'city.name' => 'New York'}, fields:['city.name']) 24 | results = response['results'] 25 | expect(results).to eq( 26 | [{"city.name"=>"New York"}]) 27 | end 28 | 29 | it "can search for partial words" do 30 | response = DataMagic.search({'city.name' => 'S Fran'}, fields:['city.name']) 31 | results = response['results'] 32 | expect(results).to eq( 33 | [{"city.name"=>"San Francisco"}]) 34 | end 35 | 36 | it "is not case sensitive" do 37 | response = DataMagic.search({'city.name' => 'nEW'}, fields:['city.name']) 38 | results = response['results'].sort {|a,b| a['city.name'] <=> b['city.name']} 39 | expect(results).to eq( 40 | [{"city.name"=>"New Orleans"}, {"city.name"=>"New York"}]) 41 | end 42 | end 43 | 
-------------------------------------------------------------------------------- /spec/lib/expression/parser_spec.rb: -------------------------------------------------------------------------------- 1 | require 'expression/parser' 2 | 3 | describe Expression::Parser do 4 | 5 | let(:parser) { Expression::Parser.new } 6 | describe 'vars' do 7 | it "parses one" do 8 | expect(parser.parse('one')).to eq(var: 'one') 9 | end 10 | it "preserves case " do 11 | expect(parser.parse('ONe')).to eq(var: 'ONe') 12 | end 13 | it "consumes trailing white space" do 14 | expect(parser.parse('one ')).to eq(var: 'one') 15 | end 16 | end 17 | 18 | it "parses or expression" do 19 | expect(parser.parse('apples or oranges')).to eq( 20 | {or: {left: {var: "apples"}, right: {var: "oranges"}}} 21 | ) 22 | end 23 | 24 | it "parses and expression" do 25 | expect(parser.parse('apples and oranges')).to eq( 26 | {and: {left: {var: "apples"}, right: {var: "oranges"}}} 27 | ) 28 | end 29 | 30 | describe "parens" do 31 | it "nested 'or'" do 32 | expect(parser.parse('(apples or cranberries) and nuts')).to eq( 33 | {:and => { 34 | :left=>{:or=>{:left=>{:var=>"apples"}, :right=>{:var=>"cranberries"}}}, 35 | :right=>{:var=>"nuts"}}} 36 | ) 37 | end 38 | it "nested 'and'" do 39 | expect(parser.parse('(nuts and cranberries) or apples')).to eq( 40 | { or: { 41 | left: { and: { left: {var: "nuts"}, right: {var:"cranberries"}}}, 42 | right: { var: "apples" } 43 | } 44 | } 45 | ) 46 | end 47 | 48 | end 49 | 50 | end 51 | -------------------------------------------------------------------------------- /lib/expression/parser.rb: -------------------------------------------------------------------------------- 1 | require 'parslet' 2 | # based on https://github.com/kschiess/parslet/blob/master/example/boolean_algebra.rb 3 | # usage: 4 | # def parse(str) 5 | # ExpressionParser.new.parse(str) 6 | # 7 | # rescue Parslet::ParseFailed => failure 8 | # puts failure.cause.ascii_tree 9 | # end 10 | # 11 | # tree = 
ExpressionParser.new.parse("one or two") 12 | # => {:or=>{:left=>{:var=>"one"@0}, :right=>{:var=>"two"@7}}} 13 | # Eval.new.apply(tree, variables: {"one"=>1, "two"=>2}) 14 | # 15 | # Variables.new.apply(tree) 16 | 17 | class Expression 18 | class Parser < Parslet::Parser 19 | rule(:space) { match[" "].repeat(1) } 20 | rule(:space?) { space.maybe } 21 | 22 | rule(:lparen) { str("(") >> space? } 23 | rule(:rparen) { str(")") >> space? } 24 | 25 | rule(:and_operator) { str("and") >> space? } 26 | rule(:or_operator) { str("or") >> space? } 27 | 28 | rule(:var) { match["[^\s\(\)]"].repeat(1).as(:var) >> space? } 29 | 30 | # The primary rule deals with parentheses. 31 | rule(:primary) { lparen >> or_operation >> rparen | var } 32 | 33 | # Note that following rules are both right-recursive. 34 | rule(:and_operation) { 35 | (primary.as(:left) >> and_operator >> 36 | and_operation.as(:right)).as(:and) | 37 | primary } 38 | 39 | rule(:or_operation) { 40 | (and_operation.as(:left) >> or_operator >> 41 | or_operation.as(:right)).as(:or) | 42 | and_operation } 43 | 44 | # We start at the lowest precedence rule. 
45 | root(:or_operation) 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /spec/lib/data_magic/index/event_logger_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe DataMagic::Index::EventLogger do 5 | let(:event_logger) { 6 | l = DataMagic::Index::EventLogger.new 7 | allow(l).to receive(:logger).and_return(logger) 8 | l 9 | } 10 | 11 | let(:logger) { double('logger') } 12 | 13 | context 'when triggering an event with only a message argument' do 14 | it 'logs the message with the right level' do 15 | expect(logger).to receive(:info).with('hey!') 16 | event_logger.trigger('info', 'hey!') 17 | 18 | expect(logger).to receive(:debug).with('what happened?') 19 | event_logger.trigger('debug', 'what happened?') 20 | 21 | expect(logger).to receive(:warn).with('dude? everything ok?') 22 | event_logger.trigger('warn', 'dude? everything ok?') 23 | 24 | expect(logger).to receive(:error).with('FIRE IN THE HOLE!') 25 | event_logger.trigger('error', 'FIRE IN THE HOLE!') 26 | end 27 | end 28 | 29 | context 'when triggering an event with a message and an object' do 30 | it 'logs as a key value pair with an inspection of the object' do 31 | expect(logger).to receive(:info).with("foo: {:wild=>\"bar\"}") 32 | event_logger.trigger('info', 'foo', {wild: 'bar'}) 33 | end 34 | 35 | it 'will shorten the object inspection when provided a limit' do 36 | expect(logger).to receive(:warn).with("foo: {:wild") 37 | event_logger.trigger('warn', 'foo', {wild: 'bar'}, 5) 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /lib/data_magic/index/repository.rb: -------------------------------------------------------------------------------- 1 | module DataMagic 2 | module Index 3 | class Repository 4 | attr_reader :client, :document 5 | 6 | def initialize(client, document) 7 | 
@client = client 8 | @document = document 9 | end 10 | 11 | def save 12 | @skipped = false 13 | if client.creating? 14 | create 15 | else 16 | update 17 | end 18 | end 19 | 20 | def skipped? 21 | @skipped 22 | end 23 | 24 | def save 25 | if client.creating? 26 | create 27 | else 28 | update 29 | end 30 | end 31 | 32 | private 33 | 34 | def update 35 | if client.allow_skips? 36 | update_with_rescue 37 | else 38 | update_without_rescue 39 | end 40 | end 41 | 42 | def create 43 | client.index({ 44 | index: client.index_name, 45 | id: document.id, 46 | type: 'document', 47 | body: document.data 48 | }) 49 | end 50 | 51 | def update_without_rescue 52 | client.update({ 53 | index: client.index_name, 54 | id: document.id, 55 | type: 'document', 56 | body: {doc: document.data} 57 | }) 58 | end 59 | 60 | def update_with_rescue 61 | update_without_rescue 62 | rescue Elasticsearch::Transport::Transport::Errors::NotFound 63 | @skipped = true 64 | end 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/nested_hash.rb: -------------------------------------------------------------------------------- 1 | class NestedHash < Hash 2 | 3 | def initialize(hash = {}, default = nil, &block) 4 | default ? super(default) : super(&block) 5 | self.add(hash) 6 | end 7 | 8 | def add(hash) 9 | hash.each do |full_name, value| 10 | parts = full_name.to_s.split('.') 11 | last = parts.length - 1 12 | add_to = self 13 | parts.each_with_index do |name, index| 14 | if index == last 15 | add_to[name] = value 16 | else 17 | add_to[name] ||= {} 18 | add_to = add_to[name] 19 | end 20 | end 21 | end 22 | self 23 | end 24 | 25 | # generate a flat, non-nested hash 26 | # with keys that have dots representing the hierarchy 27 | def withdotkeys(deep_hash = self, flat_hash = {}, root = '') 28 | deep_hash.each do |key, value| 29 | if deep_hash[key].is_a?(Hash) 30 | flat_hash.merge! 
withdotkeys(value, flat_hash, key + '.') 31 | else 32 | key = "#{root}#{key}" if not root.empty? 33 | flat_hash[key] = value 34 | end 35 | end 36 | flat_hash 37 | end 38 | 39 | # generate a list of the keys with dots representing the hierarchy 40 | def dotkeys(row = self, prefix = '', path = []) 41 | human_names = [] 42 | paths = [] 43 | row.keys.each do |key| 44 | if row[key].is_a?(Hash) 45 | new_human_names = dotkeys(row[key], key + '.') 46 | human_names += new_human_names 47 | else 48 | human_names << prefix + key 49 | end 50 | end 51 | human_names 52 | end 53 | 54 | end 55 | -------------------------------------------------------------------------------- /app/views/home.liquid: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 |
5 |

{{ title }}

6 |
7 |
8 |
9 |
10 | 11 |

API endpoints

12 | 13 | 18 | 19 | {% if examples.size > 0 %} 20 |

Examples

21 | 22 | 27 | {% endif %} 28 | 29 |

Browse Data Details by Category

30 |
31 |
32 |
33 | 34 | 53 | -------------------------------------------------------------------------------- /spec/lib/data_magic/calculated_columns_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe "calculated columns" do 5 | 6 | before :example do 7 | DataMagic.destroy 8 | ENV['DATA_PATH'] = data_path 9 | end 10 | after :example do 11 | DataMagic.destroy 12 | end 13 | 14 | describe "combine into float" do 15 | let(:data_path) { "./spec/fixtures/schools" } 16 | it "can combine two columns" do 17 | DataMagic.config = DataMagic::Config.new 18 | DataMagic.import_with_dictionary 19 | result = DataMagic.search({}, fields: ['id', 'completion.rate.overall']) 20 | results = result['results'].sort_by { |hash| hash['id'] } 21 | expect(results[0]).to eq('id' => 1, 'completion.rate.overall' => 0.16) 22 | expect(results[1]).to eq('id' => 2, 'completion.rate.overall' => 0.62) 23 | expect(results[2]).to eq('id' => 3, 'completion.rate.overall' => nil) 24 | expect(results[3]).to eq('id' => 4, 'completion.rate.overall' => nil) 25 | expect(results[4]).to eq('id' => 5, 'completion.rate.overall' => 0.91) 26 | end 27 | end 28 | 29 | describe "combine into boolean" do 30 | let(:data_path) { "./spec/fixtures/calculated_columns" } 31 | it "can combine multiple columns" do 32 | DataMagic.config = DataMagic::Config.new 33 | DataMagic.import_with_dictionary 34 | result = DataMagic.search({}, fields: %w(id summarybool)) 35 | results = result['results'].sort_by { |hash| hash['id'] } 36 | expect(results[0]).to eq('id' => 1, 'summarybool' => true) 37 | expect(results[1]).to eq('id' => 2, 'summarybool' => false) 38 | expect(results[2]).to eq('id' => 3, 'summarybool' => true) 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /spec/lib/expression_spec.rb: -------------------------------------------------------------------------------- 1 | 
require 'expression/expression' 2 | 3 | describe Expression do 4 | context "simple or expression" do 5 | it "can find variables" do 6 | expr = "ONE or TWO" 7 | expect(Expression.new(expr).variables).to eq(%w(ONE TWO)) 8 | end 9 | 10 | it "evaluates: 0 OR 1 to be 1" do 11 | expr = "f or t" 12 | values = {f:0, t:1} 13 | expect(Expression.new(expr).evaluate(values)).to eq(1) 14 | end 15 | 16 | it "evaluates: 1 OR 0 to be 1" do 17 | expr = "t or f" 18 | values = {f:0, t:1} 19 | expect(Expression.new(expr).evaluate(values)).to eq(1) 20 | end 21 | 22 | it "evaluates: 0 OR 0 to be 0" do 23 | expr = "f1 or f2" 24 | values = {f1:0, f2:0} 25 | expect(Expression.new(expr).evaluate(values)).to eq(0) 26 | end 27 | 28 | it "evaluates: 1 OR 1 to be 1" do 29 | expr = "t1 or t2" 30 | values = {t1:1, t2:1} 31 | expect(Expression.new(expr).evaluate(values)).to eq(1) 32 | end 33 | 34 | it "evaluates: 1 OR nil to be 1" do 35 | expr = "t1 or t2" 36 | values = {t1:1, t2:nil} 37 | expect(Expression.new(expr).evaluate(values)).to eq(1) 38 | end 39 | 40 | it "evaluates: 0 OR nil to be nil" do 41 | expr = "t1 or t2" 42 | values = {t1:0, t2:nil} 43 | expect(Expression.new(expr).evaluate(values)).to eq(nil) 44 | end 45 | 46 | it "evaluates: nil OR 0 to be 0" do 47 | expr = "t1 or t2" 48 | values = {t1:nil, t2:0} 49 | expect(Expression.new(expr).evaluate(values)).to eq(0) 50 | end 51 | 52 | it "evaluates: nil OR nil to be nil" do 53 | expr = "t1 or t2" 54 | values = {t1:nil, t2:nil} 55 | expect(Expression.new(expr).evaluate(values)).to eq(nil) 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /spec/lib/data_magic/create_index_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe "DataMagic #init" do 5 | before (:all) do 6 | ENV['DATA_PATH'] = './spec/fixtures/import_with_dictionary' 7 | end 8 | 9 | after(:each) do 10 | 
DataMagic.destroy 11 | end 12 | 13 | context "with no options" do 14 | it "creates index only once" do 15 | expect(DataMagic).to receive(:create_index).once 16 | DataMagic.init 17 | end 18 | 19 | it "creates index" do 20 | DataMagic.init 21 | expect(DataMagic.config.index_exists?).to be true 22 | end 23 | 24 | it "does not re-create index with subsequent call to #import_with_dictionary" do 25 | expect(DataMagic).to receive(:create_index).once 26 | DataMagic.init 27 | DataMagic.import_with_dictionary 28 | end 29 | end 30 | 31 | 32 | context "with load_now: false" do 33 | it "does not call #create_index" do 34 | expect(DataMagic).not_to receive(:create_index) 35 | DataMagic.init(load_now: false) 36 | end 37 | 38 | it "does not create index" do 39 | DataMagic.init(load_now: false) 40 | expect(DataMagic.config.index_exists?).to be false 41 | end 42 | 43 | it "creates index with subsequent call to #import_with_dictionary" do 44 | DataMagic.init(load_now: false) 45 | DataMagic.import_with_dictionary 46 | expect(DataMagic.config.index_exists?).to be true 47 | end 48 | 49 | it "creates index with subsequent call to #import_csv" do 50 | ENV['DATA_PATH'] = './spec/fixtures/minimal' 51 | DataMagic.init(load_now: false) 52 | data_str = <<-eos 53 | a,b 54 | 1,2 55 | 3,4 56 | eos 57 | data = StringIO.new(data_str) 58 | DataMagic.import_csv(data) 59 | expect(DataMagic.config.index_exists?).to be true 60 | end 61 | end 62 | end -------------------------------------------------------------------------------- /spec/lib/nested_hash_spec.rb: -------------------------------------------------------------------------------- 1 | require 'nested_hash' 2 | 3 | describe NestedHash do 4 | let(:input) { {"loc.x" => 1, "loc.y" => 2, "foo.a" => 10, "foo.b" => 20, "loc.z" => 3}} 5 | let(:expected) {{"loc" => {"x" => 1, "y" => 2, "z" => 3}, "foo" => {"a" => 10, "b" => 20}}} 6 | 7 | let(:symbol_keys) { {x:1, y:2}} 8 | let(:symbol_keys_result) { {'x' => 1, 'y' => 2}} 9 | 10 | 11 | it ".add created 
nested hash elements for string keys with '.'" do 12 | result = NestedHash.new.add(input) 13 | expect(result).to eq(expected) 14 | end 15 | 16 | it "does no harm when initialized with an already nested hash" do 17 | expect(NestedHash.new(expected)).to eq(expected) 18 | end 19 | 20 | context "methods" do 21 | let (:result) { NestedHash.new(input) } 22 | it "can initialize with another Hash" do 23 | expect(result).to eq(expected) 24 | end 25 | 26 | it "can generate dotkeys" do 27 | expect(result.dotkeys.sort).to eq(input.keys.sort) 28 | end 29 | 30 | it "withdotkeys generates keys with '.'" do 31 | expect(result.withdotkeys).to eq(input) 32 | end 33 | 34 | it "dotkeys and withdotkeys have same order" do 35 | expect(result.withdotkeys.keys).to eq(result.dotkeys) 36 | end 37 | end 38 | 39 | 40 | it "turns symbol keys into simple strings" do 41 | result = NestedHash.new.add(symbol_keys) 42 | expect(result).to eq(symbol_keys_result) 43 | end 44 | 45 | context "deeply nested" do 46 | let(:input) { {"info.loc.x" => 0.11, "info.loc.y" => 0.222, "foo.a" => 10, "foo.b" => 20}} 47 | let(:expected) { {"info" => {"loc" => {"x" => 0.11, "y" => 0.222}}, "foo" => {"a" => 10, "b" => 20}}} 48 | 49 | it "creates nested hash elements for string keys with '.'" do 50 | result = NestedHash.new.add(input) 51 | expect(result).to eq(expected) 52 | end 53 | 54 | end 55 | 56 | end 57 | -------------------------------------------------------------------------------- /app/views/category.liquid: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 |
5 |

{{ title }}

6 |
7 |
8 |
9 |
10 | 11 |

Data Details for the Category

12 | 13 |
14 |
15 |
16 |
17 | 18 |
19 | Back to the list of Categories 20 | 21 | 55 | -------------------------------------------------------------------------------- /config/apps.rb: -------------------------------------------------------------------------------- 1 | ## 2 | # This file mounts each app in the Padrino project to a specified sub-uri. 3 | # You can mount additional applications using any of these commands below: 4 | # 5 | # Padrino.mount('blog').to('/blog') 6 | # Padrino.mount('blog', :app_class => 'BlogApp').to('/blog') 7 | # Padrino.mount('blog', :app_file => 'path/to/blog/app.rb').to('/blog') 8 | # 9 | # You can also map apps to a specified host: 10 | # 11 | # Padrino.mount('Admin').host('admin.example.org') 12 | # Padrino.mount('WebSite').host(/.*\.?example.org/) 13 | # Padrino.mount('Foo').to('/foo').host('bar.example.org') 14 | # 15 | # Note 1: Mounted apps (by default) should be placed into the project root at '/app_name'. 16 | # Note 2: If you use the host matching remember to respect the order of the rules. 17 | # 18 | # By default, this file mounts the primary app which was generated with this project. 19 | # However, the mounted app can be modified as needed: 20 | # 21 | # Padrino.mount('AppName', :app_file => 'path/to/file', :app_class => 'BlogApp').to('/') 22 | # 23 | 24 | ## 25 | # Setup global project settings for your apps. These settings are inherited by every subapp. You can 26 | # override these settings in the subapps as needed. 
27 | # 28 | Padrino.configure_apps do 29 | # enable :sessions 30 | set :session_secret, 'ffb8bfc2d71e2ad938950169de2757ab7b73b1cd5fbf91b4b912ae493dc5b70f' 31 | set :protection, :except => :path_traversal 32 | set :protect_from_csrf, true 33 | 34 | set :allow_origin, :any 35 | 36 | end 37 | 38 | # If needed, mount the app that does indexing 39 | if ENV['INDEX_APP'] == "enable" 40 | puts "mounting index app" 41 | Padrino.mount('OpenDataMaker::IndexApp', :app_file => Padrino.root('app/index_app.rb')).to('/index') 42 | end 43 | 44 | # Mounts the core application for this project 45 | Padrino.mount('OpenDataMaker::App', :app_file => Padrino.root('app/app.rb')).to('/') 46 | -------------------------------------------------------------------------------- /lib/data_magic/index/row_importer.rb: -------------------------------------------------------------------------------- 1 | require 'forwardable' 2 | 3 | module DataMagic 4 | module Index 5 | class RowImporter 6 | attr_reader :row, :importer 7 | 8 | def initialize(row, importer) 9 | @row = row 10 | @importer = importer 11 | end 12 | 13 | def process 14 | log_row_start 15 | before_save 16 | save 17 | after_save 18 | log_row_end 19 | end 20 | 21 | def document 22 | @document ||= DocumentBuilder.create(row, importer.builder_data, config) 23 | end 24 | 25 | def repository 26 | @repository ||= Repository.new(importer.client, document) 27 | end 28 | 29 | private 30 | 31 | def log_row_start 32 | trigger("debug", "csv parsed") if importer.empty? 33 | trigger("info", "row #{importer.row_count}", document, 500) if importer.row_count % 500 == 0 34 | #trigger("info", "id", document.id) 35 | if document.id_empty? 
36 | trigger("warn", "blank id") 37 | trigger("warn", "unique", config.data["unique"]) 38 | trigger("warn", "in row", document, 255) 39 | end 40 | end 41 | 42 | def before_save 43 | importer.set_headers(document) 44 | end 45 | 46 | def save 47 | repository.save 48 | end 49 | 50 | def after_save 51 | importer.skipping(document.id) if repository.skipped? 52 | importer.increment 53 | end 54 | 55 | def log_row_end 56 | return if !importer.at_limit? 57 | trigger("info", "done now, limiting rows to #{importer.row_count}") 58 | end 59 | 60 | def config 61 | DataMagic.config 62 | end 63 | 64 | extend Forwardable 65 | 66 | def_delegators :importer, :trigger 67 | 68 | def self.process(*args) 69 | new(*args).process 70 | end 71 | end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /spec/lib/data_magic/index/document_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe DataMagic::Index::Document do 5 | before do 6 | allow(DataMagic).to receive(:config).and_return(config) 7 | end 8 | 9 | let(:document) { DataMagic::Index::Document.new(data) } 10 | let(:config) { DataMagic::Config.new() } 11 | let(:data) { {} } 12 | 13 | context 'when configured without any unique keys' do 14 | before do 15 | config.data['unique'] = [] 16 | end 17 | 18 | it 'id should be nil' do 19 | expect(document.id).to be(nil) 20 | end 21 | 22 | it 'id should not be empty though' do 23 | expect(document.id_empty?).to be_falsey 24 | end 25 | end 26 | 27 | context 'when configured with the default keys' do 28 | context 'and there is no data' do 29 | it 'id should be an empty string' do 30 | expect(document.id).to eq('') 31 | end 32 | 33 | it 'id should be considered empty' do 34 | expect(document.id_empty?).to be_truthy 35 | end 36 | end 37 | 38 | context 'when there is data' do 39 | let(:data) { 40 | {"name" => "foo", "state"=>"MA"} 41 | } 42 | 43 | 
it 'id should be the value for the name key' do 44 | expect(document.id).to eq('foo') 45 | end 46 | 47 | it 'id should not be considered empty' do 48 | expect(document.id_empty?).to be_falsey 49 | end 50 | end 51 | end 52 | 53 | context 'with custom id configuration' do 54 | let(:data) { 55 | {"name" => "foo", "state"=>"MA"} 56 | } 57 | 58 | before do 59 | config.data['unique'] = ['name', 'state'] 60 | end 61 | 62 | it 'id should build the right id for the data' do 63 | expect(document.id).to eq('foo:MA') 64 | end 65 | 66 | it 'id should not be considered empty' do 67 | expect(document.id_empty?).to be_falsey 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /spec/lib/data_magic/import_with_nested_files_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe "unique key(s)" do 5 | 6 | before :example do 7 | DataMagic.destroy 8 | ENV['DATA_PATH'] = './spec/fixtures/nested_files' 9 | DataMagic.config = DataMagic::Config.new 10 | DataMagic.import_with_dictionary 11 | end 12 | after :example do 13 | DataMagic.destroy 14 | end 15 | let(:query) { {} } 16 | let(:sort) { nil } 17 | let(:result) { DataMagic.search(query, sort: sort) } 18 | let(:first) { result['results'].first } 19 | let(:id_one) { result['results'].find { |item| item['id'] == '1' } } 20 | let(:total) { result['metadata']['total'] } 21 | 22 | it "creates one document per unique id" do 23 | expect(total).to eq(11) 24 | end 25 | 26 | it "nests documents per unique id" do 27 | expect(id_one['id']).to eq('1') 28 | expect(id_one['2013']).to_not be_nil 29 | end 30 | 31 | it "root document contains special 'only' fields" do 32 | expect(id_one['id']).to eq('1') 33 | expect(id_one['name']).to eq('Reichert University') 34 | expect(id_one['city']).to eq('Normal') 35 | expect(id_one['state']).to eq('AL') 36 | end 37 | 38 | context "can import a subset of 
fields" do 39 | context "and when searching for a field value" do 40 | let(:query) { {zipcode: "35762"} } 41 | it "and doesn't find column" do 42 | expect(total).to eq(0) 43 | end 44 | end 45 | it "and doesn't include extra field" do 46 | expect(first['zipcode']).to be(nil) 47 | end 48 | end 49 | 50 | context "when searching on a nested field" do 51 | let(:query) { { '2013.earnings.6_yrs_after_entry.median' => 26318 } } 52 | it "can find the correct results" do 53 | expect(total).to eq(1) 54 | expect(first['2013']['earnings']['6_yrs_after_entry']).to eq({"percent_gt_25k"=>0.53, "median"=>26318}) 55 | end 56 | end 57 | 58 | context "when sorting by a nested field" do 59 | let(:sort) { '2013.earnings.6_yrs_after_entry.median' } 60 | it "can find the right first result" do 61 | expect(total).to eq(11) 62 | expect(first['2013']['earnings']['6_yrs_after_entry']).to eq({"percent_gt_25k"=>0.09, "median"=>1836}) 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /app/stylesheets/application.sass: -------------------------------------------------------------------------------- 1 | body 2 | -webkit-font-smoothing: antialiased 3 | font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif 4 | font-size: 1em 5 | line-height: 1.5 6 | color: #333 7 | 8 | h1, h2, h3, h4, h5, h6 9 | font-family: "Raleway", "Helvetica Neue", Helvetica, Arial, sans-serif 10 | line-height: 1.1em 11 | margin: 0 12 | text-rendering: optimizeLegibility 13 | 14 | p 15 | margin: 0 0 0.75em 16 | 17 | hr 18 | border-bottom: 1px solid silver 19 | border-left: none 20 | border-right: none 21 | border-top: none 22 | margin: 1em 0 23 | 24 | img 25 | -webkit-user-select: none 26 | cursor: zoom-in 27 | margin: 0 28 | max-width: 50% 29 | 30 | .logo 31 | height: 150px 32 | width: 150px 33 | top: 50px 34 | left: 50px 35 | z-index: 20 36 | 37 | @media screen and (max-width: 995px) 38 | .logo 39 | height: 100px 40 | width: 100px 41 | top: 40px 
42 | left: 20px 43 | 44 | @media screen and (max-width: 785px) 45 | .logo 46 | height: 75px 47 | width: 75px 48 | 49 | @media screen and (max-width: 590px) 50 | .logo 51 | top: 73px 52 | 53 | @media screen and (max-width: 480px) 54 | .logo 55 | top: 16px 56 | left: 0px 57 | 58 | .bottom-margin 59 | margin-bottom: 0.5em 60 | color: #c00 61 | 62 | .title 63 | text-align: center 64 | font-family: "Raleway", "Helvetica Neue", Helvetica, Arial, sans-serif 65 | font-size: 2em 66 | line-height: 2em 67 | 68 | .header 69 | background-color: #9cf 70 | 71 | .categories .category 72 | margin: 5px 73 | padding: 15px 74 | border: solid 1px silver 75 | word-wrap: break-word 76 | display: inline-block 77 | width: 92% 78 | background-color: #ffc 79 | a 80 | color: black 81 | text-decoration: none 82 | &:visited 83 | color: black 84 | 85 | .categories__column 86 | display: inline-block 87 | width: 100% 88 | vertical-align: top 89 | -webkit-column-count: 2 90 | -moz-column-count: 2 91 | column-count: 2 92 | column-gap: .2em 93 | -webkit-column-gap: .2em 94 | -moz-column-gap: .2em 95 | 96 | .category__name 97 | font-size: 18px 98 | font-weight: bold 99 | margin-bottom: 5px 100 | color: #c00 101 | 102 | .category__fields 103 | list-style: none 104 | padding: 0 105 | 106 | .category__field-name 107 | font-size: 15px 108 | font-weight: bold 109 | margin-bottom: 2px 110 | color: #c00 111 | width: 80% 112 | 113 | .category__field-type 114 | font-size: 15px 115 | font-weight: bold 116 | color: #c00 117 | width: 10% 118 | float: right 119 | -------------------------------------------------------------------------------- /spec/lib/data_magic/index/repository_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe DataMagic::Index::Repository do 5 | let(:repository) { DataMagic::Index::Repository.new(super_client, document) } 6 | 7 | let(:super_client) { double('super client', index_name: 
'index') } 8 | let(:document) { double('document', {id: 'id', data: 'data'}) } 9 | 10 | context 'when super client is creating' do 11 | before do 12 | allow(super_client).to receive(:creating?).and_return(true) 13 | allow(super_client).to receive(:index) 14 | end 15 | 16 | it '#save creates an index' do 17 | expect(super_client).to receive(:index).with({ 18 | index: 'index', 19 | id: 'id', 20 | type: 'document', 21 | body: 'data' 22 | }) 23 | repository.save 24 | end 25 | 26 | it '#save will not be skipped when successful' do 27 | repository.save 28 | expect(repository.skipped?).to be_falsey 29 | end 30 | end 31 | 32 | context 'when super client is not creating' do 33 | before do 34 | allow(super_client).to receive(:creating?).and_return(false) 35 | allow(super_client).to receive(:allow_skips?) 36 | allow(super_client).to receive(:update) 37 | end 38 | 39 | it '#save updates an index' do 40 | expect(super_client).to receive(:update).with({ 41 | index: 'index', 42 | id: 'id', 43 | type: 'document', 44 | body: {doc: 'data'} 45 | }) 46 | repository.save 47 | end 48 | 49 | it '#save will not be skipped when successful' do 50 | repository.save 51 | expect(repository.skipped?).to be_falsey 52 | end 53 | end 54 | 55 | context 'when super client is not creating, not skipping and an error is raised' do 56 | before do 57 | allow(super_client).to receive(:creating?).and_return(false) 58 | allow(super_client).to receive(:allow_skips?).and_return(false) 59 | end 60 | 61 | it '#save raises an error' do 62 | allow(super_client).to receive(:update).and_raise(Elasticsearch::Transport::Transport::Errors::NotFound) 63 | expect { 64 | repository.save 65 | }.to raise_error(Elasticsearch::Transport::Transport::Errors::NotFound) 66 | end 67 | end 68 | 69 | context 'when super client is not creating, skipping and an error is raised' do 70 | before do 71 | allow(super_client).to receive(:creating?).and_return(false) 72 | allow(super_client).to receive(:allow_skips?).and_return(true) 73 | 
end 74 | 75 | it '#save marks the repository as skipped' do 76 | allow(super_client).to receive(:update).and_raise(Elasticsearch::Transport::Transport::Errors::NotFound) 77 | expect { 78 | repository.save 79 | }.not_to raise_error 80 | expect(repository.skipped?).to eq(true) 81 | end 82 | end 83 | end 84 | -------------------------------------------------------------------------------- /lib/data_magic/index.rb: -------------------------------------------------------------------------------- 1 | require 'forwardable' 2 | 3 | require_relative 'config' 4 | require_relative 'index/builder_data' 5 | require_relative 'index/event_logger' 6 | require_relative 'index/document' 7 | require_relative 'index/document_builder' 8 | require_relative 'index/importer' 9 | require_relative 'index/output' 10 | require_relative 'index/repository' 11 | require_relative 'index/row_importer' 12 | require_relative 'index/super_client' 13 | 14 | require 'action_view' # for distance_of_time_in_words (logging time) 15 | include ActionView::Helpers::DateHelper # for distance_of_time_in_words (logging time) 16 | 17 | module DataMagic 18 | # data could be a String or an io stream 19 | def self.import_csv(data, options={}) 20 | Index::Importer.process(data, options) 21 | end 22 | 23 | # pre-condition: index is already created w/ config 24 | def self.index_with_dictionary(options = {}) 25 | start_time = Time.now 26 | Config.logger.debug "--- index_with_dictionary, starting at #{start_time}" 27 | 28 | logger.info "files: #{self.config.files}" 29 | config.files.each_with_index do |filepath, index| 30 | fname = filepath.split('/').last 31 | logger.debug "indexing #{fname} #{index} file config:#{config.additional_data_for_file(index).inspect}" 32 | options[:add_data] = config.additional_data_for_file(index) 33 | options[:only] = config.info_for_file(index, :only) 34 | options[:nest] = config.info_for_file(index, :nest) 35 | begin 36 | logger.debug "*"*40 37 | logger.debug "* #{filepath}" 38 | 
logger.debug "*"*40 39 | file_start = Time.now 40 | data = config.read_path(filepath) 41 | rows, _ = DataMagic.import_csv(data, options) 42 | file_end = Time.now 43 | logger.debug "imported #{rows} rows in #{distance_of_time_in_words(file_end, file_start)}, ms: #{file_end - file_start}" 44 | rescue DataMagic::InvalidData => e 45 | Config.logger.debug "Error: skipping #{filepath}, #{e.message}" 46 | end 47 | end 48 | end_time = Time.now 49 | logger.debug "indexing complete: #{distance_of_time_in_words(end_time, start_time)}" 50 | logger.debug "duration: #{end_time - start_time}" 51 | end 52 | 53 | def self.import_with_dictionary(options = {}) 54 | #logger.debug("field_mapping: #{field_mapping.inspect}") 55 | options[:mapping] = config.field_mapping 56 | options = options.merge(config.options) 57 | 58 | es_index_name = self.config.load_datayaml(options[:data_path]) 59 | unless config.index_exists?(es_index_name) 60 | logger.info "creating #{es_index_name}" # TO DO: fix #14 61 | create_index es_index_name, config.field_types 62 | end 63 | 64 | index_with_dictionary(options) 65 | 66 | end # import_with_dictionary 67 | 68 | private 69 | def self.valid_types 70 | %w[integer float string literal name autocomplete boolean] 71 | end 72 | 73 | end # module DataMagic 74 | -------------------------------------------------------------------------------- /spec/lib/data_magic/import_without_data_yaml_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | 4 | describe "DataMagic #import_without_data_yaml" do 5 | describe "without ALLOW_MISSING_YML" do 6 | it "not found locally raises error" do 7 | ENV['DATA_PATH'] = './spec/fixtures/cities_without_yml' 8 | expect { 9 | DataMagic.init(load_now: true) 10 | }.to raise_error(IOError, "No data.y?ml found at ./spec/fixtures/cities_without_yml. 
Did you mean to define ALLOW_MISSING_YML environment variable?") 11 | end 12 | it "not found on s3 raises error" do 13 | ENV['DATA_PATH'] = 's3://mybucket' 14 | fake_s3 = Aws::S3::Client.new(stub_responses: true) 15 | fake_s3.stub_responses(:get_object, Aws::S3::Errors::NoSuchKey.new(Seahorse::Client::RequestContext, 'Fake Error')) 16 | expect { 17 | config = DataMagic::Config.new(s3: fake_s3) 18 | }.to raise_error(IOError, "No data.y?ml found at s3://mybucket. Did you mean to define ALLOW_MISSING_YML environment variable?") 19 | end 20 | 21 | end 22 | describe "with ALLOW_MISSING_YML" do 23 | let (:expected) do 24 | { 25 | "metadata" => { 26 | "total" => 1, 27 | "page" => 0, 28 | "per_page" => DataMagic::DEFAULT_PAGE_SIZE 29 | }, 30 | "results" => [] 31 | } 32 | end 33 | 34 | before(:all) do 35 | DataMagic.destroy 36 | ENV['ALLOW_MISSING_YML'] = 'allow' 37 | ENV['DATA_PATH'] = './spec/fixtures/cities_without_yml' 38 | DataMagic.init(load_now: true) 39 | end 40 | after(:all) do 41 | DataMagic.destroy 42 | ENV['ALLOW_MISSING_YML'] = '' 43 | end 44 | 45 | it "can get list of imported csv files" do 46 | file_list = [ 47 | "./spec/fixtures/cities_without_yml/cities50.csv", 48 | "./spec/fixtures/cities_without_yml/cities51-100.csv", 49 | "./spec/fixtures/cities_without_yml/more.csv", 50 | ] 51 | expect(DataMagic.config.files.sort).to eq(file_list) 52 | end 53 | 54 | it "can get index name from api endpoint" do 55 | expect(DataMagic.config.find_index_for('cities-without-yml')).to eq('cities-without-yml') 56 | end 57 | 58 | it "indexes files with yaml mapping" do 59 | result = DataMagic.search({NAME: "Chicago"}, api: 'cities-without-yml') 60 | expected["results"] = [ 61 | { 62 | "USPS"=>"IL", 63 | "GEOID"=>"1714000", 64 | "ANSICODE"=>"00428803", 65 | "NAME"=>"Chicago", 66 | "LSAD"=>"25", 67 | "FUNCSTAT"=>"A", 68 | "POP10"=>"2695598", 69 | "HU10"=>"1194337", 70 | "ALAND"=>"589571105", 71 | "AWATER"=>"16781658", 72 | "ALAND_SQMI"=>"227.635", 73 | "AWATER_SQMI"=>"6.479", 74 
| "INTPTLAT"=>"41.837551", 75 | "INTPTLONG"=>"-87.681844", 76 | } 77 | ] 78 | expect(result).to eq(expected) 79 | end 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /spec/lib/data_magic/config_field_types_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe 'DataMagic::Config #field_types' do 4 | let(:config) { DataMagic::Config.new(load_datayaml: false) } 5 | 6 | it "returns empty if dictionary is empty" do 7 | allow(config).to receive(:file_config).and_return([{'name' => 'one.csv'}]) 8 | allow(config).to receive(:dictionary).and_return({}) 9 | expect(config.field_types).to eq({}) 10 | end 11 | 12 | context "when no type is given" do 13 | before do 14 | allow(config).to receive(:file_config).and_return([{'name' => 'one.csv'}]) 15 | allow(config).to receive(:dictionary).and_return({ 16 | 'name' => {source:'NAME_COLUMN'} 17 | }) 18 | end 19 | 20 | it "defaults to string" do 21 | expect(config.field_types).to eq({ 22 | 'name' => 'string' 23 | }) 24 | end 25 | end 26 | 27 | it "supports integers" do 28 | allow(config).to receive(:file_config).and_return([{'name' => 'one.csv'}]) 29 | allow(config).to receive(:dictionary).and_return( 30 | IndifferentHash.new count: 31 | {source:'COUNT_COLUMN', type: 'integer'} 32 | ) 33 | expect(config.field_types).to eq({'count' => 'integer'}) 34 | end 35 | 36 | context "with float type" do 37 | it "sets float mapping" do 38 | allow(config).to receive(:file_config).and_return([{'name' => 'one.csv'}]) 39 | allow(config).to receive(:dictionary).and_return( 40 | IndifferentHash.new percent: 41 | {source:'PERCENT_COLUMN', type: 'float'} 42 | ) 43 | expect(config.field_types).to eq({'percent' => 'float'}) 44 | end 45 | 46 | it "can be excluded" do 47 | allow(config).to receive(:dictionary).and_return( 48 | IndifferentHash.new id: {source:'ID', type: 'integer'}, 49 | percent: {source:'PERCENT', type: 
'float'} 50 | ) 51 | allow(config).to receive(:file_config).and_return([ 52 | IndifferentHash.new({ name:'one.csv', only: ['id'] }) 53 | ]) 54 | expect(config.field_types).to eq({'id' => 'integer'}) 55 | end 56 | 57 | it "can be nested" do 58 | allow(config).to receive(:dictionary).and_return( 59 | IndifferentHash.new id: {source:'ID', type: 'integer'}, 60 | percent: {source:'PERCENT', type: 'float'} 61 | ) 62 | allow(config).to receive(:file_config).and_return([ 63 | IndifferentHash.new({name:'one.csv', 64 | only: ['id']}), 65 | IndifferentHash.new({name:'two.csv', 66 | nest: {key: '2012', contents: ['percent']}}) 67 | ]) 68 | expect(config.field_types).to eq({ 69 | 'id' => 'integer', 70 | '2012.percent' => 'float' 71 | }) 72 | end 73 | end 74 | 75 | it "supports special case for location fields as nil" do 76 | # special case for location in create_index 77 | allow(config).to receive(:dictionary).and_return( 78 | IndifferentHash.new 'location.lat': {source:'LAT_COLUMN'}, 79 | 'location.lon': {source:'LON_COLUMN'} 80 | 81 | ) 82 | expect(config.field_types).to eq({}) 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /spec/fixtures/sample-data/data.yaml: -------------------------------------------------------------------------------- 1 | version: cities100-2010 2 | # cities100.txt 3 | # National Places Gazetteer Files, from US Census 2010 4 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html 5 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t' source.txt) > result.txt 6 | # head -n 101 results.txt > cities100.txt 7 | # then convertes to csv and removed " city" from after each city name 8 | index: city-data 9 | api: cities 10 | unique: ['name'] 11 | options: 12 | search: dictionary_only 13 | 14 | dictionary: 15 | id: 16 | source: GEOID 17 | description: > 18 | Geographic Identifier - fully concatenated geographic code (State FIPS and 19 | County FIPS). 
The Census Bureau and other state and federal agencies are 20 | responsible for assigning geographic identifiers, or GEOIDs, to geographic 21 | entities to facilitate the organization, presentation, and 22 | exchange of geographic and statistical data. GEOIDs are numeric codes that 23 | uniquely identify all administrative/legal and statistical geographic areas for 24 | which the Census Bureau tabulates data. From Alaska, the largest state, 25 | to the smallest census block in New York City, every geographic area 26 | has a unique GEOID. Data users rely on GEOIDs to join the appropriate 27 | demographic data from censuses and surveys, such as the 28 | American Community Survey (ACS), to various levels of geography for data 29 | analysis, interpretation and mapping. 30 | code: 31 | source: ANSICODE 32 | description: > 33 | American National Standards Institute codes (ANSI codes) 34 | are standardized numeric or alphabetic codes issued by the American 35 | National Standards Institute (ANSI) to ensure uniform identification of 36 | geographic entities through all federal government agencies. 37 | name: 38 | source: NAME 39 | description: The name of the city 40 | type: literal 41 | state: 42 | source: USPS 43 | description: Two letter state abbreviation 44 | population: 45 | source: POP10 46 | description: City population from 2010 Census data 47 | type: integer 48 | location.lat: INTPTLAT 49 | location.lon: INTPTLONG 50 | area.land: 51 | description: Land Area (square miles) 52 | source: ALAND_SQMI 53 | type: float 54 | area.water: 55 | description: Water Area (square miles) 56 | source: AWATER_SQMI 57 | type: float 58 | 59 | categories: 60 | general: 61 | title: General 62 | description: > 63 | general information about the city, including standard 64 | identifiers and actual census summary data about the population of the city. 
65 | fields: [id, code, name, state, population] 66 | geographic: 67 | title: Geographic 68 | description: > 69 | Geographic characteristics of the area. These are created for 70 | statistical purposes only. Depiction and designation for statistical 71 | purposes does not constitute a determination of jurisdictional authority 72 | or rights of ownership or entitlement. 73 | fields: [location, area.land, area.water] 74 | 75 | files: 76 | - name: cities100.csv 77 | -------------------------------------------------------------------------------- /notes.txt: -------------------------------------------------------------------------------- 1 | commit eabfb903751cc5b7bc9ae0affeb15ad020e1d783 2 | Merge: 0b017e5 a5c5a18 3 | Author: Yoz Grahame 4 | Date: Tue Sep 8 17:53:20 2015 -0700 5 | 6 | Merge pull request #198 from 18F/source-false 7 | 8 | Use `_source: false` to limit JSON coming back from ES 9 | 10 | commit a5c5a18381214d3a51ab54152b463727bb3f79bc 11 | Author: Sarah Allen 12 | Date: Tue Sep 8 17:41:21 2015 -0700 13 | 14 | exclude fields starting with _ 15 | 16 | when the whole source is returned 17 | when we’re not specifying fields 18 | we need to explicitly exclude _names 19 | 20 | commit 94bef492dca538f3e27f20824712f47793f8ab8c 21 | Author: Yoz (Jeremy) Grahame 22 | Date: Tue Sep 8 17:05:28 2015 -0700 23 | 24 | Also record "took" MS value from ES result 25 | 26 | commit 2205dce984c52a50382030d37edeefbcd4316873 27 | Merge: df7f98b 0b017e5 28 | Author: Yoz (Jeremy) Grahame 29 | Date: Tue Sep 8 17:02:09 2015 -0700 30 | 31 | Merge branch 'dev' into source-false 32 | 33 | commit df7f98b216163eb1f27a038a535817ac95aea742 34 | Author: Yoz (Jeremy) Grahame 35 | Date: Tue Sep 8 17:01:20 2015 -0700 36 | 37 | Use `_source: false` for proper field exclusion 38 | 39 | also the "oj" gem for faster JSON 40 | 41 | commit 0b017e56e3a6ed7c9549214a52ed6c2b568c4746 42 | Merge: 9761906 e4f6dfd 43 | Author: Sarah Allen 44 | Date: Tue Sep 8 17:00:48 2015 -0700 45 | 46 | Merge pull request 
#197 from 18F/log-query-time 47 | 48 | Log ES query time, and show with "debug" option 49 | 50 | commit e4f6dfd219b514a3d4523d836ad083005848186c 51 | Author: Yoz (Jeremy) Grahame 52 | Date: Tue Sep 8 16:58:55 2015 -0700 53 | 54 | No, THAT's how you test a hash value 55 | 56 | commit 97619061da0badde70890ed1f26912c0355c5245 57 | Merge: 9c1e04e 63ead03 58 | Author: Sarah Allen 59 | Date: Tue Sep 8 16:25:19 2015 -0700 60 | 61 | Merge pull request #196 from 18F/max-page-size 62 | 63 | Only allow up to MAX_PAGE_SIZE per page 64 | 65 | commit 63ead033b40c619ac143065d36be3a4427458d68 66 | Author: Yoz (Jeremy) Grahame 67 | Date: Tue Sep 8 16:01:54 2015 -0700 68 | 69 | String-to-int bugfix 70 | 71 | commit f85c981da6622d8d8925037325822ada4c4bfce1 72 | Author: Yoz (Jeremy) Grahame 73 | Date: Tue Sep 8 15:32:57 2015 -0700 74 | 75 | Added MAX_PAGE_SIZE test 76 | 77 | commit 4ba1cedcf73a8ff7e6415ab54dcbdf80ff6d211c 78 | Author: Yoz (Jeremy) Grahame 79 | Date: Tue Sep 8 15:25:21 2015 -0700 80 | 81 | Only allow up to MAX_PAGE_SIZE per page 82 | 83 | commit 226fb43f73268c91e4044118f84ceca707cceadb 84 | Author: Yoz (Jeremy) Grahame 85 | Date: Tue Sep 8 15:04:03 2015 -0700 86 | 87 | Log ES query time, and show with "debug" option 88 | 89 | commit 9c1e04ecf2d25c505941c55d71868c2894102983 90 | Merge: 6461581 99a490b 91 | Author: Yoz Grahame 92 | Date: Tue Sep 8 09:13:40 2015 -0700 93 | 94 | Merge pull request #194 from 18F/dev-sort 95 | 96 | autocomplete type alpha sort 97 | 98 | commit 99a490b1b12046895b49f9b3001c05f30b2dfa82 99 | Author: Sarah Allen 100 | Date: Tue Sep 8 01:51:27 2015 -0700 101 | 102 | autocomplete type alpha sort 103 | -------------------------------------------------------------------------------- /lib/data_magic/index/importer.rb: -------------------------------------------------------------------------------- 1 | require 'forwardable' 2 | 3 | module DataMagic 4 | module Index 5 | class Importer 6 | attr_reader :raw_data, :options 7 | 8 | def initialize(raw_data, 
options) 9 | @raw_data = raw_data 10 | @options = options 11 | end 12 | 13 | def process 14 | setup 15 | parse_and_log 16 | finish! 17 | 18 | [row_count, headers] 19 | end 20 | 21 | def client 22 | @client ||= SuperClient.new(es_client, options) 23 | end 24 | 25 | def builder_data 26 | @builder_data ||= BuilderData.new(raw_data, options) 27 | end 28 | 29 | def output 30 | @output ||= Output.new 31 | end 32 | 33 | def parse_and_log 34 | parse_csv 35 | rescue InvalidData => e 36 | trigger("error", e.message) 37 | raise InvalidData, "invalid file format" if empty? 38 | end 39 | 40 | def chunk_size 41 | (ENV['CHUNK_SIZE'] || 100).to_i 42 | end 43 | 44 | def nprocs 45 | (ENV['NPROCS'] || 1).to_i 46 | end 47 | 48 | def parse_csv 49 | if nprocs == 1 50 | parse_csv_whole 51 | else 52 | parse_csv_chunked 53 | end 54 | data.close 55 | end 56 | 57 | def parse_csv_whole 58 | CSV.new( 59 | data, 60 | headers: true, 61 | header_converters: lambda { |str| str.strip.to_sym } 62 | ).each do |row| 63 | RowImporter.process(row, self) 64 | break if at_limit? 65 | end 66 | end 67 | 68 | def parse_csv_chunked 69 | CSV.new( 70 | data, 71 | headers: true, 72 | header_converters: lambda { |str| str.strip.to_sym } 73 | ).each.each_slice(chunk_size) do |chunk| 74 | break if at_limit? 75 | chunks_per_proc = (chunk.size / nprocs.to_f).ceil 76 | Parallel.each(chunk.each_slice(chunks_per_proc)) do |rows| 77 | rows.each_with_index do |row, idx| 78 | RowImporter.process(row, self) 79 | end 80 | end 81 | if !headers 82 | single_document = DocumentBuilder.create(chunk.first, builder_data, DataMagic.config) 83 | set_headers(single_document) 84 | end 85 | increment(chunk.size) 86 | end 87 | end 88 | 89 | def setup 90 | client.create_index 91 | log_setup 92 | end 93 | 94 | def finish! 95 | validate! 
96 | refresh_index 97 | log_finish 98 | end 99 | 100 | def log_setup 101 | opts = options.reject { |k,v| k == :mapping } 102 | trigger("info", "options", opts) 103 | trigger("info", "new_field_names", new_field_names) 104 | trigger("info", "additional_data", additional_data) 105 | end 106 | 107 | def log_finish 108 | trigger("info", "skipped (missing parent id)", output.skipped) if !output.skipped.empty? 109 | trigger('info', "done #{row_count} rows") 110 | end 111 | 112 | def event_logger 113 | @event_logger ||= EventLogger.new 114 | end 115 | 116 | def at_limit? 117 | options[:limit_rows] && row_count == options[:limit_rows] 118 | end 119 | 120 | extend Forwardable 121 | 122 | def_delegators :output, :set_headers, :skipping, :skipped, :increment, :row_count, :log_limit, 123 | :empty?, :validate!, :headers 124 | def_delegators :builder_data, :data, :new_field_names, :additional_data 125 | def_delegators :client, :refresh_index 126 | def_delegators :event_logger, :trigger 127 | 128 | def self.process(*args) 129 | new(*args).process 130 | end 131 | 132 | private 133 | 134 | def es_client 135 | DataMagic.client 136 | end 137 | end 138 | end 139 | end 140 | -------------------------------------------------------------------------------- /sample-data/data.yaml: -------------------------------------------------------------------------------- 1 | version: cities100-2010 2 | # cities100.txt 3 | # National Places Gazetteer Files, from US Census 2010 4 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html 5 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t' source.txt) > result.txt 6 | # head -n 101 results.txt > cities100.txt 7 | # then convertes to csv and removed " city" from after each city name 8 | index: city-data 9 | api: cities 10 | unique: ['name'] 11 | 12 | options: 13 | search: dictionary_only # API provides error when requesting fields not in dictionary 14 | 15 | dictionary: 16 | id: 17 | source: GEOID 18 | description: 
> 19 | Geographic Identifier - fully concatenated geographic code (State FIPS and 20 | County FIPS). The Census Bureau and other state and federal agencies are 21 | responsible for assigning geographic identifiers, or GEOIDs, to geographic 22 | entities to facilitate the organization, presentation, and 23 | exchange of geographic and statistical data. GEOIDs are numeric codes that 24 | uniquely identify all administrative/legal and statistical geographic areas for 25 | which the Census Bureau tabulates data. From Alaska, the largest state, 26 | to the smallest census block in New York City, every geographic area 27 | has a unique GEOID. Data users rely on GEOIDs to join the appropriate 28 | demographic data from censuses and surveys, such as the 29 | American Community Survey (ACS), to various levels of geography for data 30 | analysis, interpretation and mapping. 31 | code: 32 | source: ANSICODE 33 | description: > 34 | American National Standards Institute codes (ANSI codes) 35 | are standardized numeric or alphabetic codes issued by the American 36 | National Standards Institute (ANSI) to ensure uniform identification of 37 | geographic entities through all federal government agencies. 38 | name: 39 | source: NAME 40 | description: The name of the city 41 | type: literal 42 | state: 43 | source: USPS 44 | description: Two letter state abbreviation 45 | population: 46 | source: POP10 47 | description: City population from 2010 Census data 48 | type: integer 49 | location.lat: INTPTLAT 50 | location.lon: INTPTLONG 51 | land_area: 52 | source: ALAND_SQMI 53 | description: Land Area (square miles) 55 | type: float 56 | area.water: 57 | description: Water Area (square miles) 58 | source: AWATER_SQMI 59 | type: float 60 | 61 | categories: 62 | general: 63 | title: General 64 | description: > 65 | general information about the city, including standard 66 | identifiers and actual census summary data about the population of the city. 
67 | fields: [id, code, name, state, population] 68 | geographic: 69 | title: Geographic 70 | description: > 71 | Geographic characteristics of the area. These are created for 72 | statistical purposes only. Depiction and designation for statistical 73 | purposes does not constitute a determination of jurisdictional authority 74 | or rights of ownership or entitlement. 75 | fields: [location, area.land, area.water] 76 | general2: 77 | title: General2 78 | description: > 79 | general information about the city, including standard 80 | identifiers and actual census summary data about the population of the city. 81 | fields: [id, code, name, state, population] 82 | general3: 83 | title: General3 84 | description: > 85 | short 86 | fields: [id, code, name, state, population] 87 | general4: 88 | title: General4 89 | description: > 90 | short 91 | fields: [id, code, name, state, population] 92 | general5: 93 | title: General5 94 | description: > 95 | general information about the city, including standard 96 | identifiers and actual census summary data about the population of the city. 
97 | fields: [id, code, name, state, population] 98 | 99 | files: 100 | - name: cities100.csv 101 | -------------------------------------------------------------------------------- /public/javascripts/jquery-ujs.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Padrino Javascript Jquery Adapter 3 | * Created for use with Padrino Ruby Web Framework (http://www.padrinorb.com) 4 | **/ 5 | 6 | /* Remote Form Support 7 | * form_for @user, '/user', :remote => true 8 | **/ 9 | 10 | $(function(){ 11 | $('form').on('submit', function(e) { 12 | var element = $(this), message = element.data('confirm'); 13 | if (message && !confirm(message)) { return false; } 14 | if (element.data('remote') == true) { 15 | e.preventDefault(); e.stopped = true; 16 | JSAdapter.sendRequest(element, { 17 | verb: element.data('method') || element.attr('method') || 'post', 18 | url: element.attr('action'), 19 | dataType: element.data('type') || ($.ajaxSettings && $.ajaxSettings.dataType) || 'script', 20 | params: element.serializeArray() 21 | }); 22 | } 23 | }); 24 | 25 | /* Confirmation Support 26 | * link_to 'sign out', '/logout', :confirm => 'Log out?' 
27 | **/ 28 | 29 | $(document).on('click', 'a[data-confirm]', function(e) { 30 | var message = $(this).data('confirm'); 31 | if (!confirm(message)) { e.preventDefault(); e.stopped = true; } 32 | }); 33 | 34 | /* 35 | * Link Remote Support 36 | * link_to 'add item', '/create', :remote => true 37 | **/ 38 | 39 | $(document).on('click', 'a[data-remote=true]', function(e) { 40 | var element = $(this); 41 | if (e.stopped) return; 42 | e.preventDefault(); e.stopped = true; 43 | JSAdapter.sendRequest(element, { 44 | verb: element.data('method') || 'get', 45 | url: element.attr('href') 46 | }); 47 | }); 48 | 49 | /* 50 | * Link Method Support 51 | * link_to 'delete item', '/destroy', :method => :delete 52 | **/ 53 | 54 | $(document).on('click', 'a[data-method]:not([data-remote])', function(e) { 55 | if (e.stopped) return; 56 | JSAdapter.sendMethod($(this)); 57 | e.preventDefault(); e.stopped = true; 58 | }); 59 | 60 | /* JSAdapter */ 61 | var JSAdapter = { 62 | // Sends an xhr request to the specified url with given verb and params 63 | // JSAdapter.sendRequest(element, { verb: 'put', url : '...', params: {} }); 64 | sendRequest: function(element, options) { 65 | var verb = options.verb, url = options.url, params = options.params, dataType = options.dataType; 66 | var event = element.trigger('ajax:before'); 67 | if (event.stopped) return false; 68 | $.ajax({ 69 | url: url, 70 | type: verb.toUpperCase() || 'POST', 71 | data: params || [], 72 | dataType: dataType, 73 | 74 | beforeSend: function(request) { element.trigger('ajax:loading', [ request ]); }, 75 | complete: function(request) { element.trigger('ajax:complete', [ request ]); }, 76 | success: function(request) { element.trigger('ajax:success', [ request ]); }, 77 | error: function(request) { element.trigger('ajax:failure', [ request ]); } 78 | }); 79 | element.trigger('ajax:after'); 80 | }, 81 | // Triggers a particular method verb to be triggered in a form posting to the url 82 | // JSAdapter.sendMethod(element); 83 
| sendMethod: function(element) { 84 | var verb = element.data('method'); 85 | var url = element.attr('href'); 86 | var form = $('
'); 87 | var csrf_token = $('meta[name=csrf-token]').attr('content'); 88 | var csrf_param = $('meta[name=csrf-param]').attr('content'); 89 | form.hide().appendTo('body'); 90 | if (verb !== 'post') { 91 | var field = ''; 92 | form.append(field); 93 | } 94 | if (csrf_param !== undefined && csrf_token !== undefined) { 95 | var field = ''; 96 | form.append(field); 97 | } 98 | form.submit(); 99 | } 100 | }; 101 | 102 | // Every xhr request is sent along with the CSRF token. 103 | $.ajaxPrefilter(function(options, originalOptions, xhr) { 104 | if (options.verb !== 'GET') { 105 | var token = $('meta[name="csrf-token"]').attr('content'); 106 | if (token) xhr.setRequestHeader('X-CSRF-Token', token); 107 | } 108 | }); 109 | }); 110 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Running Open Data Maker on your computer 2 | 3 | If you just want to install and run, then you can just download a 4 | [zip file](https://github.com/18F/open-data-maker/archive/master.zip). 5 | 6 | You will still need the the dependencies below, but you don't need to 7 | clone the git repo for the source code. 8 | 9 | ## Install Prerequisites 10 | 11 | You can run our bootstrap script to make sure you have all the dependencies. 12 | It will also install and start up Elasticsearch: 13 | 14 | ``` 15 | script/bootstrap 16 | ``` 17 | 18 | To run Open Data Maker, you will need to have the following software installed on your computer: 19 | * [Elasticsearch] 1.7.3 20 | * [Ruby] 2.2.2 21 | 22 | **NOTE: Open Data Maker does not currently work with Elasticsearch versions 2.x and above.** 23 | You can follow or assist our progress towards 2.x compatibility [at this GitHub issue](https://github.com/18F/open-data-maker/issues/248). 24 | 25 | ### Mac OS X 26 | 27 | On a Mac, we recommend installing Ruby 2.2.2 via [RVM], and Elasticsearch 1.7.3 via 28 | [Homebrew]. 
If you don't want to use the bootstrap script above, you can install 29 | elasticsearch 1.7 with brew using the following command: 30 | 31 | ``` 32 | brew install elasticsearch17 33 | ``` 34 | 35 | If you are contributing to development, you will also need [Git]. 36 | If you don't already have these tools, the 18F [laptop] script will install 37 | them for you. 38 | 39 | ## Get the Source Code 40 | 41 | For development, [fork](http://help.github.com/fork-a-repo/) the repo 42 | first, then clone your fork. 43 | 44 | ``` 45 | git clone https://github.com//open-data-maker.git 46 | cd open-data-maker 47 | ``` 48 | 49 | ## Run the App 50 | 51 | ### Make sure Elasticsearch is up and running 52 | If you just ran `script/bootstrap`, then Elasticsearch should already be 53 | running. But if you stopped it or restarted your computer, you'll need to 54 | start it back up. Assuming you installed Elasticsearch via our `bootstrap` 55 | script, you can restart it with this command: 56 | 57 | ```brew services restart elasticsearch``` 58 | 59 | 60 | ### Import the data 61 | 62 | To get started, you can import sample data with: 63 | 64 | `rake import` 65 | 66 | ### Start the app 67 | 68 | ``` 69 | padrino start 70 | ``` 71 | Go to: http://127.0.0.1:3000/ 72 | 73 | and you should see the text `Welcome to Open Data Maker` with a link to 74 | the API created by the [sample data](sample-data). 75 | 76 | You can verify that the import was successful by visiting 77 | http://127.0.0.1:3000/v1/cities?name=Cleveland. You should see something like: 78 | 79 | ```json 80 | { 81 | "state": "OH", 82 | "name": "Cleveland", 83 | "population": 396815, 84 | "land_area": 77.697, 85 | "location": { 86 | "lat": 41.478138, 87 | "lon": -81.679486 88 | } 89 | ``` 90 | 91 | ### Custom Datasets 92 | 93 | While the app is running (or anytime) you can run `rake import`. 
For instance, if you had a `presidents/data.yaml` file, you would import 94 | it with: 95 | 96 | ```sh 97 | export DATA_PATH=presidents 98 | rake import 99 | # or, more succinctly: 100 | DATA_PATH=presidents rake import 101 | ``` 102 | 103 | to clear the data, assuming the data set had an index named "president-data" 104 | 105 | ``` 106 | rake es:delete[president-data] 107 | ``` 108 | 109 | you may alternately delete all the indices (which could affect other apps if 110 | they are using your local Elasticsearch) 111 | 112 | ``` 113 | rake es:delete[_all] 114 | ``` 115 | 116 | The data directory can optionally include a file called `data.yaml` (see [the sample one](sample-data/data.yaml) for its schema) that references one or more `.csv` files and specifies data types, 117 | field name mapping, and other support data. 118 | 119 | ## Experimental web UI for indexing 120 | 121 | Optionally, you can enable indexing from the web app, but this option is still experimental: 122 | * `export INDEX_APP=enable` 123 | * in your browser, go to /index/reindex 124 | 125 | the old index (if present) will be deleted and re-created from source files at DATA_PATH. 126 | 127 | ## Want to help? 
128 | 129 | See [Contribution Guide](CONTRIBUTING.md) 130 | 131 | Read additional [implementation notes](NOTES.md) 132 | 133 | [Elasticsearch]: https://www.elastic.co/products/elasticsearch 134 | [Homebrew]: http://brew.sh/ 135 | [RVM]: https://github.com/wayneeseguin/rvm 136 | [rbenv]: https://github.com/sstephenson/rbenv 137 | [Ruby]: https://www.ruby-lang.org/en/ 138 | [Git]: https://git-scm.com/ 139 | [laptop]: https://github.com/18F/laptop 140 | -------------------------------------------------------------------------------- /app/controllers.rb: -------------------------------------------------------------------------------- 1 | # Main front page 2 | OpenDataMaker::App.controllers do 3 | get :index do 4 | render :home, layout: true, locals: { 5 | 'title' => 'Open Data Maker', 6 | 'endpoints' => DataMagic.config.api_endpoint_names, 7 | 'examples' => DataMagic.config.examples, 8 | 'categories' => DataMagic.config.categories.to_json 9 | } 10 | end 11 | 12 | get :category, :with => :id do 13 | category_entry = DataMagic.config.category_by_id(params[:id]) 14 | render :category, layout: true, locals: { 15 | 'title' => 'Open Data Maker', 16 | 'category_entry' => category_entry.to_json, 17 | 'field_details' => category_entry['field_details'].to_json 18 | } 19 | end 20 | end 21 | 22 | CACHE_TTL = 300 23 | 24 | # All API requests are prefixed by the API version 25 | # in this case, "v1" - e.g. "/v1/endpoints" etc. 
26 | OpenDataMaker::App.controllers :v1 do 27 | before do 28 | content_type :json 29 | headers 'Access-Control-Allow-Origin' => '*', 30 | 'Access-Control-Allow-Methods' => ['GET'], 31 | 'Surrogate-Control' => "max-age=#{CACHE_TTL}" 32 | cache_control :public, max_age: CACHE_TTL 33 | end 34 | 35 | get :endpoints do 36 | endpoints = DataMagic.config.api_endpoints.keys.map do |key| 37 | { 38 | name: key, 39 | url: url_for(:v1, :index, endpoint: key) 40 | } 41 | end 42 | return { endpoints: endpoints }.to_json 43 | end 44 | 45 | get '/data.json' do 46 | data = DataMagic.config.data 47 | data.to_json 48 | end 49 | 50 | get :index, with: ':endpoint/:command', provides: [:json] do 51 | process_params 52 | end 53 | 54 | get :index, with: ':endpoint', provides: [:json, :csv] do 55 | process_params 56 | end 57 | end 58 | 59 | def process_params 60 | options = get_search_args_from_params(params) 61 | DataMagic.logger.debug "-----> APP GET #{params.inspect} with options #{options.inspect}" 62 | 63 | check_endpoint!(options) 64 | set_content_type(options) 65 | search_and_respond(options) 66 | end 67 | 68 | def search_and_respond(options) 69 | data = DataMagic.search(params, options) 70 | halt 400, data.to_json if data.key?(:errors) 71 | 72 | if content_type == :csv 73 | output_data_as_csv(data['results']) 74 | else 75 | data.to_json 76 | end 77 | end 78 | 79 | def check_endpoint!(options) 80 | unless DataMagic.config.api_endpoints.keys.include? options[:endpoint] 81 | halt 404, { 82 | error: 404, 83 | message: "#{options[:endpoint]} not found. Available endpoints: #{DataMagic.config.api_endpoints.keys.join(',')}" 84 | }.to_json 85 | end 86 | end 87 | 88 | def set_content_type(options) 89 | if options[:command] == 'stats' 90 | content_type :json 91 | else 92 | content_type(options[:format].nil? ? 
:json : options[:format].to_sym) 93 | end 94 | end 95 | 96 | # TODO: Use of non-underscore-prefixed option parameters is still 97 | # supported but deprecated, and should be removed at some point soon - 98 | # see comment in method body 99 | def get_search_args_from_params(params) 100 | options = {} 101 | %w(metrics sort fields zip distance page per_page debug).each do |opt| 102 | options[opt.to_sym] = params.delete("_#{opt}") 103 | # TODO: remove next line to end support for un-prefixed option parameters 104 | options[opt.to_sym] ||= params.delete(opt) 105 | end 106 | options[:endpoint] = params.delete("endpoint") # these two params are 107 | options[:format] = params.delete("format") # supplied by Padrino 108 | options[:fields] = (options[:fields] || "").split(',') 109 | options[:command] = params.delete("command") 110 | 111 | options[:metrics] = options[:metrics].split(/\s*,\s*/) if options[:metrics] 112 | options 113 | end 114 | 115 | def output_data_as_csv(results) 116 | # We assume all rows have the same keys 117 | if results.empty? 118 | '' 119 | else 120 | CSV.generate(force_quotes: true, headers: true) do |csv| 121 | results.each_with_index do |row, row_num| 122 | row = NestedHash.new(row).withdotkeys 123 | # make the order match data.yaml order 124 | output = DataMagic.config.field_types.each_with_object({}) do |(name, type), output| 125 | output[name] = row[name] unless row[name].nil? 126 | if name == "location" 127 | output["location.lat"] = row["location.lat"] unless row["location.lat"].nil? 128 | output["location.lon"] = row["location.lon"] unless row["location.lon"].nil? 
129 | end 130 | end 131 | csv << output.keys if row_num == 0 132 | csv << output 133 | end 134 | end 135 | end 136 | end 137 | -------------------------------------------------------------------------------- /spec/lib/data_magic/search_name_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'data_magic' 3 | require 'csv' 4 | 5 | describe "DataMagic intuitive search" do 6 | 7 | before :example do 8 | DataMagic.destroy 9 | ENV['DATA_PATH'] = './spec/fixtures/school_names' 10 | DataMagic.init(load_now: true) 11 | end 12 | after :example do 13 | DataMagic.destroy 14 | end 15 | 16 | RSpec.configure do |c| 17 | c.alias_it_should_behave_like_to :it_correctly, 'correctly:' 18 | end 19 | 20 | let(:expected_meta) {{"metadata"=>{"total"=>1, "page"=>0, "per_page"=>20}}} 21 | let(:expected_match) { "" } 22 | let(:response) { DataMagic.search( 23 | {'school.name' => subject}, fields:['school.name']) } 24 | 25 | context "full request" do 26 | let(:response) { DataMagic.search({id: 1}) } 27 | let(:expected_match) { [{"id"=>"1", "school"=>{"state"=>"AL", "name"=>"Stillman College"}}]} 28 | it "provides expected document" do 29 | expect(response['results']).to eql expected_match 30 | end 31 | end 32 | 33 | context "sort" do 34 | shared_examples "returns" do 35 | it "sorted results " do 36 | expect(response['results'].map { |i| i['school.name'] }) 37 | .to eql expected_match 38 | end 39 | end 40 | 41 | context "with list of names" do 42 | let(:response) { DataMagic.search({}, fields:['school.name'], 43 | sort: 'school.name') } 44 | # fields:['name'], 45 | let(:expected_match) { 46 | csv_path = File.expand_path("../../fixtures/school_names/school_names.csv", __dir__) 47 | data = CSV.read(csv_path).slice(1..-1) 48 | data.map { |row| row[2] } 49 | .sort.slice(0,20) 50 | } 51 | it_correctly "returns" 52 | end 53 | 54 | end 55 | 56 | context "basic search" do 57 | shared_examples "finds" do 58 | it "correct 
results " do 59 | expect(response['results'] 60 | .map { |i| i['school.name'] } 61 | .sort ) 62 | .to eql expected_match 63 | end 64 | it "correct metadata" do 65 | expect(response.reject { |k, _| k == 'results' }).to eql expected_meta 66 | end 67 | end 68 | 69 | context "for exact match" do 70 | subject { 'New York University' } 71 | let(:expected_match) { ['New York University'] } 72 | it_correctly "finds" 73 | end 74 | context "for exact match (case insensitive)" do 75 | subject { 'new YORK UniverSity' } 76 | let(:expected_match) { ['New York University'] } 77 | it_correctly "finds" 78 | end 79 | 80 | context "for exact match (case insensitive)" do 81 | subject { 'new YORK UniverSity' } 82 | let(:expected_match) { ['New York University'] } 83 | it_correctly "finds" 84 | end 85 | 86 | context "by prefix" do 87 | subject { 'Still' } 88 | let(:expected_match) { ['Stillman College'] } 89 | it_correctly "finds" 90 | end 91 | 92 | context "by prefix (case insensitive)" do 93 | subject { 'still' } 94 | let(:expected_match) { ['Stillman College'] } 95 | it_correctly "finds" 96 | end 97 | 98 | context "by prefix in the middle of the name" do 99 | subject { 'Phoenix' } 100 | let(:expected_meta) {{"metadata"=>{"total"=>3, "page"=>0, "per_page"=>20}}} 101 | let(:expected_match) { ['Phoenix College', 102 | 'University of Phoenix-Online Campus', 103 | "University of Phoenix-Phoenix Campus"] } 104 | it_correctly "finds" 105 | end 106 | 107 | context "with words in the wrong order" do 108 | subject { 'University New York' } 109 | let(:expected_match) { ['New York University'] } 110 | it_correctly "finds" 111 | end 112 | 113 | context "partial word after dash" do 114 | subject { 'berk' } 115 | let(:expected_meta) {{"metadata"=>{"total"=>3, "page"=>0, "per_page"=>20}}} 116 | let(:expected_match) { ['Berk Trade and Business School', 117 | 'Berklee College of Music', 118 | 'University of California-Berkeley'] } 119 | it_correctly "finds" 120 | end 121 | 122 | context "words 
separated by dash" do 123 | subject { 'phoenix online' } 124 | let(:expected_match) { ['University of Phoenix-Online Campus'] } 125 | it_correctly "finds" 126 | end 127 | end 128 | # TO DO 129 | # "pheonix" (mis-spelling) should probably work 130 | # "phoenix college" should also probably return "university of phoenix" --- since college is a synonym for university 131 | 132 | end 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open Data Maker 2 | [![Build Status](https://circleci.com/gh/18F/open-data-maker/tree/dev.svg?style=svg)](https://circleci.com/gh/18F/open-data-maker/tree/dev) 3 | 4 | The goal of this project is to make it easy to turn a lot of potentially large 5 | csv files into open data via an API and the ability for people to download 6 | smaller csv files with a subset of the data. 7 | 8 | Preliminary research suggests that open data users (journalists and others) 9 | actually know how to work with spreadsheets really well, but a lot of the 10 | data sets that we have in government are huge. 11 | 12 | The first version of this project will allow us to host a website for an 13 | agency with a specific set of csv files, which are deployed with the app. 14 | This will allow us to deploy more quickly since there will be a lower risk 15 | security profile than if an agency could upload the CSV files (which might 16 | be a nice longer term feature). 
17 | 18 | 19 | ## Install and Run the App (as a developer) 20 | 21 | See our [Installation Guide](INSTALL.md) 22 | 23 | ## How this works 24 | 25 | By default, data will be loaded from /sample-data when you run `rake import` 26 | 27 | * [cities100.csv](sample-data/cities100.csv) - dataset of 100 most populous cities in the US 28 | * [data.yaml](sample-data/data.yaml) - configuration for 29 | * index name *city-data* 30 | * api endpoint name *cities* 31 | * how columns are mapped to fields in json output 32 | * data types 33 | * unique columns *name* 34 | 35 | When you run the app, you can query the dataset via json API, like: /cities?name=Chicago 36 | 37 | * http://localhost:3000/cities?name=Chicago 38 | * http://localhost:3000/cities?name=Chicago&state=IL 39 | * http://localhost:3000/cities?state=NY,MA 40 | * http://localhost:3000/cities?state=CA&fields=name,size 41 | 42 | To use your own data, you can set a different directory, for example: 43 | 44 | ``` 45 | export DATA_PATH='./data' 46 | ``` 47 | 48 | 1. Put csv files into /data 49 | 1. Import files from /data: ```rake import``` (or restart the app) 50 | 1. There can be multiple files (must end in .csv) 51 | 1. Optional [data.yaml](sample-data/data.yaml) file that specifies index name, API endpoint, file list, and a dictionary of column -> field name mapping and types 52 | 1. Optionally import all the columns, not just ones specified in dictionary (see example: [import: all](spec/fixtures/import_with_options/data.yaml)) 53 | 1. If data.yaml not provided, all files and fields will be imported with folder or bucket name used as the API endpoint (name is 'slugified' with dashes replacing spaces) 54 | 1. api endpoint to get the data /api=endpoint?field_or_column_name=value 55 | 56 | ## More Configuration Options 57 | 58 | Often while you are developing an API and data dictionary, 59 | it is helpful to include all the columns in the csv. 
If you add the following to 60 | data.yaml, the field names and types from the dictionary will be used and any 61 | unspecified columns will simply use the column name as the field name. 62 | 63 | ``` 64 | options: 65 | columns: all 66 | ``` 67 | 68 | You can use the dictionary to provide nice errors to developers who use the API. 69 | This can be used in conjunction with the above ```columns: all``` which will 70 | make it so that columns that are not referenced in the dictionary are not 71 | searchable, but will make it so that unspecified fields cause errors to be 72 | reported. 73 | 74 | ``` 75 | options: 76 | search: dictionary_only 77 | ``` 78 | 79 | Also for debugging, you can limit the number of files that will be imported. This is helpful when the import process is time consuming because you have many, many files, but can test format changes with a subset of the files. 80 | 81 | ``` 82 | options: 83 | limit: 4 84 | ``` 85 | 86 | 87 | 88 | ## Help Wanted 89 | 90 | 1. Try out importing multiple data sets with different endpoints and data.yaml configuration 91 | 2. Take a look at our [open issues](https://github.com/18F/open-data-maker/issues) and our [Contribution Guide](CONTRIBUTING.md) 92 | 93 | ## More Info 94 | 95 | Here's how it might look in the future: 96 | 97 | ![overview of data types, prompt to download data, create a custom data set, or look at API docs](/doc/data-overview.png) 98 | 99 | 100 | ![Download all the data or make choices to create a csv with a subset](/doc/csv-download.png) 101 | 102 | ### Acknowledgements 103 | Zipcode latitude and longitude provided by [GeoNames](http://www.geonames.org/) under a [Creative Commons Attribution 3.0 License](http://creativecommons.org/licenses/by/3.0/). 104 | 105 | ### Public domain 106 | 107 | Except as noted above, this project is in the worldwide [public domain](LICENSE.md). 
As stated in [CONTRIBUTING](CONTRIBUTING.md): 108 | 109 | > This project is in the public domain within the United States, and copyright and related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/). 110 | > 111 | > All contributions to this project will be released under the CC0 dedication. By submitting a pull request, you are agreeing to comply with this waiver of copyright interest. 112 | -------------------------------------------------------------------------------- /lib/data_magic/error_checker.rb: -------------------------------------------------------------------------------- 1 | module DataMagic 2 | module ErrorChecker 3 | class << self 4 | def check(params, options, config) 5 | report_required_params_absent(options) + 6 | report_nonexistent_params(params, config) + 7 | report_nonexistent_operators(params) + 8 | report_nonexistent_fields(options[:fields], config) + 9 | report_bad_range_argument(params) + 10 | report_wrong_field_type(params, config) + 11 | report_wrong_zip(options) + 12 | report_distance_requires_zip(options) 13 | end 14 | 15 | private 16 | 17 | def report_required_params_absent(options) 18 | if options[:command] == 'stats' && options[:fields].length == 0 19 | [build_error(error: 'invalid_or_incomplete_parameters', input: options[:command])] 20 | else 21 | [] 22 | end 23 | end 24 | 25 | def report_distance_requires_zip(params) 26 | # if distance, must have zip 27 | return [] if (params[:distance] && params[:zip]) || (!params[:distance]) 28 | [build_error( 29 | error: 'distance_error' 30 | )] 31 | end 32 | 33 | def report_wrong_zip(params) 34 | return [] if !params[:zip] || Zipcode.valid?(params[:zip]) 35 | [build_error( 36 | error: 'zipcode_error', 37 | parameter: :zip, 38 | input: params[:zip].to_s 39 | )] 40 | end 41 | 42 | def report_nonexistent_params(params, config) 43 | return [] unless config.dictionary_only_search? 
44 | params.keys.reject { |p| config.field_type(strip_op(p)) }. 45 | map { |p| build_error(error: 'parameter_not_found', input: strip_op(p)) } 46 | end 47 | 48 | def report_nonexistent_operators(params) 49 | params.keys.select { |p| p =~ /__(\w+)$/ && $1 !~ /range|not|ne/i }. 50 | map do |p| 51 | (param, op) = p.match(/^(.*)__(\w+)$/).captures 52 | build_error(error: 'operator_not_found', parameter: param, input: op) 53 | end 54 | end 55 | 56 | def report_nonexistent_fields(fields, config) 57 | if fields && !fields.empty? && config.dictionary_only_search? 58 | fields.reject { |f| config.field_type(f.to_s) }. 59 | map { |f| build_error(error: 'field_not_found', input: f.to_s) } 60 | else 61 | [] 62 | end 63 | end 64 | 65 | def report_bad_range_argument(params) 66 | ranges = params.select do |p,v| 67 | p =~ /__range$/ and 68 | v !~ / ^(\d+(\.\d+)?)? # optional starting number 69 | \.\. # range dots 70 | (\d+(\.\d+)?)? # optional ending number 71 | (,(\d+(\.\d+)?)?\.\.(\d+(\.\d+)?)?)* # and more, with commas 72 | $/x 73 | end 74 | ranges.map do |p,v| 75 | build_error(error: 'range_format_error', parameter: strip_op(p), input: v) 76 | end 77 | end 78 | 79 | def report_wrong_field_type(params, config) 80 | bad_fields = params.select do |p, v| 81 | next false if p =~ /__range$/ 82 | param_type = config.field_type(strip_op(p)) 83 | value_type = guess_value_type(v) 84 | (param_type == "float" && value_type != "float" && value_type != "integer") or 85 | (param_type == "integer" && value_type != "integer") 86 | end 87 | bad_fields.map do |p, v| 88 | build_error(error: 'parameter_type_error', parameter: p, input: v, 89 | expected_type: config.field_type(strip_op(p)), 90 | input_type: guess_value_type(v)) 91 | end 92 | end 93 | 94 | def build_error(opts) 95 | opts[:message] = 96 | case opts[:error] 97 | when 'invalid_or_incomplete_parameters' 98 | "The command #{opts[:input]} requires a fields parameter." 
99 | when 'parameter_not_found' 100 | "The input parameter '#{opts[:input]}' is not known in this dataset." 101 | when 'field_not_found' 102 | "The input field '#{opts[:input]}' (in the fields parameter) is not a field in this dataset." 103 | when 'operator_not_found' 104 | "The input operator '#{opts[:input]}' (appended to the parameter '#{opts[:parameter]}') is not known or supported. (Known operators: range, ne, not)" 105 | when 'parameter_type_error' 106 | "The parameter '#{opts[:parameter]}' expects a value of type #{opts[:expected_type]}, but received '#{opts[:input]}' which is a value of type #{opts[:input_type]}." 107 | when 'range_format_error' 108 | "The range '#{opts[:input]}' supplied to parameter '#{opts[:parameter]}' isn't in the correct format." 109 | when 'zipcode_error' 110 | "The provided zipcode, '#{opts[:input]}', is not valid." 111 | when 'distance_error' 112 | "Use of the 'distance' parameter also requires a 'zip' parameter." 113 | end 114 | opts 115 | end 116 | 117 | def guess_value_type(value) 118 | case value.to_s 119 | when /^-?\d+$/ 120 | "integer" 121 | when /^(-?\d+,?)+$/ # list of integers 122 | "integer" 123 | when /^-?\d+\.\d+$/ 124 | "float" 125 | else 126 | "string" 127 | end 128 | end 129 | 130 | def strip_op(param) 131 | param.sub(/__\w+$/, '') 132 | end 133 | end 134 | end 135 | end 136 | -------------------------------------------------------------------------------- /lib/data_magic/query_builder.rb: -------------------------------------------------------------------------------- 1 | module DataMagic 2 | module QueryBuilder 3 | class << self 4 | # Creates query from parameters passed into endpoint 5 | def from_params(params, options, config) 6 | per_page = (options[:per_page] || config.page_size || DataMagic::DEFAULT_PAGE_SIZE).to_i 7 | page = options[:page].to_i || 0 8 | per_page = DataMagic::MAX_PAGE_SIZE if per_page > DataMagic::MAX_PAGE_SIZE 9 | query_hash = { 10 | from: page * per_page, 11 | size: per_page, 12 | } 13 | 14 
| query_hash[:query] = generate_squery(params, options, config).to_search 15 | 16 | if options[:command] == 'stats' 17 | query_hash.merge! add_aggregations(params, options, config) 18 | end 19 | 20 | if options[:fields] && !options[:fields].empty? 21 | query_hash[:fields] = get_restrict_fields(options) 22 | query_hash[:_source] = false 23 | else 24 | query_hash[:_source] = { 25 | exclude: ["_*"] 26 | } 27 | end 28 | query_hash[:sort] = get_sort_order(options[:sort], config) if options[:sort] && !options[:sort].empty? 29 | query_hash 30 | end 31 | 32 | private 33 | 34 | def generate_squery(params, options, config) 35 | squery = Stretchy.query(type: 'document') 36 | squery = search_location(squery, options) 37 | search_fields_and_ranges(squery, params, config) 38 | end 39 | 40 | # Wrapper for Stretchy aggregation clause builder (which wraps ElasticSearch (ES) :aggs parameter) 41 | # Extracts all extended_stats aggregations from ES, to be filtered later 42 | # Is a no-op if no fields are specified, or none of them are numeric 43 | def add_aggregations(params, options, config) 44 | agg_hash = options[:fields].inject({}) do |memo, f| 45 | if config.column_field_types[f.to_s] && ["integer", "float"].include?(config.column_field_types[f.to_s]) 46 | memo[f.to_s] = { extended_stats: { "field" => f.to_s } } 47 | end 48 | memo 49 | end 50 | 51 | agg_hash.empty? ? {} : { aggs: agg_hash } 52 | end 53 | 54 | def get_restrict_fields(options) 55 | options[:fields].map(&:to_s) 56 | end 57 | 58 | # @description turns a string like "state,population:desc" into [{'state' => {order: 'asc'}},{ "population" => {order: "desc"} }] 59 | # @param [String] sort_param 60 | # @return [Array] 61 | def get_sort_order(sort_param, config) 62 | sort_param.to_s.scan(/(\w+[\.\w]*):?(\w*)/).map do |field_name, direction| 63 | direction = 'asc' if direction.empty? 
64 | type = config.field_type(field_name) 65 | # for 'autocomplete' search on lowercase not analyzed indexed in _name 66 | field_name = "_#{field_name}" if type == 'autocomplete' 67 | { field_name => { order: direction } } 68 | end 69 | end 70 | 71 | def to_number(value) 72 | value =~ /\./ ? value.to_f : value.to_i 73 | end 74 | 75 | def search_fields_and_ranges(squery, params, config) 76 | params.each do |param, value| 77 | field_type = config.field_type(param) 78 | if field_type == "name" 79 | squery = include_name_query(squery, param, value) 80 | elsif field_type == "autocomplete" 81 | squery = autocomplete_query(squery, param, value) 82 | elsif match = /(.+)__(range|ne|not)\z/.match(param) 83 | field, operator = match.captures.map(&:to_sym) 84 | squery = range_query(squery, operator, field, value) 85 | elsif field_type == "integer" && value.is_a?(String) && /,/.match(value) # list of integers 86 | squery = integer_list_query(squery, param, value) 87 | else # field equality 88 | squery = squery.where(param => value) 89 | end 90 | end 91 | squery 92 | end 93 | 94 | def include_name_query(squery, field, value) 95 | value = value.split(' ').map { |word| "#{word}*"}.join(' ') 96 | squery.match.query( 97 | # we store lowercase name in field with prefix _ 98 | "wildcard": { "_#{field}" => { "value": value.downcase } } 99 | ) 100 | end 101 | 102 | def range_query(squery, operator, field, value) 103 | if operator == :ne or operator == :not # field negation 104 | squery.where.not(field => value) 105 | else # field range 106 | squery.filter( 107 | or: build_ranges(field, value.split(',')) 108 | ) 109 | end 110 | end 111 | 112 | def autocomplete_query(squery, field, value) 113 | squery.match.query( 114 | common: { 115 | field => { 116 | query: value, 117 | cutoff_frequency: 0.001, 118 | low_freq_operator: "and" 119 | } 120 | }) 121 | end 122 | 123 | def integer_list_query(squery, field, value) 124 | squery.filter( 125 | terms: { 126 | field => value.split(',').map(&:to_i) 
} 127 | ) 128 | end 129 | 130 | def build_ranges(field, range_strings) 131 | range_strings.map do |range| 132 | min, max = range.split('..') 133 | values = {} 134 | values[:gte] = to_number(min) unless min.empty? 135 | values[:lte] = to_number(max) if max 136 | { 137 | range: { field => values } 138 | } 139 | end 140 | end 141 | 142 | # Handles location (currently only uses SFO location) 143 | def search_location(squery, options) 144 | distance = options[:distance] 145 | location = Zipcode.latlon(options[:zip]) 146 | 147 | if distance && !distance.empty? 148 | # default to miles if no distance given 149 | unit = distance[-2..-1] 150 | distance = "#{distance}mi" if unit != "km" and unit != "mi" 151 | 152 | squery = squery.geo('location', distance: distance, lat: location[:lat], lng: location[:lon]) 153 | end 154 | squery 155 | end 156 | end 157 | end 158 | end 159 | -------------------------------------------------------------------------------- /spec/lib/data_magic/config_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe DataMagic::Config do 4 | before(:all) do 5 | ENV['DATA_PATH'] = './spec/fixtures/import_with_dictionary' 6 | end 7 | 8 | it "detects data.yml files" do 9 | ENV['DATA_PATH'] = './spec/fixtures/cities_with_yml' 10 | config = DataMagic::Config.new 11 | expect(config.data["api"]).to eq("cities") 12 | end 13 | 14 | describe 'slugification' do 15 | it 'slugifies local paths' do 16 | config = DataMagic::Config.new 17 | slugified = config.clean_index('path/to/my_directory') 18 | expect(slugified).to eq('my-directory') 19 | end 20 | 21 | it 'slugifes s3 bucket names' do 22 | config = DataMagic::Config.new 23 | slugified = config.clean_index('s3://user:pass@my_bucket') 24 | expect(slugified).to eq('my-bucket') 25 | end 26 | end 27 | 28 | context "s3" do 29 | it "detects data.yaml" do 30 | ENV['DATA_PATH'] = 's3://mybucket' 31 | fake_s3 = class_spy("Fake Aws::S3::Client") 32 | 
fake_get_object_response = double( 33 | "S3 response", 34 | body: StringIO.new({ 'index' => 'fake-index' }.to_yaml), 35 | isOK: true, 36 | status: 200 37 | ) 38 | allow(fake_s3).to receive(:get_object) 39 | .with(bucket: 'mybucket', key: 'data.yaml', response_target: duck_type(:read)) 40 | .and_return(fake_get_object_response) 41 | config = DataMagic::Config.new(s3: fake_s3) 42 | expect(config.s3).to eq(fake_s3) 43 | expect(config.data["index"]).to eq("fake-index") 44 | end 45 | 46 | it "raises error if s3 errors" do 47 | ENV['DATA_PATH'] = 's3://mybucket' 48 | fake_s3 = class_spy("Fake Aws::S3::Client") 49 | 50 | allow(fake_s3).to receive(:get_object) 51 | .with(bucket: 'mybucket', key: 'data.yaml', response_target: duck_type(:read)) 52 | .and_raise(RuntimeError) 53 | expect { 54 | DataMagic::Config.new(s3: fake_s3) 55 | }.to raise_error(RuntimeError) 56 | end 57 | 58 | end 59 | 60 | context "create" do 61 | it "works with zero args" do 62 | expect(DataMagic::Config.new).to_not be_nil 63 | end 64 | it "can set s3 client" do 65 | # TODO: mock s3 66 | s3_client = "s3 client" 67 | config = DataMagic::Config.new(s3: s3_client) 68 | expect(config.s3).to eq(s3_client) 69 | end 70 | end 71 | 72 | context "when loaded" do 73 | let(:config) { DataMagic::Config.new } 74 | 75 | after do 76 | config.clear_all 77 | end 78 | 79 | context "#scoped_index_name" do 80 | it "includes environment prefix" do 81 | expect(config.scoped_index_name).to eq('test-city-data') 82 | end 83 | end 84 | 85 | it "has config data" do 86 | default_config = { 87 | "version" => "cities100-2010", 88 | "index" => "city-data", "api" => "cities", 89 | "files" => [{ "name" => "cities100.csv" }], 90 | "options" => {:search=>"dictionary_only"}, 91 | "unique" => ["name"], 92 | "data_path" => "./sample-data" 93 | } 94 | expect(config.data.keys).to include('dictionary') 95 | dictionary = config.data.delete 'dictionary' 96 | 97 | expect(dictionary.keys.sort).to eq %w(id code name state population 98 | 
location.lat location.lon land_area area.water).sort 99 | categories = config.data.delete 'categories' 100 | expect(categories.keys.sort).to eq %w(general general2 general3 general4 general5 geographic).sort 101 | expect(config.data).to eq(default_config) 102 | end 103 | 104 | it "has default page size" do 105 | expect(DataMagic::DEFAULT_PAGE_SIZE).to_not be_nil 106 | expect(config.page_size).to eq(DataMagic::DEFAULT_PAGE_SIZE) 107 | end 108 | 109 | describe "#update_indexed_config" do # rename ... or do this in load_config or something 110 | context "after loading config" do 111 | let(:fixture_path) { "./spec/fixtures/import_with_dictionary" } 112 | before do 113 | config.load_datayaml(fixture_path) 114 | end 115 | it "should be true" do 116 | expect(config.update_indexed_config).to be true 117 | end 118 | it "should set new data_path" do 119 | expect(config.data_path).to eq(fixture_path) 120 | end 121 | 122 | it "twice should be false" do 123 | config.update_indexed_config 124 | expect(config.update_indexed_config).to be false 125 | end 126 | end 127 | end 128 | 129 | describe "when has a custom null_value" do 130 | it 'should have a default null value' do 131 | expect(config.null_value).to eq('NULL') 132 | end 133 | 134 | it 'should set null value field' do 135 | config.load_datayaml("./spec/fixtures/import_with_null_value") 136 | expect(config.null_value).to eq('abc123') 137 | end 138 | end 139 | end 140 | 141 | context ".calculated_field_list" do 142 | let(:config) { DataMagic::Config.new(load_datayaml: false) } 143 | it "finds fields with 'calculate' property" do 144 | allow(config).to receive(:dictionary).and_return( 145 | { 146 | one: { 147 | source: 'column1', 148 | type: 'float' 149 | }, 150 | two: { 151 | source: 'column2', 152 | type: 'float' 153 | }, 154 | all: { 155 | calculate: 'column1 or column2', 156 | type: 'float', 157 | description: 'something' 158 | } 159 | } 160 | ) 161 | expect(config.calculated_field_list).to eq(['all']) 162 | end 163 | end 
164 | 165 | context ".only_field_list" do 166 | let(:config) { DataMagic::Config.new(load_datayaml: false) } 167 | let(:simple_fields) do 168 | { 'one' => 'column1', 'two' => 'column2', 'three' => 'column3' } 169 | end 170 | let(:fields_with_dots) do 171 | { 'one' => 'column1', 'two.a' => 'column2a', 'two.b' => 'column2b' } 172 | end 173 | 174 | it "selects a subset" do 175 | expect(config.only_field_list(%w(one two), simple_fields)).to eq( 176 | 'one' => 'column1', 'two' => 'column2' 177 | ) 178 | end 179 | 180 | it "selects fields with dots" do 181 | expect(config.only_field_list(%w(two), fields_with_dots)).to eq( 182 | 'two.a' => 'column2a', 'two.b' => 'column2b' 183 | ) 184 | end 185 | end 186 | end 187 | -------------------------------------------------------------------------------- /spec/fixtures/cities_with_yml/cities51-100.csv: -------------------------------------------------------------------------------- 1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG" 2 | "TX","4804000","02409731","Arlington","25","A","365438","144805","248332497","9690024","95.882","3.741","32.700708","-97.124691" 3 | "CA","0603526","02409774","Bakersfield","25","A","347483","120725","368204317","3741691","142.164","1.445","35.321213","-119.018291" 4 | "LA","2255000","00545142","New Orleans","25","A","343829","189896","438803381","468240430","169.423","180.789","30.068636","-89.939007" 5 | "HI","1571550","02630783","Urban Honolulu CDP","57","S","337256","143173","156748036","20484151","60.521","7.909","21.325852","-157.845315" 6 | "CA","0602000","02409704","Anaheim","25","A","336265","104237","129073275","2526668","49.835","0.976","33.855497","-117.760071" 7 | "FL","1271000","02405568","Tampa","25","A","335709","157130","293727878","160127838","113.409","61.826","27.970086","-82.479673" 8 | 
"CO","0804000","02409757","Aurora","25","A","325078","131040","400759192","1806832","154.734","0.698","39.688002","-104.689740" 9 | "CA","0669000","02411814","Santa Ana","25","A","324528","76896","70627761","643479","27.270","0.248","33.736478","-117.882593" 10 | "MO","2965000","00767557","St. Louis","25","A","319294","176002","160343174","10683076","61.909","4.125","38.635699","-90.244582" 11 | "PA","4261000","01214818","Pittsburgh","25","A","305704","156165","143399923","7693613","55.367","2.971","40.439753","-79.976592" 12 | "TX","4817000","02410234","Corpus Christi","25","A","305215","125469","415982136","852055055","160.612","328.980","27.754252","-97.173385" 13 | "CA","0662000","02410965","Riverside","25","A","303871","98444","210152356","788400","81.140","0.304","33.938143","-117.393168" 14 | "OH","3915000","01086201","Cincinnati","25","A","296943","161095","201869928","4155439","77.942","1.604","39.139902","-84.506446" 15 | "KY","2146027","02405089","Lexington-Fayette urban county","UC","A","295803","135160","734648526","4922803","283.649","1.901","38.040157","-84.458443" 16 | "AK","0203000","02419025","Anchorage municipality","37","A","291826","113032","4415108963","663860984","1704.683","256.318","61.177549","-149.274354" 17 | "CA","0675000","02411987","Stockton","25","A","291707","99637","159723404","7984682","61.670","3.083","37.976342","-121.313304" 18 | "OH","3977000","01086537","Toledo","25","A","287208","138039","208991246","8889079","80.692","3.432","41.664071","-83.581861" 19 | "MN","2758000","02396511","St. 
Paul","25","A","285068","120795","134623737","10875208","51.979","4.199","44.948869","-93.103855" 20 | "NJ","3451000","00885317","Newark","25","A","277140","109520","62643850","4972876","24.187","1.920","40.724220","-74.172574" 21 | "NC","3728000","02403745","Greensboro","25","A","269666","124074","327673360","13690607","126.515","5.286","36.096483","-79.827108" 22 | "NY","3611000","00978764","Buffalo","25","A","261310","133444","104594197","31364094","40.384","12.110","42.892492","-78.859686" 23 | "TX","4858016","02411437","Plano","25","A","259841","103672","185394655","937663","71.581","0.362","33.050769","-96.747944" 24 | "NE","3128000","02395713","Lincoln","25","A","258379","110546","230804010","3229386","89.114","1.247","40.808957","-96.680354" 25 | "NV","3231900","02410741","Henderson","25","A","257729","113586","279023542","0","107.732","0.000","36.012233","-115.037462" 26 | "IN","1825000","02394798","Fort Wayne","25","A","253691","113541","286500436","553423","110.618","0.214","41.088173","-85.143880" 27 | "NJ","3436000","00885264","Jersey","25","A","247597","108720","38315542","16280557","14.794","6.286","40.711417","-74.064760" 28 | "FL","1263000","02405401","St. 
Petersburg","25","A","244769","129401","159909751","196473878","61.742","75.859","27.761976","-82.644055" 29 | "CA","0613392","02409461","Chula Vista","25","A","243916","79416","128544675","6380068","49.631","2.463","32.627670","-117.015170" 30 | "VA","5157000","01498557","Norfolk","25","A","242803","95018","140171293","109376999","54.120","42.231","36.923015","-76.244641" 31 | "FL","1253000","02404443","Orlando","25","A","238300","121254","265203107","21469603","102.395","8.289","28.415886","-81.298750" 32 | "AZ","0412000","02409433","Chandler","25","A","236123","94404","166828220","289715","64.413","0.112","33.282874","-111.854943" 33 | "TX","4841464","02411626","Laredo","25","A","236091","68610","230271380","3754983","88.908","1.450","27.547681","-99.486931" 34 | "WI","5548000","01583625","Madison","25","A","233209","108843","198882058","44658619","76.789","17.243","43.087806","-89.430121" 35 | "NC","3775000","02405771","Winston-Salem","25","A","229617","103974","343041264","3228612","132.449","1.247","36.103262","-80.260578" 36 | "TX","4845000","02410892","Lubbock","25","A","229573","95926","317041399","2962034","122.410","1.144","33.566479","-101.886677" 37 | "LA","2205000","02403821","Baton Rouge","25","B","229493","100801","199291656","5588234","76.947","2.158","30.448454","-91.125899" 38 | "NC","3719000","02403521","Durham","25","A","228330","103221","278087581","2357401","107.370","0.910","35.980964","-78.905647" 39 | "TX","4829000","02410572","Garland","25","A","226876","80834","147848881","340126","57.085","0.131","32.909826","-96.630357" 40 | "AZ","0427820","02410596","Glendale","25","A","226721","90505","155337275","401624","59.976","0.155","33.533111","-112.189901" 41 | "NV","3260600","02410923","Reno","25","A","225221","102582","266792840","7423507","103.009","2.866","39.474487","-119.776538" 42 | "FL","1230000","02404689","Hialeah","25","A","224669","74067","55554697","3599730","21.450","1.390","25.869941","-80.302865" 43 | 
"NV","3254600","02409023","Paradise CDP","57","S","223167","114296","120996826","0","46.717","0.000","36.080689","-115.136839" 44 | "VA","5116000","01498558","Chesapeake","25","A","222209","83196","882669156","26052854","340.800","10.059","36.679376","-76.301788" 45 | "AZ","0465000","02411845","Scottsdale","25","A","217385","124001","476350341","1231086","183.920","0.475","33.668727","-111.823682" 46 | "NV","3251800","02411273","North Las Vegas","25","A","216961","76073","262483131","112001","101.345","0.043","36.282974","-115.089262" 47 | "TX","4837000","02410117","Irving","25","A","216290","91128","173573892","2594600","67.017","1.002","32.857748","-96.970022" 48 | "CA","0626000","02410545","Fremont","25","A","214089","73989","200617968","26291598","77.459","10.151","37.494373","-121.941117" 49 | "CA","0636770","02410116","Irvine","25","A","212375","83899","171214072","900908","66.106","0.348","33.678399","-117.771254" 50 | "AL","0107000","02403868","Birmingham","25","A","212237","108981","378310927","6590665","146.067","2.545","33.527444","-86.799047" 51 | "NY","3663000","00979426","Rochester","25","A","210565","97158","92671789","3558427","35.781","1.374","43.169927","-77.616891" 52 | -------------------------------------------------------------------------------- /spec/fixtures/cities_without_yml/cities51-100.csv: -------------------------------------------------------------------------------- 1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG" 2 | "TX","4804000","02409731","Arlington","25","A","365438","144805","248332497","9690024","95.882","3.741","32.700708","-97.124691" 3 | "CA","0603526","02409774","Bakersfield","25","A","347483","120725","368204317","3741691","142.164","1.445","35.321213","-119.018291" 4 | "LA","2255000","00545142","New Orleans","25","A","343829","189896","438803381","468240430","169.423","180.789","30.068636","-89.939007" 5 | 
"HI","1571550","02630783","Urban Honolulu CDP","57","S","337256","143173","156748036","20484151","60.521","7.909","21.325852","-157.845315" 6 | "CA","0602000","02409704","Anaheim","25","A","336265","104237","129073275","2526668","49.835","0.976","33.855497","-117.760071" 7 | "FL","1271000","02405568","Tampa","25","A","335709","157130","293727878","160127838","113.409","61.826","27.970086","-82.479673" 8 | "CO","0804000","02409757","Aurora","25","A","325078","131040","400759192","1806832","154.734","0.698","39.688002","-104.689740" 9 | "CA","0669000","02411814","Santa Ana","25","A","324528","76896","70627761","643479","27.270","0.248","33.736478","-117.882593" 10 | "MO","2965000","00767557","St. Louis","25","A","319294","176002","160343174","10683076","61.909","4.125","38.635699","-90.244582" 11 | "PA","4261000","01214818","Pittsburgh","25","A","305704","156165","143399923","7693613","55.367","2.971","40.439753","-79.976592" 12 | "TX","4817000","02410234","Corpus Christi","25","A","305215","125469","415982136","852055055","160.612","328.980","27.754252","-97.173385" 13 | "CA","0662000","02410965","Riverside","25","A","303871","98444","210152356","788400","81.140","0.304","33.938143","-117.393168" 14 | "OH","3915000","01086201","Cincinnati","25","A","296943","161095","201869928","4155439","77.942","1.604","39.139902","-84.506446" 15 | "KY","2146027","02405089","Lexington-Fayette urban county","UC","A","295803","135160","734648526","4922803","283.649","1.901","38.040157","-84.458443" 16 | "AK","0203000","02419025","Anchorage municipality","37","A","291826","113032","4415108963","663860984","1704.683","256.318","61.177549","-149.274354" 17 | "CA","0675000","02411987","Stockton","25","A","291707","99637","159723404","7984682","61.670","3.083","37.976342","-121.313304" 18 | "OH","3977000","01086537","Toledo","25","A","287208","138039","208991246","8889079","80.692","3.432","41.664071","-83.581861" 19 | "MN","2758000","02396511","St. 
Paul","25","A","285068","120795","134623737","10875208","51.979","4.199","44.948869","-93.103855" 20 | "NJ","3451000","00885317","Newark","25","A","277140","109520","62643850","4972876","24.187","1.920","40.724220","-74.172574" 21 | "NC","3728000","02403745","Greensboro","25","A","269666","124074","327673360","13690607","126.515","5.286","36.096483","-79.827108" 22 | "NY","3611000","00978764","Buffalo","25","A","261310","133444","104594197","31364094","40.384","12.110","42.892492","-78.859686" 23 | "TX","4858016","02411437","Plano","25","A","259841","103672","185394655","937663","71.581","0.362","33.050769","-96.747944" 24 | "NE","3128000","02395713","Lincoln","25","A","258379","110546","230804010","3229386","89.114","1.247","40.808957","-96.680354" 25 | "NV","3231900","02410741","Henderson","25","A","257729","113586","279023542","0","107.732","0.000","36.012233","-115.037462" 26 | "IN","1825000","02394798","Fort Wayne","25","A","253691","113541","286500436","553423","110.618","0.214","41.088173","-85.143880" 27 | "NJ","3436000","00885264","Jersey","25","A","247597","108720","38315542","16280557","14.794","6.286","40.711417","-74.064760" 28 | "FL","1263000","02405401","St. 
Petersburg","25","A","244769","129401","159909751","196473878","61.742","75.859","27.761976","-82.644055" 29 | "CA","0613392","02409461","Chula Vista","25","A","243916","79416","128544675","6380068","49.631","2.463","32.627670","-117.015170" 30 | "VA","5157000","01498557","Norfolk","25","A","242803","95018","140171293","109376999","54.120","42.231","36.923015","-76.244641" 31 | "FL","1253000","02404443","Orlando","25","A","238300","121254","265203107","21469603","102.395","8.289","28.415886","-81.298750" 32 | "AZ","0412000","02409433","Chandler","25","A","236123","94404","166828220","289715","64.413","0.112","33.282874","-111.854943" 33 | "TX","4841464","02411626","Laredo","25","A","236091","68610","230271380","3754983","88.908","1.450","27.547681","-99.486931" 34 | "WI","5548000","01583625","Madison","25","A","233209","108843","198882058","44658619","76.789","17.243","43.087806","-89.430121" 35 | "NC","3775000","02405771","Winston-Salem","25","A","229617","103974","343041264","3228612","132.449","1.247","36.103262","-80.260578" 36 | "TX","4845000","02410892","Lubbock","25","A","229573","95926","317041399","2962034","122.410","1.144","33.566479","-101.886677" 37 | "LA","2205000","02403821","Baton Rouge","25","B","229493","100801","199291656","5588234","76.947","2.158","30.448454","-91.125899" 38 | "NC","3719000","02403521","Durham","25","A","228330","103221","278087581","2357401","107.370","0.910","35.980964","-78.905647" 39 | "TX","4829000","02410572","Garland","25","A","226876","80834","147848881","340126","57.085","0.131","32.909826","-96.630357" 40 | "AZ","0427820","02410596","Glendale","25","A","226721","90505","155337275","401624","59.976","0.155","33.533111","-112.189901" 41 | "NV","3260600","02410923","Reno","25","A","225221","102582","266792840","7423507","103.009","2.866","39.474487","-119.776538" 42 | "FL","1230000","02404689","Hialeah","25","A","224669","74067","55554697","3599730","21.450","1.390","25.869941","-80.302865" 43 | 
"NV","3254600","02409023","Paradise CDP","57","S","223167","114296","120996826","0","46.717","0.000","36.080689","-115.136839" 44 | "VA","5116000","01498558","Chesapeake","25","A","222209","83196","882669156","26052854","340.800","10.059","36.679376","-76.301788" 45 | "AZ","0465000","02411845","Scottsdale","25","A","217385","124001","476350341","1231086","183.920","0.475","33.668727","-111.823682" 46 | "NV","3251800","02411273","North Las Vegas","25","A","216961","76073","262483131","112001","101.345","0.043","36.282974","-115.089262" 47 | "TX","4837000","02410117","Irving","25","A","216290","91128","173573892","2594600","67.017","1.002","32.857748","-96.970022" 48 | "CA","0626000","02410545","Fremont","25","A","214089","73989","200617968","26291598","77.459","10.151","37.494373","-121.941117" 49 | "CA","0636770","02410116","Irvine","25","A","212375","83899","171214072","900908","66.106","0.348","33.678399","-117.771254" 50 | "AL","0107000","02403868","Birmingham","25","A","212237","108981","378310927","6590665","146.067","2.545","33.527444","-86.799047" 51 | "NY","3663000","00979426","Rochester","25","A","210565","97158","92671789","3558427","35.781","1.374","43.169927","-77.616891" 52 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_dictionary/cities51-100.csv: -------------------------------------------------------------------------------- 1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG" 2 | "TX","4804000","02409731","Arlington","25","A","365438","144805","248332497","9690024","95.882","3.741","32.700708","-97.124691" 3 | "CA","0603526","02409774","Bakersfield","25","A","347483","120725","368204317","3741691","142.164","1.445","35.321213","-119.018291" 4 | "LA","2255000","00545142","New Orleans","25","A","343829","189896","438803381","468240430","169.423","180.789","30.068636","-89.939007" 5 | 
"HI","1571550","02630783","Urban Honolulu CDP","57","S","337256","143173","156748036","20484151","60.521","7.909","21.325852","-157.845315" 6 | "CA","0602000","02409704","Anaheim","25","A","336265","104237","129073275","2526668","49.835","0.976","33.855497","-117.760071" 7 | "FL","1271000","02405568","Tampa","25","A","335709","157130","293727878","160127838","113.409","61.826","27.970086","-82.479673" 8 | "CO","0804000","02409757","Aurora","25","A","325078","131040","400759192","1806832","154.734","0.698","39.688002","-104.689740" 9 | "CA","0669000","02411814","Santa Ana","25","A","324528","76896","70627761","643479","27.270","0.248","33.736478","-117.882593" 10 | "MO","2965000","00767557","St. Louis","25","A","319294","176002","160343174","10683076","61.909","4.125","38.635699","-90.244582" 11 | "PA","4261000","01214818","Pittsburgh","25","A","305704","156165","143399923","7693613","55.367","2.971","40.439753","-79.976592" 12 | "TX","4817000","02410234","Corpus Christi","25","A","305215","125469","415982136","852055055","160.612","328.980","27.754252","-97.173385" 13 | "CA","0662000","02410965","Riverside","25","A","303871","98444","210152356","788400","81.140","0.304","33.938143","-117.393168" 14 | "OH","3915000","01086201","Cincinnati","25","A","296943","161095","201869928","4155439","77.942","1.604","39.139902","-84.506446" 15 | "KY","2146027","02405089","Lexington-Fayette urban county","UC","A","295803","135160","734648526","4922803","283.649","1.901","38.040157","-84.458443" 16 | "AK","0203000","02419025","Anchorage municipality","37","A","291826","113032","4415108963","663860984","1704.683","256.318","61.177549","-149.274354" 17 | "CA","0675000","02411987","Stockton","25","A","291707","99637","159723404","7984682","61.670","3.083","37.976342","-121.313304" 18 | "OH","3977000","01086537","Toledo","25","A","287208","138039","208991246","8889079","80.692","3.432","41.664071","-83.581861" 19 | "MN","2758000","02396511","St. 
Paul","25","A","285068","120795","134623737","10875208","51.979","4.199","44.948869","-93.103855" 20 | "NJ","3451000","00885317","Newark","25","A","277140","109520","62643850","4972876","24.187","1.920","40.724220","-74.172574" 21 | "NC","3728000","02403745","Greensboro","25","A","269666","124074","327673360","13690607","126.515","5.286","36.096483","-79.827108" 22 | "NY","3611000","00978764","Buffalo","25","A","261310","133444","104594197","31364094","40.384","12.110","42.892492","-78.859686" 23 | "TX","4858016","02411437","Plano","25","A","259841","103672","185394655","937663","71.581","0.362","33.050769","-96.747944" 24 | "NE","3128000","02395713","Lincoln","25","A","258379","110546","230804010","3229386","89.114","1.247","40.808957","-96.680354" 25 | "NV","3231900","02410741","Henderson","25","A","257729","113586","279023542","0","107.732","0.000","36.012233","-115.037462" 26 | "IN","1825000","02394798","Fort Wayne","25","A","253691","113541","286500436","553423","110.618","0.214","41.088173","-85.143880" 27 | "NJ","3436000","00885264","Jersey","25","A","247597","108720","38315542","16280557","14.794","6.286","40.711417","-74.064760" 28 | "FL","1263000","02405401","St. 
Petersburg","25","A","244769","129401","159909751","196473878","61.742","75.859","27.761976","-82.644055" 29 | "CA","0613392","02409461","Chula Vista","25","A","243916","79416","128544675","6380068","49.631","2.463","32.627670","-117.015170" 30 | "VA","5157000","01498557","Norfolk","25","A","242803","95018","140171293","109376999","54.120","42.231","36.923015","-76.244641" 31 | "FL","1253000","02404443","Orlando","25","A","238300","121254","265203107","21469603","102.395","8.289","28.415886","-81.298750" 32 | "AZ","0412000","02409433","Chandler","25","A","236123","94404","166828220","289715","64.413","0.112","33.282874","-111.854943" 33 | "TX","4841464","02411626","Laredo","25","A","236091","68610","230271380","3754983","88.908","1.450","27.547681","-99.486931" 34 | "WI","5548000","01583625","Madison","25","A","233209","108843","198882058","44658619","76.789","17.243","43.087806","-89.430121" 35 | "NC","3775000","02405771","Winston-Salem","25","A","229617","103974","343041264","3228612","132.449","1.247","36.103262","-80.260578" 36 | "TX","4845000","02410892","Lubbock","25","A","229573","95926","317041399","2962034","122.410","1.144","33.566479","-101.886677" 37 | "LA","2205000","02403821","Baton Rouge","25","B","229493","100801","199291656","5588234","76.947","2.158","30.448454","-91.125899" 38 | "NC","3719000","02403521","Durham","25","A","228330","103221","278087581","2357401","107.370","0.910","35.980964","-78.905647" 39 | "TX","4829000","02410572","Garland","25","A","226876","80834","147848881","340126","57.085","0.131","32.909826","-96.630357" 40 | "AZ","0427820","02410596","Glendale","25","A","226721","90505","155337275","401624","59.976","0.155","33.533111","-112.189901" 41 | "NV","3260600","02410923","Reno","25","A","225221","102582","266792840","7423507","103.009","2.866","39.474487","-119.776538" 42 | "FL","1230000","02404689","Hialeah","25","A","224669","74067","55554697","3599730","21.450","1.390","25.869941","-80.302865" 43 | 
"NV","3254600","02409023","Paradise CDP","57","S","223167","114296","120996826","0","46.717","0.000","36.080689","-115.136839" 44 | "VA","5116000","01498558","Chesapeake","25","A","222209","83196","882669156","26052854","340.800","10.059","36.679376","-76.301788" 45 | "AZ","0465000","02411845","Scottsdale","25","A","217385","124001","476350341","1231086","183.920","0.475","33.668727","-111.823682" 46 | "NV","3251800","02411273","North Las Vegas","25","A","216961","76073","262483131","112001","101.345","0.043","36.282974","-115.089262" 47 | "TX","4837000","02410117","Irving","25","A","216290","91128","173573892","2594600","67.017","1.002","32.857748","-96.970022" 48 | "CA","0626000","02410545","Fremont","25","A","214089","73989","200617968","26291598","77.459","10.151","37.494373","-121.941117" 49 | "CA","0636770","02410116","Irvine","25","A","212375","83899","171214072","900908","66.106","0.348","33.678399","-117.771254" 50 | "AL","0107000","02403868","Birmingham","25","A","212237","108981","378310927","6590665","146.067","2.545","33.527444","-86.799047" 51 | "NY","3663000","00979426","Rochester","25","A","210565","97158","92671789","3558427","35.781","1.374","43.169927","-77.616891" 52 | -------------------------------------------------------------------------------- /spec/fixtures/cities_with_yml/cities50.csv: -------------------------------------------------------------------------------- 1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG" 2 | "NY","3651000","02395220","New York","25","A","8175133","3371062","783842402","429527437","302.643","165.841","40.664274","-73.938500" 3 | "CA","0644000","02410877","Los Angeles","25","A","3792621","1413995","1213850147","88119442","468.670","34.023","34.019394","-118.410825" 4 | "IL","1714000","00428803","Chicago","25","A","2695598","1194337","589571105","16781658","227.635","6.479","41.837551","-87.681844" 5 | 
"TX","4835000","02410796","Houston","25","A","2099451","892646","1552929379","72277296","599.589","27.906","29.780472","-95.386342" 6 | "PA","4260000","01215531","Philadelphia","25","A","1526006","670171","347321129","22289408","134.101","8.606","40.009376","-75.133346" 7 | "AZ","0455000","02411414","Phoenix","25","A","1445632","590149","1338256106","3221362","516.704","1.244","33.572162","-112.087966" 8 | "TX","4865000","02411774","San Antonio","25","A","1327407","524246","1193811736","14968532","460.933","5.779","29.472403","-98.525142" 9 | "CA","0666000","02411782","San Diego","25","A","1307402","516033","842233444","122273028","325.188","47.210","32.815300","-117.134993" 10 | "TX","4819000","02410288","Dallas","25","A","1197816","516639","881939036","117394747","340.519","45.326","32.794176","-96.765503" 11 | "CA","0668000","02411790","San Jose","25","A","945942","314038","457201438","8907829","176.526","3.439","37.296867","-121.819306" 12 | "FL","1235000","02404783","Jacksonville","25","A","821784","366273","1934729255","330550113","747.003","127.626","30.337019","-81.661302" 13 | "IN","1836003","02395424","Indianapolis (balance)","00","F","820445","379856","936107645","17072644","361.433","6.592","39.776664","-86.145935" 14 | "CA","0667000","02411786","San Francisco","25","A","805235","376942","121399963","479190317","46.873","185.016","37.727239","-123.032229" 15 | "TX","4805000","02409761","Austin","25","A","790390","354241","771546901","18560605","297.896","7.166","30.307182","-97.755996" 16 | "OH","3918000","01086101","Columbus","25","A","787033","370965","562466164","15373425","217.169","5.936","39.984799","-82.985044" 17 | "TX","4827000","02410531","Fort Worth","25","A","741206","291086","880127783","20832365","339.819","8.043","32.779542","-97.346335" 18 | "NC","3712000","02404032","Charlotte","25","A","731424","319918","770983559","5148533","297.678","1.988","35.208707","-80.830739" 19 | 
"MI","2622000","01626181","Detroit","25","A","713777","349170","359361795","10667148","138.750","4.119","42.383037","-83.102237" 20 | "TX","4824000","02410414","El Paso","25","A","649121","227605","661056631","2642037","255.235","1.020","31.848360","-106.426979" 21 | "TN","4748000","02405068","Memphis","25","A","646889","291883","815988030","23176952","315.055","8.949","35.103543","-89.978498" 22 | "MD","2404000","01702381","Baltimore","25","A","620961","296685","209643241","28768302","80.944","11.108","39.300213","-76.610516" 23 | "MA","2507000","00619463","Boston","25","A","617594","272481","125037462","107130299","48.277","41.363","42.331960","-71.020173" 24 | "WA","5363000","02411856","Seattle","25","A","608660","308516","217410345","152055857","83.943","58.709","47.620499","-122.350876" 25 | "DC","1150000","02390665","Washington","25","N","601723","296719","158114680","18884970","61.048","7.292","38.904149","-77.017094" 26 | "TN","4752006","02405092","Nashville-Davidson metropolitan government (balance)","00","F","601222","272622","1230569857","56261385","475.126","21.723","36.171800","-86.785002" 27 | "CO","0820000","02410324","Denver","25","A","600158","285797","396268588","4225359","153.000","1.631","39.761849","-104.880625" 28 | "KY","2148006","01967434","Louisville/Jefferson County metro government (balance)","00","F","597337","270928","842389102","43938966","325.248","16.965","38.178077","-85.666708" 29 | "WI","5553000","01583724","Milwaukee","25","A","594833","255569","248955419","1755108","96.122","0.678","43.063348","-87.966695" 30 | "OR","4159000","02411471","Portland","25","A","583776","265439","345573252","30196598","133.427","11.659","45.536951","-122.649971" 31 | "NV","3240000","02411630","Las Vegas","25","A","583756","243701","351758506","133986","135.815","0.052","36.227712","-115.264045" 32 | "OK","4055000","02411311","Oklahoma","25","A","579999","256930","1570595903","37324163","606.410","14.411","35.467079","-97.513657" 33 | 
"NM","3502000","02409678","Albuquerque","25","A","545852","239166","486218256","4702696","187.730","1.816","35.105552","-106.647388" 34 | "AZ","0477000","02412104","Tucson","25","A","520116","229762","587173307","842214","226.709","0.325","32.154289","-110.871062" 35 | "CA","0627000","02410546","Fresno","25","A","494665","171288","289966538","910373","111.957","0.351","36.782674","-119.794492" 36 | "CA","0664000","02411751","Sacramento","25","A","466488","190911","253599699","5673097","97.915","2.190","38.566592","-121.468632" 37 | "CA","0643000","02410866","Long Beach","25","A","462257","176032","130259313","2963688","50.293","1.144","33.809102","-118.155327" 38 | "MO","2938000","02395492","Kansas","25","A","459787","221860","815717156","10575319","314.950","4.083","39.125212","-94.551136" 39 | "AZ","0446000","02411087","Mesa","25","A","439041","201173","353409966","1580758","136.452","0.610","33.401926","-111.717379" 40 | "VA","5182000","01498559","Virginia Beach","25","A","437994","177879","644948896","643150238","249.016","248.322","36.779322","-76.024020" 41 | "GA","1304000","02403126","Atlanta","25","A","420003","224573","344861307","2219186","133.152","0.857","33.762909","-84.422675" 42 | "CO","0816000","02410198","Colorado Springs","25","A","416427","179607","503857525","947180","194.540","0.366","38.867255","-104.760749" 43 | "NE","3137000","02396064","Omaha","25","A","408958","177518","329157346","9036153","127.088","3.489","41.264675","-96.041927" 44 | "NC","3755000","02404590","Raleigh","25","A","403892","176124","370117231","2846320","142.903","1.099","35.830204","-78.641439" 45 | "FL","1245000","02404247","Miami","25","A","399457","183994","92905577","52298641","35.871","20.193","25.775163","-80.208615" 46 | "OH","3916000","01085963","Cleveland","25","A","396815","207536","201234202","12351419","77.697","4.769","41.478138","-81.679486" 47 | 
"OK","4075000","02412110","Tulsa","25","A","391906","185127","509590364","10967865","196.754","4.235","36.127949","-95.902316" 48 | "CA","0653000","02411292","Oakland","25","A","390724","169710","144484543","57539591","55.786","22.216","37.769857","-122.225640" 49 | "MN","2743000","02395345","Minneapolis","25","A","382578","178287","139789184","9052448","53.973","3.495","44.963323","-93.268284" 50 | "KS","2079000","00485662","Wichita","25","A","382368","167310","412571486","11136472","159.295","4.300","37.690694","-97.342678" 51 | "PR","7276770","02414943","San Juan zona urbana","62","S","381931","194316","102305007","17914425","39.500","6.917","18.406409","-66.064004" 52 | -------------------------------------------------------------------------------- /spec/fixtures/cities_without_yml/cities50.csv: -------------------------------------------------------------------------------- 1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG" 2 | "NY","3651000","02395220","New York","25","A","8175133","3371062","783842402","429527437","302.643","165.841","40.664274","-73.938500" 3 | "CA","0644000","02410877","Los Angeles","25","A","3792621","1413995","1213850147","88119442","468.670","34.023","34.019394","-118.410825" 4 | "IL","1714000","00428803","Chicago","25","A","2695598","1194337","589571105","16781658","227.635","6.479","41.837551","-87.681844" 5 | "TX","4835000","02410796","Houston","25","A","2099451","892646","1552929379","72277296","599.589","27.906","29.780472","-95.386342" 6 | "PA","4260000","01215531","Philadelphia","25","A","1526006","670171","347321129","22289408","134.101","8.606","40.009376","-75.133346" 7 | "AZ","0455000","02411414","Phoenix","25","A","1445632","590149","1338256106","3221362","516.704","1.244","33.572162","-112.087966" 8 | "TX","4865000","02411774","San Antonio","25","A","1327407","524246","1193811736","14968532","460.933","5.779","29.472403","-98.525142" 9 | 
"CA","0666000","02411782","San Diego","25","A","1307402","516033","842233444","122273028","325.188","47.210","32.815300","-117.134993" 10 | "TX","4819000","02410288","Dallas","25","A","1197816","516639","881939036","117394747","340.519","45.326","32.794176","-96.765503" 11 | "CA","0668000","02411790","San Jose","25","A","945942","314038","457201438","8907829","176.526","3.439","37.296867","-121.819306" 12 | "FL","1235000","02404783","Jacksonville","25","A","821784","366273","1934729255","330550113","747.003","127.626","30.337019","-81.661302" 13 | "IN","1836003","02395424","Indianapolis (balance)","00","F","820445","379856","936107645","17072644","361.433","6.592","39.776664","-86.145935" 14 | "CA","0667000","02411786","San Francisco","25","A","805235","376942","121399963","479190317","46.873","185.016","37.727239","-123.032229" 15 | "TX","4805000","02409761","Austin","25","A","790390","354241","771546901","18560605","297.896","7.166","30.307182","-97.755996" 16 | "OH","3918000","01086101","Columbus","25","A","787033","370965","562466164","15373425","217.169","5.936","39.984799","-82.985044" 17 | "TX","4827000","02410531","Fort Worth","25","A","741206","291086","880127783","20832365","339.819","8.043","32.779542","-97.346335" 18 | "NC","3712000","02404032","Charlotte","25","A","731424","319918","770983559","5148533","297.678","1.988","35.208707","-80.830739" 19 | "MI","2622000","01626181","Detroit","25","A","713777","349170","359361795","10667148","138.750","4.119","42.383037","-83.102237" 20 | "TX","4824000","02410414","El Paso","25","A","649121","227605","661056631","2642037","255.235","1.020","31.848360","-106.426979" 21 | "TN","4748000","02405068","Memphis","25","A","646889","291883","815988030","23176952","315.055","8.949","35.103543","-89.978498" 22 | "MD","2404000","01702381","Baltimore","25","A","620961","296685","209643241","28768302","80.944","11.108","39.300213","-76.610516" 23 | 
"MA","2507000","00619463","Boston","25","A","617594","272481","125037462","107130299","48.277","41.363","42.331960","-71.020173" 24 | "WA","5363000","02411856","Seattle","25","A","608660","308516","217410345","152055857","83.943","58.709","47.620499","-122.350876" 25 | "DC","1150000","02390665","Washington","25","N","601723","296719","158114680","18884970","61.048","7.292","38.904149","-77.017094" 26 | "TN","4752006","02405092","Nashville-Davidson metropolitan government (balance)","00","F","601222","272622","1230569857","56261385","475.126","21.723","36.171800","-86.785002" 27 | "CO","0820000","02410324","Denver","25","A","600158","285797","396268588","4225359","153.000","1.631","39.761849","-104.880625" 28 | "KY","2148006","01967434","Louisville/Jefferson County metro government (balance)","00","F","597337","270928","842389102","43938966","325.248","16.965","38.178077","-85.666708" 29 | "WI","5553000","01583724","Milwaukee","25","A","594833","255569","248955419","1755108","96.122","0.678","43.063348","-87.966695" 30 | "OR","4159000","02411471","Portland","25","A","583776","265439","345573252","30196598","133.427","11.659","45.536951","-122.649971" 31 | "NV","3240000","02411630","Las Vegas","25","A","583756","243701","351758506","133986","135.815","0.052","36.227712","-115.264045" 32 | "OK","4055000","02411311","Oklahoma","25","A","579999","256930","1570595903","37324163","606.410","14.411","35.467079","-97.513657" 33 | "NM","3502000","02409678","Albuquerque","25","A","545852","239166","486218256","4702696","187.730","1.816","35.105552","-106.647388" 34 | "AZ","0477000","02412104","Tucson","25","A","520116","229762","587173307","842214","226.709","0.325","32.154289","-110.871062" 35 | "CA","0627000","02410546","Fresno","25","A","494665","171288","289966538","910373","111.957","0.351","36.782674","-119.794492" 36 | "CA","0664000","02411751","Sacramento","25","A","466488","190911","253599699","5673097","97.915","2.190","38.566592","-121.468632" 37 | 
"CA","0643000","02410866","Long Beach","25","A","462257","176032","130259313","2963688","50.293","1.144","33.809102","-118.155327" 38 | "MO","2938000","02395492","Kansas","25","A","459787","221860","815717156","10575319","314.950","4.083","39.125212","-94.551136" 39 | "AZ","0446000","02411087","Mesa","25","A","439041","201173","353409966","1580758","136.452","0.610","33.401926","-111.717379" 40 | "VA","5182000","01498559","Virginia Beach","25","A","437994","177879","644948896","643150238","249.016","248.322","36.779322","-76.024020" 41 | "GA","1304000","02403126","Atlanta","25","A","420003","224573","344861307","2219186","133.152","0.857","33.762909","-84.422675" 42 | "CO","0816000","02410198","Colorado Springs","25","A","416427","179607","503857525","947180","194.540","0.366","38.867255","-104.760749" 43 | "NE","3137000","02396064","Omaha","25","A","408958","177518","329157346","9036153","127.088","3.489","41.264675","-96.041927" 44 | "NC","3755000","02404590","Raleigh","25","A","403892","176124","370117231","2846320","142.903","1.099","35.830204","-78.641439" 45 | "FL","1245000","02404247","Miami","25","A","399457","183994","92905577","52298641","35.871","20.193","25.775163","-80.208615" 46 | "OH","3916000","01085963","Cleveland","25","A","396815","207536","201234202","12351419","77.697","4.769","41.478138","-81.679486" 47 | "OK","4075000","02412110","Tulsa","25","A","391906","185127","509590364","10967865","196.754","4.235","36.127949","-95.902316" 48 | "CA","0653000","02411292","Oakland","25","A","390724","169710","144484543","57539591","55.786","22.216","37.769857","-122.225640" 49 | "MN","2743000","02395345","Minneapolis","25","A","382578","178287","139789184","9052448","53.973","3.495","44.963323","-93.268284" 50 | "KS","2079000","00485662","Wichita","25","A","382368","167310","412571486","11136472","159.295","4.300","37.690694","-97.342678" 51 | "PR","7276770","02414943","San Juan zona 
urbana","62","S","381931","194316","102305007","17914425","39.500","6.917","18.406409","-66.064004" 52 | -------------------------------------------------------------------------------- /spec/fixtures/import_with_dictionary/cities50.csv: -------------------------------------------------------------------------------- 1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG" 2 | "NY","3651000","02395220","New York","25","A","8175133","3371062","783842402","429527437","302.643","165.841","40.664274","-73.938500" 3 | "CA","0644000","02410877","Los Angeles","25","A","3792621","1413995","1213850147","88119442","468.670","34.023","34.019394","-118.410825" 4 | "IL","1714000","00428803","Chicago","25","A","2695598","1194337","589571105","16781658","227.635","6.479","41.837551","-87.681844" 5 | "TX","4835000","02410796","Houston","25","A","2099451","892646","1552929379","72277296","599.589","27.906","29.780472","-95.386342" 6 | "PA","4260000","01215531","Philadelphia","25","A","1526006","670171","347321129","22289408","134.101","8.606","40.009376","-75.133346" 7 | "AZ","0455000","02411414","Phoenix","25","A","1445632","590149","1338256106","3221362","516.704","1.244","33.572162","-112.087966" 8 | "TX","4865000","02411774","San Antonio","25","A","1327407","524246","1193811736","14968532","460.933","5.779","29.472403","-98.525142" 9 | "CA","0666000","02411782","San Diego","25","A","1307402","516033","842233444","122273028","325.188","47.210","32.815300","-117.134993" 10 | "TX","4819000","02410288","Dallas","25","A","1197816","516639","881939036","117394747","340.519","45.326","32.794176","-96.765503" 11 | "CA","0668000","02411790","San Jose","25","A","945942","314038","457201438","8907829","176.526","3.439","37.296867","-121.819306" 12 | "FL","1235000","02404783","Jacksonville","25","A","821784","366273","1934729255","330550113","747.003","127.626","30.337019","-81.661302" 13 | 
"IN","1836003","02395424","Indianapolis (balance)","00","F","820445","379856","936107645","17072644","361.433","6.592","39.776664","-86.145935" 14 | "CA","0667000","02411786","San Francisco","25","A","805235","376942","121399963","479190317","46.873","185.016","37.727239","-123.032229" 15 | "TX","4805000","02409761","Austin","25","A","790390","354241","771546901","18560605","297.896","7.166","30.307182","-97.755996" 16 | "OH","3918000","01086101","Columbus","25","A","787033","370965","562466164","15373425","217.169","5.936","39.984799","-82.985044" 17 | "TX","4827000","02410531","Fort Worth","25","A","741206","291086","880127783","20832365","339.819","8.043","32.779542","-97.346335" 18 | "NC","3712000","02404032","Charlotte","25","A","731424","319918","770983559","5148533","297.678","1.988","35.208707","-80.830739" 19 | "MI","2622000","01626181","Detroit","25","A","713777","349170","359361795","10667148","138.750","4.119","42.383037","-83.102237" 20 | "TX","4824000","02410414","El Paso","25","A","649121","227605","661056631","2642037","255.235","1.020","31.848360","-106.426979" 21 | "TN","4748000","02405068","Memphis","25","A","646889","291883","815988030","23176952","315.055","8.949","35.103543","-89.978498" 22 | "MD","2404000","01702381","Baltimore","25","A","620961","296685","209643241","28768302","80.944","11.108","39.300213","-76.610516" 23 | "MA","2507000","00619463","Boston","25","A","617594","272481","125037462","107130299","48.277","41.363","42.331960","-71.020173" 24 | "WA","5363000","02411856","Seattle","25","A","608660","308516","217410345","152055857","83.943","58.709","47.620499","-122.350876" 25 | "DC","1150000","02390665","Washington","25","N","601723","296719","158114680","18884970","61.048","7.292","38.904149","-77.017094" 26 | "TN","4752006","02405092","Nashville-Davidson metropolitan government (balance)","00","F","601222","272622","1230569857","56261385","475.126","21.723","36.171800","-86.785002" 27 | 
"CO","0820000","02410324","Denver","25","A","600158","285797","396268588","4225359","153.000","1.631","39.761849","-104.880625" 28 | "KY","2148006","01967434","Louisville/Jefferson County metro government (balance)","00","F","597337","270928","842389102","43938966","325.248","16.965","38.178077","-85.666708" 29 | "WI","5553000","01583724","Milwaukee","25","A","594833","255569","248955419","1755108","96.122","0.678","43.063348","-87.966695" 30 | "OR","4159000","02411471","Portland","25","A","583776","265439","345573252","30196598","133.427","11.659","45.536951","-122.649971" 31 | "NV","3240000","02411630","Las Vegas","25","A","583756","243701","351758506","133986","135.815","0.052","36.227712","-115.264045" 32 | "OK","4055000","02411311","Oklahoma","25","A","579999","256930","1570595903","37324163","606.410","14.411","35.467079","-97.513657" 33 | "NM","3502000","02409678","Albuquerque","25","A","545852","239166","486218256","4702696","187.730","1.816","35.105552","-106.647388" 34 | "AZ","0477000","02412104","Tucson","25","A","520116","229762","587173307","842214","226.709","0.325","32.154289","-110.871062" 35 | "CA","0627000","02410546","Fresno","25","A","494665","171288","289966538","910373","111.957","0.351","36.782674","-119.794492" 36 | "CA","0664000","02411751","Sacramento","25","A","466488","190911","253599699","5673097","97.915","2.190","38.566592","-121.468632" 37 | "CA","0643000","02410866","Long Beach","25","A","462257","176032","130259313","2963688","50.293","1.144","33.809102","-118.155327" 38 | "MO","2938000","02395492","Kansas","25","A","459787","221860","815717156","10575319","314.950","4.083","39.125212","-94.551136" 39 | "AZ","0446000","02411087","Mesa","25","A","439041","201173","353409966","1580758","136.452","0.610","33.401926","-111.717379" 40 | "VA","5182000","01498559","Virginia Beach","25","A","437994","177879","644948896","643150238","249.016","248.322","36.779322","-76.024020" 41 | 
"GA","1304000","02403126","Atlanta","25","A","420003","224573","344861307","2219186","133.152","0.857","33.762909","-84.422675" 42 | "CO","0816000","02410198","Colorado Springs","25","A","416427","179607","503857525","947180","194.540","0.366","38.867255","-104.760749" 43 | "NE","3137000","02396064","Omaha","25","A","408958","177518","329157346","9036153","127.088","3.489","41.264675","-96.041927" 44 | "NC","3755000","02404590","Raleigh","25","A","403892","176124","370117231","2846320","142.903","1.099","35.830204","-78.641439" 45 | "FL","1245000","02404247","Miami","25","A","399457","183994","92905577","52298641","35.871","20.193","25.775163","-80.208615" 46 | "OH","3916000","01085963","Cleveland","25","A","396815","207536","201234202","12351419","77.697","4.769","41.478138","-81.679486" 47 | "OK","4075000","02412110","Tulsa","25","A","391906","185127","509590364","10967865","196.754","4.235","36.127949","-95.902316" 48 | "CA","0653000","02411292","Oakland","25","A","390724","169710","144484543","57539591","55.786","22.216","37.769857","-122.225640" 49 | "MN","2743000","02395345","Minneapolis","25","A","382578","178287","139789184","9052448","53.973","3.495","44.963323","-93.268284" 50 | "KS","2079000","00485662","Wichita","25","A","382368","167310","412571486","11136472","159.295","4.300","37.690694","-97.342678" 51 | "PR","7276770","02414943","San Juan zona urbana","62","S","381931","194316","102305007","17914425","39.500","6.917","18.406409","-66.064004" 52 | -------------------------------------------------------------------------------- /lib/data_magic/index/document_builder.rb: -------------------------------------------------------------------------------- 1 | require './lib/expression/expression' 2 | 3 | module DataMagic 4 | module Index 5 | module DocumentBuilder 6 | class << self 7 | def logger 8 | DataMagic::Config.logger 9 | end 10 | 11 | # build a nested json document from a csv row 12 | # row: a hash { column_name => value } 13 | # where all 
column_names and values are strings 14 | # fields: column_name => field_name 15 | # config: DataMagic.Config instance for dictionary, column types, NULL 16 | def build(row, builder_data, config) 17 | fields = builder_data.new_field_names 18 | options = builder_data.options 19 | additional = builder_data.additional_data 20 | csv_row = map_column_types(row.to_hash, config) 21 | if fields.empty? 22 | field_values = csv_row 23 | else 24 | field_values = map_field_names(csv_row, fields, options) 25 | end 26 | field_values.merge!(calculated_fields(csv_row, config)) 27 | field_values.merge!(lowercase_columns(field_values, config.column_field_types)) 28 | field_values.merge!(additional) if additional 29 | doc = NestedHash.new.add(field_values) 30 | doc = parse_nested(doc, options) if options[:nest] 31 | doc = select_only_fields(doc, options[:only]) unless options[:only].nil? 32 | doc 33 | end 34 | 35 | def create(*args) 36 | Document.new( 37 | build(*args) 38 | ) 39 | end 40 | 41 | private 42 | 43 | def calculated_fields(row, config) 44 | result = {} 45 | config.calculated_field_list.each do |name| 46 | result[name] = calculate(name, row, config.dictionary) 47 | end 48 | result 49 | end 50 | 51 | # row: a hash (keys may be strings or symbols) 52 | # valid_types: an array of allowed types 53 | # field_types: hash field_name : type (float, integer, string) 54 | # returns a hash where values have been coerced to the new type 55 | # TODO: move type validation to config load time instead 56 | def map_column_types(row, config) 57 | valid_types = config.valid_types 58 | null_value = config.null_value || null_value = 'NULL' 59 | 60 | mapped = {} 61 | row.each do |key, value| 62 | if value == null_value 63 | mapped[key] = nil 64 | else 65 | type = config.csv_column_type(key) 66 | if valid_types.include? 
type 67 | mapped[key] = fix_field_type(type, value, key) 68 | else 69 | fail InvalidDictionary, "unexpected type '#{type.inspect}' for field '#{key}'" 70 | end 71 | end 72 | end 73 | mapped 74 | end 75 | 76 | def lowercase_columns(row, field_types = {}) 77 | new_columns = {} 78 | row.each do |key, value| 79 | type = field_types[key.to_sym] || field_types[key.to_s] 80 | new_columns["_#{key}"] = value.downcase if type == "name" || type == "autocomplete" 81 | end 82 | new_columns 83 | end 84 | 85 | def parse_nested(document, options) 86 | new_doc = {} 87 | nest_options = options[:nest] 88 | if nest_options 89 | key = nest_options['key'] 90 | new_doc[key] = {} 91 | new_doc['id'] = document['id'] unless document['id'].nil? 92 | nest_options['contents'].each do |item_key| 93 | new_doc[key][item_key] = document[item_key] 94 | end 95 | end 96 | new_doc 97 | end 98 | 99 | def fix_field_type(type, value, key=nil) 100 | return value if value.nil? 101 | 102 | new_value = case type 103 | when "float" 104 | value.to_f 105 | when "integer" 106 | value.to_i 107 | when "lowercase_name" 108 | value.to_s.downcase 109 | when "boolean" 110 | parse_boolean(value) 111 | else # "string" 112 | value.to_s 113 | end 114 | new_value = value.to_f if key and key.to_s.include? "location" 115 | new_value 116 | end 117 | 118 | def parse_boolean(value) 119 | case value 120 | when "true" 121 | true 122 | when "false" 123 | false 124 | when 0 125 | false 126 | else 127 | !!value 128 | end 129 | end 130 | 131 | # currently we just support 'or' operations on two columns 132 | def calculate(field_name, row, dictionary) 133 | item = dictionary[field_name.to_s] || dictionary[field_name.to_sym] 134 | type = item['type'] || item[:type] 135 | fail "calculate: field not found in dictionary #{field_name.inspect}" if item.nil? 136 | expr = item['calculate'] || item[:calculate] 137 | fail ArgumentError, "expected to calculate #{field_name}" if expr.nil? 
138 | e = Expression.find_or_create(expr) 139 | vars = {} 140 | e.variables.each do |name| 141 | vars[name] = fix_field_type(type, row[name.to_sym]) 142 | end 143 | fix_field_type(type, e.evaluate(vars)) 144 | end 145 | 146 | # row: a hash (keys may be strings or symbols) 147 | # new_fields: hash current_name : new_name 148 | # returns a hash (which may be a subset of row) where keys are new_name 149 | # with value of corresponding row[current_name] 150 | def map_field_names(row, new_fields, options = {}) 151 | mapped = {} 152 | row.each do |key, value| 153 | fail ArgumentError, "column header missing for: #{value}" if key.nil? 154 | new_key = new_fields[key.to_sym] || new_fields[key.to_s] 155 | if new_key 156 | value = value.to_f if new_key.include? "location" 157 | mapped[new_key] = value 158 | elsif options[:columns] == 'all' 159 | mapped[key] = value 160 | end 161 | end 162 | mapped 163 | end 164 | 165 | # select top-level fields from a hash 166 | # if there are name types, also select _name 167 | # doc: hash with string keys 168 | # only_keys: array of keys 169 | def select_only_fields(doc, only_keys) 170 | doc = doc.select do |key, value| 171 | key = key.to_s 172 | # if key has _ prefix, select if key present without _ 173 | key = key[1..-1] if key[0] == '_' 174 | only_keys.include?(key) 175 | end 176 | end 177 | 178 | end # class methods 179 | end # module QueryBuilder 180 | end 181 | end # module DataMagic 182 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | actionview (4.2.3) 5 | activesupport (= 4.2.3) 6 | builder (~> 3.1) 7 | erubis (~> 2.7.0) 8 | rails-dom-testing (~> 1.0, >= 1.0.5) 9 | rails-html-sanitizer (~> 1.0, >= 1.0.2) 10 | activesupport (4.2.3) 11 | i18n (~> 0.7) 12 | json (~> 1.7, >= 1.7.7) 13 | minitest (~> 5.1) 14 | thread_safe (~> 0.3, >= 0.3.4) 15 | tzinfo 
(~> 1.1) 16 | addressable (2.3.8) 17 | autoparse (0.3.3) 18 | addressable (>= 2.3.1) 19 | extlib (>= 0.9.15) 20 | multi_json (>= 1.0.0) 21 | aws-sdk (2.1.11) 22 | aws-sdk-resources (= 2.1.11) 23 | aws-sdk-core (2.1.11) 24 | jmespath (~> 1.0) 25 | aws-sdk-resources (2.1.11) 26 | aws-sdk-core (= 2.1.11) 27 | axiom-types (0.1.1) 28 | descendants_tracker (~> 0.0.4) 29 | ice_nine (~> 0.11.0) 30 | thread_safe (~> 0.3, >= 0.3.1) 31 | blankslate (2.1.2.4) 32 | builder (3.2.2) 33 | byebug (5.0.0) 34 | columnize (= 0.9.0) 35 | cf-app-utils (0.4) 36 | coderay (1.1.0) 37 | coercible (1.0.0) 38 | descendants_tracker (~> 0.0.1) 39 | columnize (0.9.0) 40 | descendants_tracker (0.0.4) 41 | thread_safe (~> 0.3, >= 0.3.1) 42 | diff-lcs (1.2.5) 43 | dotenv (2.0.2) 44 | elasticsearch (1.0.12) 45 | elasticsearch-api (= 1.0.12) 46 | elasticsearch-transport (= 1.0.12) 47 | elasticsearch-api (1.0.12) 48 | multi_json 49 | elasticsearch-transport (1.0.12) 50 | faraday 51 | multi_json 52 | equalizer (0.0.11) 53 | erubis (2.7.0) 54 | excon (0.45.4) 55 | extlib (0.9.16) 56 | faraday (0.9.1) 57 | multipart-post (>= 1.2, < 3) 58 | google-api-client (0.8.2) 59 | activesupport (>= 3.2) 60 | addressable (~> 2.3) 61 | autoparse (~> 0.3) 62 | extlib (~> 0.9) 63 | faraday (~> 0.9) 64 | launchy (~> 2.4) 65 | multi_json (~> 1.10) 66 | retriable (~> 1.4) 67 | signet (~> 0.6) 68 | google_drive (1.0.1) 69 | google-api-client (>= 0.7.0) 70 | nokogiri (>= 1.4.4, != 1.5.2, != 1.5.1) 71 | oauth (>= 0.3.6) 72 | oauth2 (>= 0.5.0) 73 | hashie (3.4.2) 74 | http_router (0.11.2) 75 | rack (>= 1.0.0) 76 | url_mount (~> 0.2.1) 77 | i18n (0.7.0) 78 | ice_nine (0.11.1) 79 | jmespath (1.0.2) 80 | multi_json (~> 1.0) 81 | json (1.8.3) 82 | jwt (1.5.1) 83 | launchy (2.4.3) 84 | addressable (~> 2.3) 85 | liquid (3.0.3) 86 | liquify (0.2.7) 87 | liquid (>= 2.2.2) 88 | loofah (2.0.3) 89 | nokogiri (>= 1.5.9) 90 | mail (2.5.4) 91 | mime-types (~> 1.16) 92 | treetop (~> 1.4.8) 93 | method_source (0.8.2) 94 | mime-types (1.25.1) 
95 | mini_portile2 (2.1.0) 96 | minitest (5.8.0) 97 | moneta (0.7.20) 98 | multi_json (1.11.2) 99 | multi_xml (0.5.5) 100 | multipart-post (2.0.0) 101 | newrelic_rpm (3.14.2.312) 102 | nokogiri (1.6.8) 103 | mini_portile2 (~> 2.1.0) 104 | pkg-config (~> 1.1.7) 105 | oauth (0.4.7) 106 | oauth2 (1.0.0) 107 | faraday (>= 0.8, < 0.10) 108 | jwt (~> 1.0) 109 | multi_json (~> 1.3) 110 | multi_xml (~> 0.5) 111 | rack (~> 1.2) 112 | oj (2.12.13) 113 | padrino (0.12.5) 114 | padrino-admin (= 0.12.5) 115 | padrino-cache (= 0.12.5) 116 | padrino-core (= 0.12.5) 117 | padrino-gen (= 0.12.5) 118 | padrino-helpers (= 0.12.5) 119 | padrino-mailer (= 0.12.5) 120 | padrino-support (= 0.12.5) 121 | padrino-admin (0.12.5) 122 | padrino-core (= 0.12.5) 123 | padrino-helpers (= 0.12.5) 124 | padrino-cache (0.12.5) 125 | moneta (~> 0.7.0) 126 | padrino-core (= 0.12.5) 127 | padrino-helpers (= 0.12.5) 128 | padrino-core (0.12.5) 129 | activesupport (>= 3.1) 130 | http_router (~> 0.11.0) 131 | padrino-support (= 0.12.5) 132 | rack (< 1.6.0) 133 | rack-protection (>= 1.5.0) 134 | sinatra (~> 1.4.2) 135 | thor (~> 0.18) 136 | padrino-gen (0.12.5) 137 | bundler (~> 1.0) 138 | padrino-core (= 0.12.5) 139 | padrino-helpers (0.12.5) 140 | i18n (~> 0.6, >= 0.6.7) 141 | padrino-support (= 0.12.5) 142 | tilt (~> 1.4.1) 143 | padrino-mailer (0.12.5) 144 | mail (~> 2.5.3) 145 | padrino-core (= 0.12.5) 146 | padrino-support (0.12.5) 147 | activesupport (>= 3.1) 148 | parallel (1.6.1) 149 | parslet (1.7.1) 150 | blankslate (>= 2.0, <= 4.0) 151 | pkg-config (1.1.7) 152 | polyglot (0.3.5) 153 | pry (0.10.1) 154 | coderay (~> 1.1.0) 155 | method_source (~> 0.8.1) 156 | slop (~> 3.4) 157 | pry-byebug (3.2.0) 158 | byebug (~> 5.0) 159 | pry (~> 0.10) 160 | puma (2.15.3) 161 | rack (1.5.5) 162 | rack-protection (1.5.3) 163 | rack 164 | rack-test (0.6.3) 165 | rack (>= 1.0) 166 | rails-deprecated_sanitizer (1.0.3) 167 | activesupport (>= 4.2.0.alpha) 168 | rails-dom-testing (1.0.6) 169 | activesupport (>= 
4.2.0.beta, < 5.0) 170 | nokogiri (~> 1.6.0) 171 | rails-deprecated_sanitizer (>= 1.0.1) 172 | rails-html-sanitizer (1.0.3) 173 | loofah (~> 2.0) 174 | rake (10.4.2) 175 | retriable (1.4.1) 176 | rspec (3.3.0) 177 | rspec-core (~> 3.3.0) 178 | rspec-expectations (~> 3.3.0) 179 | rspec-mocks (~> 3.3.0) 180 | rspec-core (3.3.2) 181 | rspec-support (~> 3.3.0) 182 | rspec-expectations (3.3.1) 183 | diff-lcs (>= 1.2.0, < 2.0) 184 | rspec-support (~> 3.3.0) 185 | rspec-mocks (3.3.2) 186 | diff-lcs (>= 1.2.0, < 2.0) 187 | rspec-support (~> 3.3.0) 188 | rspec-support (3.3.0) 189 | ruby-prof (0.15.9) 190 | safe_yaml (1.0.4) 191 | sass (3.4.16) 192 | signet (0.6.1) 193 | addressable (~> 2.3) 194 | extlib (~> 0.9) 195 | faraday (~> 0.9) 196 | jwt (~> 1.5) 197 | multi_json (~> 1.10) 198 | sinatra (1.4.6) 199 | rack (~> 1.4) 200 | rack-protection (~> 1.4) 201 | tilt (>= 1.3, < 3) 202 | slop (3.6.0) 203 | stretchy (0.4.7) 204 | elasticsearch (~> 1.0) 205 | excon (~> 0.45) 206 | valid (~> 0.5) 207 | virtus (~> 1.0) 208 | thor (0.19.1) 209 | thread_safe (0.3.5) 210 | tilt (1.4.1) 211 | treetop (1.4.15) 212 | polyglot 213 | polyglot (>= 0.3.1) 214 | tzinfo (1.2.2) 215 | thread_safe (~> 0.1) 216 | url_mount (0.2.1) 217 | rack 218 | valid (0.5.0) 219 | virtus (1.0.5) 220 | axiom-types (~> 0.1) 221 | coercible (~> 1.0) 222 | descendants_tracker (~> 0.0, >= 0.0.3) 223 | equalizer (~> 0.0, >= 0.0.9) 224 | 225 | PLATFORMS 226 | ruby 227 | 228 | DEPENDENCIES 229 | actionview 230 | aws-sdk (~> 2) 231 | cf-app-utils 232 | dotenv 233 | elasticsearch 234 | erubis 235 | google_drive 236 | hashie 237 | liquid (= 3.0.3) 238 | liquify 239 | newrelic_rpm 240 | oj 241 | padrino (= 0.12.5) 242 | parallel 243 | parslet 244 | pry 245 | pry-byebug 246 | puma 247 | rack-test 248 | rake 249 | rspec 250 | rspec-mocks 251 | ruby-prof 252 | safe_yaml 253 | sass 254 | stretchy 255 | 256 | BUNDLED WITH 257 | 1.11.2 258 | --------------------------------------------------------------------------------