├── log
│   └── .gitkeep
├── .ruby-version
├── .ruby-gemset
├── Procfile
├── public
│   ├── javascripts
│   │   ├── application.js
│   │   └── jquery-ujs.js
│   └── favicon.ico
├── config
│   ├── unicorn.rb
│   ├── puma.rb
│   ├── env.rb
│   ├── boot.rb
│   └── apps.rb
├── spec
│   ├── fixtures
│   │   ├── geo_no_files
│   │   │   └── data.yaml
│   │   ├── minimal
│   │   │   └── data.yaml
│   │   ├── cities_with_yml
│   │   │   ├── more.csv
│   │   │   ├── data.yml
│   │   │   ├── cities51-100.csv
│   │   │   └── cities50.csv
│   │   ├── import_with_options
│   │   │   ├── more_cities.csv
│   │   │   ├── cities4.csv
│   │   │   └── data.yaml
│   │   ├── cities_without_yml
│   │   │   ├── more.csv
│   │   │   ├── cities51-100.csv
│   │   │   └── cities50.csv
│   │   ├── import_with_dictionary
│   │   │   ├── more.csv
│   │   │   ├── data.yaml
│   │   │   ├── cities51-100.csv
│   │   │   └── cities50.csv
│   │   ├── invalid_utf8.csv
│   │   ├── bom
│   │   │   ├── bom.csv
│   │   │   └── data.yaml
│   │   ├── calculated_columns
│   │   │   ├── schools.csv
│   │   │   └── data.yaml
│   │   ├── types
│   │   │   ├── places.csv
│   │   │   └── data.yaml
│   │   ├── import_with_errors
│   │   │   ├── cities4.csv
│   │   │   └── data.yaml
│   │   ├── import_with_null_value
│   │   │   ├── null_values.csv
│   │   │   └── data.yaml
│   │   ├── school_names
│   │   │   ├── data.yaml
│   │   │   └── school_names.csv
│   │   ├── geo
│   │   │   ├── places.csv
│   │   │   └── data.yaml
│   │   ├── numeric_data
│   │   │   └── data.yaml
│   │   ├── nested_files
│   │   │   ├── school2011.csv
│   │   │   ├── school2012.csv
│   │   │   ├── data.yaml
│   │   │   ├── school2013.csv
│   │   │   └── school-data.csv
│   │   ├── nested2
│   │   │   ├── data.yaml
│   │   │   └── school2013.csv
│   │   ├── data.rb
│   │   ├── schools
│   │   │   ├── schools.csv
│   │   │   └── data.yaml
│   │   └── sample-data
│   │       └── data.yaml
│   ├── lib
│   │   ├── expression
│   │   │   ├── variables_spec.rb
│   │   │   ├── eval_spec.rb
│   │   │   └── parser_spec.rb
│   │   ├── data_magic
│   │   │   ├── index
│   │   │   │   ├── importer_spec.rb
│   │   │   │   ├── event_logger_spec.rb
│   │   │   │   ├── document_spec.rb
│   │   │   │   └── repository_spec.rb
│   │   │   ├── example_spec.rb
│   │   │   ├── import_csv_spec.rb
│   │   │   ├── name_type_spec.rb
│   │   │   ├── calculated_columns_spec.rb
│   │   │   ├── create_index_spec.rb
│   │   │   ├── import_with_nested_files_spec.rb
│   │   │   ├── import_without_data_yaml_spec.rb
│   │   │   ├── config_field_types_spec.rb
│   │   │   ├── search_name_spec.rb
│   │   │   └── config_spec.rb
│   │   ├── zipcode_spec.rb
│   │   ├── data_magic_spec.rb
│   │   ├── expression_spec.rb
│   │   └── nested_hash_spec.rb
│   ├── spec.rake
│   ├── tasks
│   │   └── import_spec.rb
│   ├── spec_helper.rb
│   └── features
│       └── web_spec.rb
├── doc
│   ├── csv-download.png
│   └── data-overview.png
├── Rakefile
├── .components
├── script
│   ├── bomstrip.sh
│   ├── makeutf8.sh
│   ├── s3push
│   ├── s3pull
│   ├── s3config.rb
│   └── bootstrap
├── config.ru
├── manifest-dev.yml
├── manifest-staging.yml
├── manifest-production.yml
├── manifest-ex.yml
├── manifest-indexing.yml
├── app
│   ├── views
│   │   ├── layouts
│   │   │   └── application.erb
│   │   ├── home.liquid
│   │   └── category.liquid
│   ├── index_app.rb
│   ├── app.rb
│   ├── stylesheets
│   │   └── application.sass
│   └── controllers.rb
├── .rubocop.yml
├── bin
│   └── open-data-maker
├── lib
│   ├── expression
│   │   ├── variables.rb
│   │   ├── eval.rb
│   │   ├── expression.rb
│   │   └── parser.rb
│   ├── data_magic
│   │   ├── category.rb
│   │   ├── example.rb
│   │   ├── index
│   │   │   ├── builder_data.rb
│   │   │   ├── event_logger.rb
│   │   │   ├── document.rb
│   │   │   ├── super_client.rb
│   │   │   ├── output.rb
│   │   │   ├── repository.rb
│   │   │   ├── row_importer.rb
│   │   │   ├── importer.rb
│   │   │   └── document_builder.rb
│   │   ├── index.rb
│   │   ├── error_checker.rb
│   │   └── query_builder.rb
│   ├── sass_initializer.rb
│   ├── zipcode
│   │   └── zipcode.rb
│   └── nested_hash.rb
├── .gitignore
├── tasks
│   ├── es.rake
│   └── import.rake
├── circle.yml
├── NOTES.md
├── Gemfile
├── LICENSE.md
├── DICTIONARY.md
├── notes.txt
├── sample-data
│   └── data.yaml
├── INSTALL.md
├── README.md
└── Gemfile.lock
/log/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.ruby-version:
--------------------------------------------------------------------------------
1 | 2.2.4
2 |
--------------------------------------------------------------------------------
/.ruby-gemset:
--------------------------------------------------------------------------------
1 | open-data-maker
2 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: bundle exec puma -C config/puma.rb
2 |
--------------------------------------------------------------------------------
/public/javascripts/application.js:
--------------------------------------------------------------------------------
1 | // Put your application scripts here
--------------------------------------------------------------------------------
/config/unicorn.rb:
--------------------------------------------------------------------------------
1 | worker_processes 5
2 | timeout 30
3 | preload_app true
4 |
--------------------------------------------------------------------------------
/spec/fixtures/geo_no_files/data.yaml:
--------------------------------------------------------------------------------
1 | # data.yaml for geo tests
2 | index: place-data
3 |
--------------------------------------------------------------------------------
/spec/fixtures/minimal/data.yaml:
--------------------------------------------------------------------------------
1 | # smallest possible data.yaml
2 | index: my-index
3 |
--------------------------------------------------------------------------------
/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/18F/open-data-maker/HEAD/public/favicon.ico
--------------------------------------------------------------------------------
/doc/csv-download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/18F/open-data-maker/HEAD/doc/csv-download.png
--------------------------------------------------------------------------------
/doc/data-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/18F/open-data-maker/HEAD/doc/data-overview.png
--------------------------------------------------------------------------------
/spec/fixtures/cities_with_yml/more.csv:
--------------------------------------------------------------------------------
1 | state,city,lat,lon
2 | CA,Secret City,37.727239,-123.032229
3 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_options/more_cities.csv:
--------------------------------------------------------------------------------
1 | USPS,GEOID,ANSICODE,NAME,POP10
2 | XX,0,0,YY,0
3 |
--------------------------------------------------------------------------------
/spec/fixtures/cities_without_yml/more.csv:
--------------------------------------------------------------------------------
1 | state,city,lat,lon
2 | CA,Secret City,37.727239,-123.032229
3 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_dictionary/more.csv:
--------------------------------------------------------------------------------
1 | state,city,lat,lon
2 | CA,Secret City,37.727239,-123.032229
3 |
--------------------------------------------------------------------------------
/spec/fixtures/invalid_utf8.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/18F/open-data-maker/HEAD/spec/fixtures/invalid_utf8.csv
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'bundler/setup'
2 | require 'padrino-core/cli/rake'
3 |
4 |
5 | task :default => :spec
6 |
7 | PadrinoTasks.init
8 |
--------------------------------------------------------------------------------
/spec/fixtures/bom/bom.csv:
--------------------------------------------------------------------------------
1 | UNITID,VAL
2 | 100654,00100200
3 | 100663,00105200
4 | 100690,02503400
5 | 100706,00105500
6 | 100724,00100500
7 |
--------------------------------------------------------------------------------
/spec/fixtures/calculated_columns/schools.csv:
--------------------------------------------------------------------------------
1 | UNITID,INSTNM,INT1,INT2,INT3,INT4
2 | 1,Big School,0,0,2,0
3 | 2,Small School,0,0,0,0
4 | 3,Middle School,0,1,1,0
5 |
--------------------------------------------------------------------------------
/spec/fixtures/bom/data.yaml:
--------------------------------------------------------------------------------
1 | version: byte-order-mark
2 | index: test-data
3 | api: test
4 | dictionary:
5 | id: UNITID
6 | value: VAL
7 |
8 | files:
9 | - name: bom.csv
10 |
--------------------------------------------------------------------------------
/spec/fixtures/types/places.csv:
--------------------------------------------------------------------------------
1 | id,state,name,lat,lon
2 | ca sf,CA,San Francisco,37.727239,-123.032229
3 | ny ny,NY,New York,40.664274,-73.938500
4 | la no,LA,New Orleans,30.068636,-89.939007
5 |
--------------------------------------------------------------------------------
/.components:
--------------------------------------------------------------------------------
1 | ---
2 | :orm: none
3 | :test: rspec
4 | :mock: none
5 | :script: jquery
6 | :renderer: liquid
7 | :stylesheet: sass
8 | :namespace: OpenDataMaker
9 | :migration_format: number
10 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_errors/cities4.csv:
--------------------------------------------------------------------------------
1 | USPS,GEOID,ANSICODE,NAME,POP10
2 | NY,3651000,2395220,New York,8175133
3 | CA,644000,2410877,Los Angeles,3792621
4 | IL,1714000,428803,Chicago,2695598
5 | TX,4835000,2410796,Houston,2099451
--------------------------------------------------------------------------------
/spec/fixtures/import_with_options/cities4.csv:
--------------------------------------------------------------------------------
1 | USPS,GEOID,ANSICODE,NAME,POP10
2 | NY,3651000,2395220,New York,8175133
3 | CA,644000,2410877,Los Angeles,3792621
4 | IL,1714000,428803,Chicago,2695598
5 | TX,4835000,2410796,Houston,2099451
--------------------------------------------------------------------------------
/spec/fixtures/import_with_null_value/null_values.csv:
--------------------------------------------------------------------------------
1 | USPS,GEOID,ANSICODE,NAME,POP10
2 | NY,abc123,2395220,New York,8175133
3 | CA,644000,2410877,Los Angeles,3792621
4 | IL,1714000,428803,Chicago,2695598
5 | TX,4835000,2410796,Houston,2099451
6 |
--------------------------------------------------------------------------------
/script/bomstrip.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | mkdir -p new
3 |
4 | for filename in ./*.csv; do
5 | awk 'NR==1{sub(/^\xef\xbb\xbf/,"")}1' "$filename" > "new/$filename"
6 | done
7 |
8 | #find . -print0 -type f | awk 'NR==1{sub(/^\xef\xbb\xbf/,"")}1' {} > new/{}
9 |
--------------------------------------------------------------------------------
/config.ru:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env rackup
2 | # encoding: utf-8
3 |
4 | # This file can be used to start Padrino,
5 | # just execute it from the command line.
6 |
7 | require File.expand_path("../config/boot.rb", __FILE__)
8 |
9 | run Padrino.application
10 |
--------------------------------------------------------------------------------
/spec/fixtures/school_names/data.yaml:
--------------------------------------------------------------------------------
1 | version: 0
2 | index: name-data
3 | api: names
4 | dictionary:
5 | id: ID
6 | school.name:
7 | source: NAME
8 | type: autocomplete
9 | school.state: STATE
10 |
11 | files:
12 | - name: school_names.csv
13 |
--------------------------------------------------------------------------------
/script/makeutf8.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # convert to utf8 and strip Byte Order Mark (BOM) if present
3 | mkdir -p utf8
4 |
5 | for file in *.csv; do
6 | echo "$file"
7 | iconv -f ascii -t utf-8 "$file" | awk 'NR==1{sub(/^\xef\xbb\xbf/,"")}1' > "./utf8/${file}"
8 | done
9 |
--------------------------------------------------------------------------------
/manifest-dev.yml:
--------------------------------------------------------------------------------
1 | ---
2 | applications:
3 | - name: ccapi-dev
4 | command: bundle exec puma -C ./config/puma.rb
5 | instances: 1
6 | memory: 2G
7 | services:
8 | - bservice
9 | - eservice
10 | env:
11 | MAX_THREADS: 5
12 | WEB_CONCURRENCY: 3
13 |
--------------------------------------------------------------------------------
/manifest-staging.yml:
--------------------------------------------------------------------------------
1 | ---
2 | applications:
3 | - name: ccapi-staging
4 | command: bundle exec puma -C ./config/puma.rb
5 | instances: 3
6 | memory: 2G
7 | services:
8 | - bservice
9 | - eservice
10 | env:
11 | MAX_THREADS: 5
12 | WEB_CONCURRENCY: 3
13 |
--------------------------------------------------------------------------------
/manifest-production.yml:
--------------------------------------------------------------------------------
1 | ---
2 | applications:
3 | - name: ccapi-production
4 | command: bundle exec puma -C ./config/puma.rb
5 | instances: 3
6 | memory: 2G
7 | services:
8 | - bservice
9 | - eservice
10 | env:
11 | MAX_THREADS: 5
12 | WEB_CONCURRENCY: 3
13 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_null_value/data.yaml:
--------------------------------------------------------------------------------
1 | index: city-data
2 | api: cities
3 | unique: ['name']
4 | null_value: 'abc123'
5 | options:
6 | columns: all
7 |
8 | dictionary:
9 | state: USPS
10 | population: POP10
11 | name: NAME
12 |
13 | files:
14 | - name: null_values.csv
15 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_errors/data.yaml:
--------------------------------------------------------------------------------
1 | version: fixture-type-error
2 | index: expect-errors
3 | api: nothing
4 |
5 | dictionary:
6 | state: USPS
7 | name: NAME
8 | population:
9 | source: POP10
10 | type: broken
11 |
12 | files:
13 | - name: cities4.csv
14 | add:
15 | year: 2010
16 |
--------------------------------------------------------------------------------
/manifest-ex.yml:
--------------------------------------------------------------------------------
1 | ---
2 | applications:
3 | - name: ccapi-ex
4 | command: bundle exec puma -C ./config/puma.rb
5 | instances: 1
6 | memory: 2G
7 | services:
8 | - bservice
9 | - eservice
10 | env:
11 | MAX_THREADS: 5
12 | WEB_CONCURRENCY: 1
13 | INDEX_APP: enable
14 | NPROCS: 2
15 |
--------------------------------------------------------------------------------
/config/puma.rb:
--------------------------------------------------------------------------------
1 | workers Integer(ENV['WEB_CONCURRENCY'] || 2)
2 | threads_count = Integer(ENV['MAX_THREADS'] || 5)
3 | threads threads_count, threads_count
4 | worker_timeout 30
5 |
6 | preload_app!
7 |
8 | rackup DefaultRackup
9 | port ENV['PORT'] || 3000
10 | environment ENV['RACK_ENV'] || 'development'
11 |
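12 | # For example, manifest-dev.yml in this repo sets WEB_CONCURRENCY=3 and
13 | # MAX_THREADS=5, which yields 3 workers with 5 threads each.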
--------------------------------------------------------------------------------
/manifest-indexing.yml:
--------------------------------------------------------------------------------
1 | ---
2 | applications:
3 | - name: ccapi-indexing
4 | command: bundle exec puma -C ./config/puma.rb
5 | instances: 1
6 | memory: 2G
7 | services:
8 | - bservice
9 | - eservice
10 | env:
11 | MAX_THREADS: 5
12 | WEB_CONCURRENCY: 1
13 | INDEX_APP: enable
14 | NPROCS: 2
15 |
--------------------------------------------------------------------------------
/app/views/layouts/application.erb:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | <%== yield %>
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.rubocop.yml:
--------------------------------------------------------------------------------
1 | AllCops:
2 | Exclude:
3 | - 'bin/**/*'
4 | - 'db/**/*'
5 | Metrics/LineLength:
6 | Enabled: false
7 | Style/CommentAnnotation:
8 | Enabled: false
9 | Style/Documentation:
10 | Enabled: false
11 | Style/DotPosition:
12 | Enabled: false
13 | Style/RedundantSelf:
14 | Enabled: false
15 | Style/StringLiterals:
16 | Enabled: false
17 |
--------------------------------------------------------------------------------
/spec/fixtures/school_names/school_names.csv:
--------------------------------------------------------------------------------
1 | ID,STATE,NAME
2 | 1,AL,Stillman College
3 | 2,NY,New York University
4 | 3,AZ,Arizona State University
5 | 4,CA,University of California-Berkeley
6 | 5,MA,Berklee College of Music
7 | 6,NY,Berk Trade and Business School
8 | 7,AZ,University of Phoenix-Online Campus
9 | 8,AZ,University of Phoenix-Phoenix Campus
10 | 9,AZ,Phoenix College
11 |
--------------------------------------------------------------------------------
/spec/fixtures/geo/places.csv:
--------------------------------------------------------------------------------
1 | state,city,lat,lon
2 | CA,"San Francisco",37.727239,-123.032229
3 | NY,"New York",40.664274,-73.938500
4 | CA,"Los Angeles",34.019394,-118.410825
5 | IL,Chicago,41.837551,-87.681844
6 | TX,Houston,29.780472,-95.386342
7 | PA,Philadelphia,40.009376,-75.133346
8 | CA,"San Jose",37.296867,-121.819306
9 | MA,Boston,42.331960,-71.020173
10 | WA,Seattle,47.620499,-122.350876
11 |
--------------------------------------------------------------------------------
/spec/fixtures/numeric_data/data.yaml:
--------------------------------------------------------------------------------
1 | # cities100.txt
2 | # Test YAML file
3 | index: numeric-data
4 | api: cities
5 |
6 | dictionary:
7 | name:
8 | source: name
9 | type: string
10 | address:
11 | source: address
12 | type: string
13 | city:
14 | source: city
15 | type: string
16 | age:
17 | source: age
18 | type: integer
19 | height:
20 | source: height
21 | type: float
22 |
--------------------------------------------------------------------------------
/bin/open-data-maker:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | Dir.chdir(File.dirname(__FILE__)+'/..')
4 |
5 | # Start the app with Padrino::Server
6 | require 'rubygems'
7 | require 'bundler/setup'
8 | require 'padrino-core/cli/launcher'
9 |
10 | ARGV.unshift('start') if ARGV.first.nil? || ARGV.first.start_with?('-')
11 | Padrino::Cli::Launcher.start ARGV
12 |
13 | # Start the app with Rack::Server
14 | #require "rack"
15 | #Rack::Server.start
16 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_options/data.yaml:
--------------------------------------------------------------------------------
1 | version: fixture-import-options
2 | index: city-data
3 | api: cities
4 | options:
5 | columns: all
6 | limit_files: 1
7 | limit_rows: 3
8 |
9 | dictionary:
10 | state: USPS
11 | name: NAME
12 | population: POP10
13 |
14 | files:
15 | - name: cities4.csv
16 | add:
17 | year: 2010
18 | - name: more_cities.csv # this shouldn't get imported
19 | add:
20 | year: 1000
21 |
--------------------------------------------------------------------------------
/lib/expression/variables.rb:
--------------------------------------------------------------------------------
1 | require 'parslet'
2 |
3 | class Expression
4 | class Variables < Parslet::Transform
5 | rule(:var => simple(:var)) {
6 | [String(var)]
7 | }
8 | rule(:or => { :left => subtree(:left), :right => subtree(:right) }) do
9 | (left + right)
10 | end
11 |
12 | rule(:and => { :left => subtree(:left), :right => subtree(:right) }) do
13 | (left + right)
14 | end
15 |
16 | end
17 | end
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | log/**/*
3 | log/*.log
4 | tmp/**/*
5 | vendor/gems/*
6 | !vendor/gems/cache/
7 | .sass-cache/*
8 | db/*.db
9 | .*.sw*
10 | .env
11 | .*.env
12 | .cfignore
13 | .vagrant
14 | .idea/
15 | *profile*
16 |
17 | public/stylesheets/application.css*
18 |
19 | # expect people to put their own data in /data
20 | data
21 |
22 | # another commonly used data directory
23 | real-data
24 |
25 | # contains Google API tokens
26 | client_secret.json
27 |
--------------------------------------------------------------------------------
/script/s3push:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby -v
2 |
3 | require_relative 's3config.rb'
4 |
5 | @s3 = ::Aws::S3::Client.new
6 |
7 | dirname = 'real-data'
8 | bucket_name = ENV['s3_bucket']
9 | datayamlpath = File.expand_path("../../#{dirname}/#{bucket_name}.yaml", __FILE__)
10 |
11 | puts "copying #{datayamlpath}"
12 | puts "to S3 #{bucket_name}"
13 | File.open(datayamlpath, 'r') do |file|
14 | @s3.put_object(bucket: bucket_name, key: 'data.yaml', body: file)
15 | end
16 |
--------------------------------------------------------------------------------
/tasks/es.rake:
--------------------------------------------------------------------------------
1 | require 'data_magic'
2 |
3 | namespace :es do
4 | desc "delete elasticsearch index (_all for all)"
5 | task :delete, [:index_name] => :environment do |t, args|
6 | DataMagic.client.indices.delete(index: args[:index_name])
7 | end
8 |
9 | desc "list elasticsearch indices"
10 | task :list => :environment do |t, args|
11 | result = DataMagic.client.indices.get(index: '_all').keys
12 | puts result.join("\n")
13 | end
14 | end
15 |
--------------------------------------------------------------------------------
/script/s3pull:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby -v
2 |
3 | require_relative 's3config.rb'
4 |
5 | @s3 = ::Aws::S3::Client.new
6 |
7 | bucket = ENV['s3_bucket']
8 |
9 | dirname = 'real-data'
10 | unless File.directory?(dirname)
11 | FileUtils.mkdir_p(dirname)
12 | end
13 | datayamlpath = File.expand_path("../../#{dirname}/#{bucket}.yaml", __FILE__)
14 |
15 | File.open(datayamlpath, 'w') do |file|
16 | response = @s3.get_object(bucket: bucket, key: 'data.yaml')
17 | file << response.body.read
18 | end
19 |
--------------------------------------------------------------------------------
/app/index_app.rb:
--------------------------------------------------------------------------------
1 | require 'csv'
2 |
3 | module OpenDataMaker
4 |
5 | class IndexApp < Padrino::Application
6 | register SassInitializer
7 | register Padrino::Helpers
8 |
9 | enable :sessions
10 |
11 | get '/' do
12 | DataMagic.config.scoped_index_name
13 | end
14 |
15 | get '/init' do
16 | DataMagic.init(load_now: true)
17 | "ok"
18 | end
19 |
20 | get '/reindex' do
21 | DataMagic.reindex
22 | "reindexing..."
23 | end
24 | end
25 |
26 | end
27 |
--------------------------------------------------------------------------------
/lib/data_magic/category.rb:
--------------------------------------------------------------------------------
1 | Category = Struct.new(:category_id) do
2 | def assemble
3 | category_entry = DataMagic.config.data['categories'][category_id]
4 | dictionary = DataMagic.config.dictionary
5 | field_details = {}
6 | category_entry['fields'].each do |field_name|
7 | field_details[field_name] = dictionary[field_name] || { "description"=>"" }
8 | end
9 | field_details = { "field_details" => field_details }
10 | assemble = category_entry.merge(field_details)
11 | end
12 | end
13 |
--------------------------------------------------------------------------------
/lib/expression/eval.rb:
--------------------------------------------------------------------------------
1 |
2 | class Expression
3 | class Eval < Parslet::Transform
4 | rule(:var => simple(:var)) {
5 | variables[String(var)]
6 | }
7 |
8 | # in Ruby 0 is 'truthy' but that's not what most people expect
9 | rule(:or => { :left => subtree(:left), :right => subtree(:right) }) do
10 | left == 0 ? right : (left or right)
11 | end
12 |
13 | rule(:and => { :left => subtree(:left), :right => subtree(:right) }) do
14 | left == 0 ? left : (left and right)
15 | end
16 | end
17 | end
18 |
--------------------------------------------------------------------------------
/lib/sass_initializer.rb:
--------------------------------------------------------------------------------
1 | module SassInitializer
2 | def self.registered(app)
3 | # Enables support for SASS template reloading in rack applications.
4 | # See http://nex-3.com/posts/88-sass-supports-rack for more details.
5 | # Store SASS files (by default) within 'app/stylesheets'.
6 | require 'sass/plugin/rack'
7 | Sass::Plugin.options[:template_location] = Padrino.root("app/stylesheets")
8 | Sass::Plugin.options[:css_location] = Padrino.root("public/stylesheets")
9 | app.use Sass::Plugin::Rack
10 | end
11 | end
12 |
--------------------------------------------------------------------------------
/lib/data_magic/example.rb:
--------------------------------------------------------------------------------
1 | class Example < Hashie::Mash
2 | include Hashie::Extensions::Coercion
3 | include Hashie::Extensions::MergeInitializer
4 | coerce_key :name, String
5 | coerce_key :description, String
6 | coerce_key :params, String
7 | coerce_key :endpoint, String
8 | coerce_key :link, String
9 | def initialize(hash = {})
10 | super
11 | # we want to use this in a liquid template
12 | # so all attributes needs to be plain data, not code
13 | self[:link] = "/v1/#{endpoint}?#{params}" if self[:link].nil?
14 | end
15 |
16 | end
17 |
--------------------------------------------------------------------------------
/spec/fixtures/geo/data.yaml:
--------------------------------------------------------------------------------
1 |
2 |
3 | # cities100.txt
4 | # National Places Gazetteer Files, from US Census 2010
5 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html
6 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t') > result.txt
7 | # head -n 101 result.txt > cities100.txt
8 | # then converted to csv and removed " city" from after each city name
9 | dictionary:
10 | city: city
11 | location.lat: lat
12 | location.lon: lon
13 |
14 | index: place-data
15 | api: places
16 | files:
17 | - name: places.csv
18 |
--------------------------------------------------------------------------------
/lib/data_magic/index/builder_data.rb:
--------------------------------------------------------------------------------
1 | module DataMagic
2 | module Index
3 | class BuilderData
4 | attr_reader :data, :options
5 |
6 | def initialize(data, options)
7 | @options = options
8 | @data = data
9 | end
10 |
11 | def additional_fields
12 | options[:mapping] || {}
13 | end
14 |
15 | def new_field_names
16 | field_names = options[:fields] || {}
17 | field_names.merge(additional_fields)
18 | end
19 |
20 | def additional_data
21 | options[:add_data]
22 | end
23 | end
24 | end
25 | end
26 |
--------------------------------------------------------------------------------
/spec/lib/expression/variables_spec.rb:
--------------------------------------------------------------------------------
1 | require 'expression/parser'
2 | require 'expression/variables'
3 |
4 | describe Expression::Variables do
5 |
6 | let(:parser) { Expression::Parser.new }
7 | let(:variables) { Expression::Variables.new }
8 | it "gets one variable name" do
9 | expect(variables.apply(parser.parse('one'))).to eq(['one'])
10 | end
11 | it "preserves case " do
12 | expect(variables.apply(parser.parse('ONe'))).to eq(['ONe'])
13 | end
14 | it "multiple variables" do
15 | expect(variables.apply(parser.parse('fox or cow or goat'))).to eq(%w[fox cow goat])
16 | end
17 |
18 | end
19 |
--------------------------------------------------------------------------------
/spec/spec.rake:
--------------------------------------------------------------------------------
1 | begin
2 | require 'rspec/core/rake_task'
3 |
4 | spec_tasks = Dir['spec/*/'].each_with_object([]) do |d, result|
5 | result << File.basename(d) unless Dir["#{d}*"].empty?
6 | end
7 |
8 | spec_tasks.each do |folder|
9 | desc "Run the spec suite in #{folder}"
10 | RSpec::Core::RakeTask.new("spec:#{folder}") do |t|
11 | t.pattern = "./spec/#{folder}/**/*_spec.rb"
12 | t.rspec_opts = "--color"
13 | end
14 | end
15 |
16 | desc "Run complete application spec suite"
17 | RSpec::Core::RakeTask.new(:spec)
18 | rescue LoadError
19 | puts "RSpec is not part of this bundle, skip specs."
20 | end
21 |
--------------------------------------------------------------------------------
/config/env.rb:
--------------------------------------------------------------------------------
1 | # define core environment that we need in tests and for the app
2 |
3 | # Defines our constants
4 | ENV['RACK_ENV'] ||= 'development'
5 | RACK_ENV = ENV['RACK_ENV'] unless defined?(RACK_ENV)
6 | PADRINO_ROOT = File.expand_path('../..', __FILE__) unless defined?(PADRINO_ROOT)
7 |
8 | # Load our dependencies
9 | require 'rubygems' unless defined?(Gem)
10 | require 'bundler/setup'
11 | require 'newrelic_rpm'
12 | Bundler.require(:default, RACK_ENV)
13 |
14 | # do this early so we can log during startup
15 | require './lib/data_magic/config.rb'
16 | DataMagic::Config.logger=Logger.new(STDOUT) if ENV['VCAP_APPLICATION'] # Cloud Foundry
17 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/index/importer_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe "DataMagic::Index::Importer" do
5 | before do
6 | ENV['DATA_PATH'] = './spec/fixtures/minimal'
7 | DataMagic.init(load_now: false)
8 | end
9 | after do
10 | DataMagic.destroy
11 | end
12 |
13 | it "indexes in parallel based on NPROCS" do
14 | stub_const('ENV', { 'NPROCS' => '2' })
15 |
16 | data_str = <<-eos
17 | a,b
18 | 1,2
19 | 3,4
20 | eos
21 | data = StringIO.new(data_str)
22 | num_rows, fields = DataMagic.import_csv(data)
23 | expect(num_rows).to be(2)
24 | expect(fields).to eq(['a', 'b'])
25 | end
26 | end
27 |
--------------------------------------------------------------------------------
/spec/lib/zipcode_spec.rb:
--------------------------------------------------------------------------------
1 | require 'zipcode/zipcode'
2 |
3 | describe Zipcode do
4 | it "gives a location based on zipcode" do
5 | location = Zipcode.latlon('94132')
6 | expect(location).to eq(lat: 37.7211, lon: -122.4754)
7 | end
8 | it "supports zipcode given as a number" do
9 | location = Zipcode.latlon(94132)
10 | expect(location).to eq(lat: 37.7211, lon: -122.4754)
11 | end
12 |
13 | describe '#valid' do
14 | it "returns true if the zipcode is valid" do
15 | expect(Zipcode.valid? 94132).to eq(true)
16 | end
17 | it "returns false if the zipcode is invalid" do
18 | expect(Zipcode.valid? 00002).to eq(false)
19 | end
20 | end
21 | end
22 |
--------------------------------------------------------------------------------
/spec/fixtures/types/data.yaml:
--------------------------------------------------------------------------------
1 |
2 | version: 0
3 | # cities100.txt
4 | # National Places Gazetteer Files, from US Census 2010
5 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html
6 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t') > result.txt
7 | # head -n 101 result.txt > cities100.txt
8 | # then converted to csv and removed " city" from after each city name
9 | dictionary:
10 | id:
11 | source: id
12 | type: literal
13 | city.name:
14 | source: name
15 | type: name
16 | city.state: state
17 | location.lat: lat
18 | location.lon: lon
19 |
20 | index: place-data
21 | api: places
22 | files:
23 | - name: places.csv
24 |
--------------------------------------------------------------------------------
/tasks/import.rake:
--------------------------------------------------------------------------------
1 | require 'data_magic'
2 | require 'ruby-prof'
3 |
4 | desc "import files from DATA_PATH, rake import[profile=true] for profile output"
5 | task :import, [:profile] => :environment do |t, args|
6 | options = {}
7 | start_time = Time.now
8 | RubyProf.start if args[:profile]
9 |
10 | DataMagic.import_with_dictionary(options)
11 |
12 | if args[:profile]
13 | result = RubyProf.stop
14 | end_time = Time.now
15 | puts "indexing complete: #{distance_of_time_in_words(end_time, start_time)}"
16 | puts "duration: #{end_time - start_time}"
17 |
18 | printer = RubyProf::MultiPrinter.new(result)
19 | printer.print(path: ".", profile: "profile", min_percent: 2)
20 | end
21 | end
22 |
--------------------------------------------------------------------------------
/spec/fixtures/cities_with_yml/data.yml:
--------------------------------------------------------------------------------
1 | # cities100.txt
2 | # National Places Gazetteer Files, from US Census 2010
3 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html
4 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t') > result.txt
5 | # head -n 101 result.txt > cities100.txt
6 | # then converted to csv and removed " city" from after each city name
7 | version: fixture-import-all
8 | index: city-data
9 | api: cities
10 | global_mapping:
11 | USPS: state
12 | NAME: name
13 | POP10: population
14 | INTPTLAT: latitude
15 | INTPTLONG: longitude
16 |
17 | files:
18 | - name: cities50.csv
19 | add:
20 | category: 'top50'
21 | - name: cities51-100.csv
22 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_dictionary/data.yaml:
--------------------------------------------------------------------------------
1 | # cities100.txt
2 | # National Places Gazetteer Files, from US Census 2010
3 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html
4 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t') > result.txt
5 | # head -n 101 result.txt > cities100.txt
6 | # then converted to csv and removed " city" from after each city name
7 | version: fixture-import-all
8 | index: city-data
9 | api: cities
10 | dictionary:
11 | state: USPS
12 | name: NAME
13 | population: POP10
14 | latitude: INTPTLAT
15 | longitude: INTPTLONG
16 |
17 | files:
18 | - name: cities50.csv
19 | add:
20 | category: 'top50'
21 | - name: cities51-100.csv
22 |
--------------------------------------------------------------------------------
/lib/expression/expression.rb:
--------------------------------------------------------------------------------
1 | require_relative 'parser'
2 | require_relative 'eval'
3 | require_relative 'variables'
4 | require 'hashie'
5 |
6 | class Expression
7 | attr_accessor :name # purely for reporting Errors
8 | attr_reader :variables
9 |
10 | def initialize(expr, name = 'unknown')
11 | @tree = Parser.new.parse(expr)
12 | @variables = Variables.new.apply(@tree)
13 | end
14 |
15 | def evaluate(vars)
16 | Hashie.stringify_keys! vars
17 | Eval.new.apply(@tree, variables: vars)
18 | end
19 |
20 | def self.find_or_create(expr, name = 'unknown')
21 | @cached_expression ||= {}
22 | @cached_expression[expr] ||= Expression.new(expr, name)
23 | @cached_expression[expr]
24 | end
25 | end
26 |
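27 | # Usage sketch (values are illustrative; see spec/lib/expression/*_spec.rb):
28 | #   expr = Expression.find_or_create('INT1 or INT2')
29 | #   expr.variables                            # => ["INT1", "INT2"]
30 | #   expr.evaluate('INT1' => 0, 'INT2' => 7)   # => 7 (0 is treated as false)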
--------------------------------------------------------------------------------
/spec/lib/data_magic/example_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | describe Example do
4 | let(:hash) do
5 | { name: 'foo',
6 | description: 'interesting thing',
7 | params: 'a=1&b=something',
8 | endpoint: 'api' }
9 | end
10 | subject(:e) { Example.new(hash) }
11 |
12 | it "has a name" do
13 | expect(e.name).to eq(hash[:name])
14 | end
15 | it "has a description" do
16 | expect(e.description).to eq(hash[:description])
17 | end
18 | it "has a params" do
19 | expect(e.params).to eq(hash[:params])
20 | end
21 | it "has an endpoint" do
22 | expect(e.endpoint).to eq(hash[:endpoint])
23 | end
24 |
25 | it "has a link" do
26 | expect(e.link).to eq("/v1/#{e.endpoint}?#{e.params}")
27 | end
28 | end
29 |
--------------------------------------------------------------------------------
/spec/fixtures/nested_files/school2011.csv:
--------------------------------------------------------------------------------
1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6
2 | 1,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1025,4048,0.92
3 | 2,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,866,45556,0.34
4 | 3,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,453,4675,0.71
5 | 4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,572,15466,0.34
6 | 5,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1111,11266,0.86
7 | 6,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,818,23357,0.58
8 | 7,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1392,32584,0.39
9 | 8,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,718,252,0.26
10 | 9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1297,36088,0.63
11 | 10,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,635,3259,0.70
12 |
--------------------------------------------------------------------------------
/circle.yml:
--------------------------------------------------------------------------------
1 | dependencies:
2 | cache_directories:
3 | - elasticsearch-1.7.1
4 | pre:
5 | - curl -v -L -o cf-cli_amd64.deb 'https://cli.run.pivotal.io/stable?release=debian64&source=github'
6 | - sudo dpkg -i cf-cli_amd64.deb
7 | - cf -v
8 | post:
9 | - if [[ ! -e elasticsearch-1.7.1 ]]; then wget https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch-1.7.1.tar.gz && tar -xvf elasticsearch-1.7.1.tar.gz; fi
10 | - elasticsearch-1.7.1/bin/elasticsearch: {background: true}
11 |
12 | test:
13 | post:
14 | - cf api https://api.cloud.gov
15 | - cf auth $CF_USER $CF_PASSWORD
16 | - cf target -o ed -s dev
17 | - cf a
18 |
19 | deployment:
20 | development:
21 | branch: dev
22 | commands:
23 | - cf push -f manifest-dev.yml
24 |
--------------------------------------------------------------------------------
/lib/data_magic/index/event_logger.rb:
--------------------------------------------------------------------------------
1 | module DataMagic
2 | module Index
3 | class EventLogger
4 | def trigger(event, *args)
5 | self.send(event, *args)
6 | end
7 |
8 | ['debug', 'info', 'warn', 'error'].each do |level|
9 | class_eval <<-RUBY, __FILE__, __LINE__ + 1
10 | def #{level}(message, object=nil, limit=nil)
11 | logger.#{level}(full_message(message, object, limit))
12 | end
13 | RUBY
14 | end
15 |
16 | def full_message(prefix, object, limit)
17 | return prefix unless object
18 | message = "#{prefix}: "
19 | if limit
20 | message << object.inspect[0..limit]
21 | else
22 | message << object.inspect
23 | end
24 | message
25 | end
26 | end
27 | end
28 | end
29 |
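30 | # Usage sketch: EventLogger.new.trigger('info', 'indexed rows', rows, 100)
31 | # logs "indexed rows: " followed by rows.inspect truncated at 100 characters.
32 | # Note: the generated methods assume a `logger` is reachable from this class
33 | # (provided elsewhere in DataMagic).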
--------------------------------------------------------------------------------
/spec/fixtures/nested_files/school2012.csv:
--------------------------------------------------------------------------------
1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6
2 | 1,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,461,35231,0.01
3 | 2,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,986,34095,0.71
4 | 3,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1094,42579,0.39
5 | 4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,854,37589,0.15
6 | 5,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,650,13611,0.04
7 | 6,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,797,36924,0.64
8 | 7,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,994,31799,0.60
9 | 8,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1420,30063,0.97
10 | 9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1292,42150,0.83
11 | 10,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,605,2608,0.92
12 | 11,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,2608,0.92
13 |
--------------------------------------------------------------------------------
/spec/tasks/import_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'bundler/setup'
3 | require 'padrino-core/cli/rake'
4 |
5 | describe 'elastic search index management rake task' do
6 | before do
7 | PadrinoTasks.init
8 | DataMagic.init(load_now: true)
9 | end
10 |
11 | after do
12 | DataMagic.destroy
13 | end
14 |
15 | context "imports" do
16 | it "default sample-data" do
17 | ENV['DATA_PATH'] = nil
18 | expect { Rake::Task['import'].invoke }.not_to raise_exception
19 | end
20 |
21 | it "correct configuration" do
22 | dir_path = './spec/fixtures/import_with_dictionary'
23 | ENV['DATA_PATH'] = dir_path
24 | expect { Rake::Task['import'].invoke }.not_to raise_exception
25 | expect(DataMagic.config.api_endpoint_names).to eq(['cities'])
26 | end
27 |
28 | end
29 |
30 | end
31 |
--------------------------------------------------------------------------------
/spec/fixtures/calculated_columns/data.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | version: Aug6-2015-08-10-23:48-0600
3 | api: fakeschool
4 | index: fakeschool-data
5 | unique:
6 | - id
7 | options:
8 | limit_files: 1
9 | limit_rows: 100
10 |
11 | dictionary:
12 | id:
13 | source: UNITID
14 | type: integer
15 | description: Unit ID for institution
16 | school.name:
17 | source: INSTNM
18 | description: Institution name
19 | integer1:
20 | source: INT1
21 | type: integer
22 | integer2:
23 | source: INT2
24 | type: integer
25 | integer3:
26 | source: INT3
27 | type: integer
28 | integer4:
29 | source: INT4
30 | type: integer
31 | summarybool:
32 | calculate: INT1 or INT2 or INT3 or INT4
33 | type: boolean
34 | description: are any of the unparsed booleans true?
35 |
36 | files:
37 | - name: schools.csv
38 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/import_csv_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe "DataMagic #import_csv" do
5 | before do
6 | ENV['DATA_PATH'] = './spec/fixtures/minimal'
7 | DataMagic.init(load_now: false)
8 | end
9 | after do
10 | DataMagic.destroy
11 | #expect(DataMagic.client.indices.get(index: '_all')).to be_empty
12 | end
13 |
14 | it "throws errors for bad format" do
15 | data = StringIO.new("not csv format")
16 | expect{DataMagic.import_csv(data)}.to raise_error(DataMagic::InvalidData)
17 | end
18 |
19 | it "reads file and reports number of rows and headers" do
20 | data_str = <<-eos
21 | a,b
22 | 1,2
23 | 3,4
24 | eos
25 | data = StringIO.new(data_str)
26 | num_rows, fields = DataMagic.import_csv(data)
27 | expect(num_rows).to be(2)
28 | expect(fields).to eq(['a', 'b'])
29 | end
30 |
31 | end
32 |
--------------------------------------------------------------------------------
/lib/data_magic/index/document.rb:
--------------------------------------------------------------------------------
1 | module DataMagic
2 | module Index
3 | class Document
4 | attr_reader :data, :id
5 |
6 | def initialize(data)
7 | @data = data
8 | @id = calculate_id
9 | end
10 |
11 | def remove_ids
12 | config.data['unique'].each { |key| data.delete key }
13 | end
14 |
15 | def headers
16 | data.keys.map(&:to_s) # does this only return top level fields?
17 | end
18 |
19 | def preview(n=500)
20 | data.inspect[0..n]
21 | end
22 |
23 | def id_empty?
24 | id && id.empty?
25 | end
26 |
27 | private
28 |
29 | def calculate_id
30 | return nil if config.data['unique'].length == 0
31 | config.data['unique'].map { |field| data[field] }.join(':')
32 | end
33 |
34 | def config
35 | DataMagic.config
36 | end
37 | end
38 | end
39 | end
40 |
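41 | # Example sketch: with config.data['unique'] == ['state', 'name'],
42 | # Document.new('state' => 'CA', 'name' => 'San Jose').id  # => "CA:San Jose"
43 | # With an empty 'unique' list, id is nil.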
--------------------------------------------------------------------------------
/app/app.rb:
--------------------------------------------------------------------------------
1 | require 'csv'
2 |
3 | module OpenDataMaker
4 | class App < Padrino::Application
5 | register SassInitializer
6 | register Padrino::Helpers
7 |
8 | # This app is stateless and session cookies prevent caching of API responses
9 | disable :sessions
10 |
11 | # This app has no sensitive bits and csrf protection requires sessions
12 | disable :protect_from_csrf
13 |
14 | if ENV['DATA_AUTH'] and not ENV['DATA_AUTH'].empty?
15 | auth = ENV['DATA_AUTH']
16 | authorized_user, authorized_pass = auth.split(',')
17 | use Rack::Auth::Basic, "Restricted Area" do |username, password|
18 | username == authorized_user and password == authorized_pass
19 | end
20 | end
21 |
22 | ## app setup
23 | if ENV['RACK_ENV'] == 'test'
24 | DataMagic.init(load_now: true)
25 | else
26 | DataMagic.init(load_now: false) # don't index data
27 | end
28 |
29 | end
30 |
31 | end
32 |
--------------------------------------------------------------------------------
/lib/data_magic/index/super_client.rb:
--------------------------------------------------------------------------------
1 | require 'forwardable'
2 |
3 | module DataMagic
4 | module Index
5 | class SuperClient
6 | attr_reader :client, :options
7 |
8 | def initialize(client, options)
9 | @client = client
10 | @options = options
11 | end
12 |
13 | def create_index
14 | DataMagic.create_index unless config.index_exists?
15 | end
16 |
17 | def refresh_index
18 | client.indices.refresh index: index_name
19 | end
20 |
21 | def creating?
22 | options[:nest].nil?
23 | end
24 |
25 | def allow_skips?
26 | options[:nest][:parent_missing] == 'skip'
27 | end
28 |
29 | def index_name
30 | config.scoped_index_name
31 | end
32 |
33 | def config
34 | DataMagic.config
35 | end
36 |
37 | extend Forwardable
38 |
39 | def_delegators :client, :index, :update
40 | end
41 | end
42 | end
43 |
--------------------------------------------------------------------------------
/config/boot.rb:
--------------------------------------------------------------------------------
1 | require_relative 'env.rb'
2 |
3 | ##
4 | # ## Enable devel logging
5 | #
6 | # Padrino::Logger::Config[:development][:log_level] = :devel
7 | # Padrino::Logger::Config[:development][:log_static] = true
8 | #
9 | # ## Configure your I18n
10 | #
11 | # I18n.default_locale = :en
12 | # I18n.enforce_available_locales = false
13 | #
14 | # ## Configure your HTML5 data helpers
15 | #
16 | # Padrino::Helpers::TagHelpers::DATA_ATTRIBUTES.push(:dialog)
17 | # text_field :foo, :dialog => true
18 | # Generates:
19 | #
20 | # ## Add helpers to mailer
21 | #
22 | # Mail::Message.class_eval do
23 | # include Padrino::Helpers::NumberHelpers
24 | # include Padrino::Helpers::TranslationHelpers
25 | # end
26 |
27 | ##
28 | # Add your before (RE)load hooks here
29 | #
30 | Padrino.before_load do
31 | end
32 |
33 | ##
34 | # Add your after (RE)load hooks here
35 | #
36 | Padrino.after_load do
37 | end
38 |
39 | Padrino.load!
40 |
--------------------------------------------------------------------------------
/spec/fixtures/nested_files/data.yaml:
--------------------------------------------------------------------------------
1 | version: 0.2
2 | api: school
3 | index: fake-nested
4 | unique: [id]
5 |
6 | dictionary:
7 | id: UNITID
8 | name:
9 | source: INSTNM
10 | type: literal
11 | city: CITY_MAIN
12 | state: STABBR_MAIN
13 | zipcode: ZIP_MAIN
14 | sat_average: SAT_AVG
15 | location.lat: LATITUDE_MAIN
16 | location.lon: LONGITUDE_MAIN
17 |
18 | earnings.6_yrs_after_entry.median:
19 | source: earn_2002_p10
20 | description: Median earnings of students
21 | type: integer
22 |
23 | earnings.6_yrs_after_entry.percent_gt_25k:
24 | source: gt_25k_2006_p6
25 | description: Share of students earning over $25,000/year
26 | type: float
27 |
28 | files:
29 | - name: school-data.csv
30 | only: [id, name, city, state]
31 | - name: school2013.csv
32 | nest:
33 | key: 2013
34 | contents: [earnings, sat_average]
35 | - name: school2012.csv
36 | nest:
37 | key: 2012
38 | contents: [earnings, sat_average]
39 |
--------------------------------------------------------------------------------
/spec/fixtures/nested2/data.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | version: Aug6-2015-08-10-23:48-0600
3 | api: fakeschool
4 | index: fakeschool-data
5 | unique:
6 | - id
7 | options:
8 | # columns: all
9 | limit_files: 1
10 | limit_rows: 100
11 | search: dictionary_only
12 |
13 | dictionary:
14 | id:
15 | source: UNITID
16 | type: integer
17 | description: Unit ID for institution
18 | ope8_id:
19 | source: OPEID
20 | type: integer
21 | description: 8-digit OPE ID for institution
22 | ope6_id:
23 | source: opeid6
24 | type: integer
25 | description: 6-digit OPE ID for institution
26 | school.name:
27 | source: INSTNM
28 | type: literal
29 | description: Institution name
30 | school.city:
31 | source: CITY_MAIN
32 | description: City
33 | school.state:
34 | source: STABBR_MAIN
35 | description: State postcode
36 | school.zip:
37 | source: ZIP_MAIN
38 | type: integer
39 | description: ZIP code
40 |
41 | files:
42 | - name: school2013.csv
43 |
--------------------------------------------------------------------------------
/spec/lib/data_magic_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 | require 'fixtures/data.rb'
4 |
5 | describe DataMagic do
6 | it "cleans up after itself" do
7 | DataMagic.init(load_now: true)
8 | DataMagic.destroy
9 | DataMagic.logger.info "just destroyed"
10 | #expect(DataMagic.client.indices.get(index: '_all')).to be_empty
11 | end
12 |
13 | describe '.es_field_types' do
14 | it 'returns the given fields with their specified type' do
15 | expect(described_class.es_field_types({ 'state' => 'string', land_area: 'string' }))
16 | .to eq("state" => { :type => "string" },
17 | :land_area => { :type => "string" })
18 | end
19 |
20 | context 'with custom type "literal"' do
21 | it 'returns string type with :index of "not_analyzed"' do
22 | expect(described_class.es_field_types({ 'state' => 'string', 'name' => 'literal' }))
23 | .to eq({"state"=>{:type=>"string"}, "name"=>{:type=>"string", :index=>"not_analyzed"}})
24 | end
25 | end
26 |
27 | end
28 |
29 | end
30 |
--------------------------------------------------------------------------------
/NOTES.md:
--------------------------------------------------------------------------------
1 |
2 | ## Data
3 |
4 | Details about the data are specified by `DATA_PATH/data.yaml`,
5 | where `DATA_PATH` is an environment variable that may be:
6 |
7 | * `s3://username:password@bucket_name/path`
8 | * `s3://bucket_name/path`
9 | * `s3://bucket_name`
10 | * a local path like: `./data`
11 |
12 |
13 | This file is loaded the first time it is needed and then stored in memory. The contents of `data.yaml` are stored as JSON in Elasticsearch in a single document of type `config` with id `1`.
14 |
15 | The version field of this document is checked at startup. If the new config has a different version, we delete the whole index and re-index all of the files listed in the `data.yaml` files section.
16 |
17 | If no data.yml or data.yaml file is found, then all CSV files in `DATA_PATH` will be loaded, and all fields in their headers will be used.
18 |
19 | ## Debugging
20 |
21 | Setting the `ES_DEBUG` environment variable turns on the verbose tracer in the Elasticsearch client.
22 |
23 | Optional performance profiling for the import task: `rake import[profile=true]`
24 |
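25 | ## Example data.yaml
26 |
27 | A minimal sketch of a `data.yaml`, modeled on the fixtures under `spec/fixtures`
28 | (the index, api, and column names here are illustrative, not required values):
29 |
30 | ```yaml
31 | version: 1
32 | index: city-data
33 | api: cities
34 | dictionary:
35 |   state: USPS
36 |   name: NAME
37 |   population: POP10
38 | files:
39 |   - name: cities50.csv
40 | ```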
--------------------------------------------------------------------------------
/spec/fixtures/nested2/school2013.csv:
--------------------------------------------------------------------------------
1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6
2 | 1,Normal,AL,1,35762,5,34.7834,-86.5685,Reichert University,1195,26318,0.53
3 | 2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61
4 | 3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50
5 | 4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1836,0.09
6 | 5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82
7 | 6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens Institute,1057,49203,0.06
8 | 7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50
9 | 8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59
10 | 9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19
11 | 10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59
12 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | ENV['DATA_PATH'] = nil
2 | ENV['RACK_ENV'] ||= 'test'
3 | RACK_ENV = ENV['RACK_ENV'] unless defined?(RACK_ENV)
4 |
5 | #require File.expand_path(File.dirname(__FILE__) + "/../config/boot")
6 | require_relative '../config/env.rb'
7 | Dir[File.expand_path(File.dirname(__FILE__) + "/../app/helpers/**/*.rb")].each(&method(:require))
8 |
9 | RSpec.configure do |config|
10 | config.include Rack::Test::Methods
11 |
12 | config.before(:type => :feature) do
13 | # load the Padrino web app defined in app/app.rb
14 | require_relative '../config/boot'
15 | end
16 | config.before do
17 | ENV['DATA_PATH'] = nil
18 | end
19 | end
20 |
21 | # You can use this method to custom specify a Rack app
22 | # you want rack-test to invoke:
23 | #
24 | # app OpenDataMaker::App
25 | # app OpenDataMaker::App.tap { |a| }
26 | # app(OpenDataMaker::App) do
27 | # set :foo, :bar
28 | # end
29 | #
30 | def app(app = nil, &blk)
31 | @app ||= block_given? ? app.instance_eval(&blk) : app
32 | @app ||= Padrino.application
33 | end
34 |
--------------------------------------------------------------------------------
/spec/fixtures/nested_files/school2013.csv:
--------------------------------------------------------------------------------
1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6
2 | 1,Normal,AL,1,35762,5,34.7834,-86.5685,Reichert University,1195,26318,0.53
3 | 2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61
4 | 3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50
5 | 4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1836,0.09
6 | 5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82
7 | 6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens Institute,1057,49203,0.06
8 | 7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50
9 | 8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59
10 | 9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19
11 | 10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59
12 |
--------------------------------------------------------------------------------
/lib/zipcode/zipcode.rb:
--------------------------------------------------------------------------------
1 | # Zipcode latitude and longitude data in us_zipcodes.txt
2 | # provided by [GeoNames](http://www.geonames.org/)
3 | # under under a Creative Commons Attribution 3.0 License:
4 | # http://creativecommons.org/licenses/by/3.0/
5 |
6 | # this code is in public domain (CC0 1.0)
7 | # https://github.com/18F/open-data-maker/blob/dev/LICENSE.md
8 |
9 | require 'csv'
10 |
11 | class Zipcode
12 | @@zipcode_hash = nil
13 |
14 | def Zipcode.latlon(zipcode)
15 | zipcode = zipcode.to_s
16 | @@zipcode_hash ||= converted_zipcodes
17 | @@zipcode_hash[zipcode]
18 | end
19 |
20 | def Zipcode.valid?(zipcode)
21 | !!self.latlon(zipcode)
22 | end
23 |
24 | private
25 | def self.converted_zipcodes
26 | parsed_file = CSV.read(File.expand_path("../us_zipcodes.txt", __FILE__), { :col_sep => "\t" })
27 | zipcode_hash = {}
28 | parsed_file.each do |row|
29 | zipcode = row[1]
30 | lat = row[9].to_f
31 | lon = row[10].to_f
32 | zipcode_hash[zipcode] = { lat: lat, lon: lon }
33 | end
34 | zipcode_hash
35 | end
36 |
37 | end
38 |
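39 | # Usage (from spec/lib/zipcode_spec.rb):
40 | #   Zipcode.latlon('94132')  # => { lat: 37.7211, lon: -122.4754 }
41 | #   Zipcode.valid?(94132)    # => true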
--------------------------------------------------------------------------------
/script/s3config.rb:
--------------------------------------------------------------------------------
1 | # configure S3 with local credentials based on environment
2 | # usage (from ruby script or irb):
3 | # require 's3config.rb'
4 | # @s3 = ::Aws::S3::Client.new
5 |
6 | require 'dotenv'
7 |
8 | branch = `git symbolic-ref --short HEAD`.chomp
9 |
10 | if ENV['APP_ENV']
11 | APP_ENV = ENV['APP_ENV']
12 | puts "using APP_ENV from environment #{APP_ENV}"
13 | else
14 | case branch
15 | when "master"
16 | APP_ENV = "production"
17 | when "staging"
18 | APP_ENV = "staging"
19 | else
20 | puts "not on master or staging branch lets use dev"
21 | APP_ENV = "dev"
22 | end
23 | end
24 |
25 | Dotenv.load(
26 | File.expand_path("../../.#{APP_ENV}.env", __FILE__),
27 | File.expand_path("../../.env", __FILE__))
28 |
29 | require 'aws-sdk'
30 | puts "app env: #{APP_ENV}"
31 | puts "bucket name: #{ENV['s3_bucket']}"
32 |
33 |
34 | s3cred = {'access_key'=> ENV['s3_access_key'], 'secret_key' => ENV['s3_secret_key']}
35 |
36 | ::Aws.config[:credentials] = ::Aws::Credentials.new(s3cred['access_key'], s3cred['secret_key'])
37 | ::Aws.config[:region] = 'us-east-1'
38 |
--------------------------------------------------------------------------------
/spec/fixtures/data.rb:
--------------------------------------------------------------------------------
1 | # Ages adjusted for Springfield residents to average to 42
2 | # Heights randomly set to generate a max of 142
3 | def address_data
4 | @address_data ||= StringIO.new <<-eos
5 | name,address,city,age,height
6 | Paul,15 Penny Lane,Liverpool,10,142
7 | Michelle,600 Pennsylvania Avenue,Washington,12,1
8 | Marilyn,1313 Mockingbird Lane,Springfield,14,2
9 | Sherlock,221B Baker Street,London,16,123
10 | Clark,66 Lois Lane,Smallville,18,141
11 | Bart,742 Evergreen Terrace,Springfield,70,142
12 | Paul,19 N Square,Boston,70,55.2
13 | Peter,66 Parker Lane,New York,74,11.5123
14 | eos
15 | @address_data.rewind
16 | @address_data
17 | end
18 |
19 | def geo_data
20 | @geo_data ||= StringIO.new <<-eos
21 | state,city,lat,lon
22 | CA,San Francisco,37.727239,-123.032229
23 | NY,"New York",40.664274,-73.938500
24 | CA,"Los Angeles",34.019394,-118.410825
25 | IL,Chicago,41.837551,-87.681844
26 | TX,Houston,29.780472,-95.386342
27 | PA,Philadelphia,40.009376,-75.133346
28 | CA,"San Jose",37.296867,-121.819306
29 | MA,Boston,42.331960,-71.020173
30 | WA,Seattle,47.620499,-122.350876
31 | eos
32 | @geo_data.rewind
33 | @geo_data
34 | end
35 |
--------------------------------------------------------------------------------
/spec/fixtures/nested_files/school-data.csv:
--------------------------------------------------------------------------------
1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6
2 | 1,Normal,AL,1,35762,5,34.7834,-86.5685,Reichert University,1195,26318,0.53
3 | 2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61
4 | 3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50
5 | 4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1836,0.09
6 | 5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82
7 | 6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens Institute,1057,49203,0.06
8 | 7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50
9 | 8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59
10 | 9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19
11 | 10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59
12 | 11,Montgomery,NULL,1,36117,5,32.3643,-86.2957,Auburn University at Montgomery,940,49879,0.64
13 |
--------------------------------------------------------------------------------
/spec/lib/expression/eval_spec.rb:
--------------------------------------------------------------------------------
1 | require 'expression/parser'
2 | require 'expression/eval'
3 |
4 | describe Expression::Eval do
5 |
6 | let(:parser) { Expression::Parser.new }
7 | let(:eval) { Expression::Eval.new }
8 | let(:values) {{ 'f' => 0, 't' => 1 }}
9 |
10 | it "simple 'or'" do
11 | expect(
12 | eval.apply(parser.parse('t or f'), variables: values)
13 | ).to eq(1)
14 | end
15 |
16 | describe "simple 'and'" do
17 | it "true and false" do
18 | expect(
19 | eval.apply(parser.parse('t and f'), variables: values)
20 | ).to eq(0)
21 | end
22 |
23 | it "false and true" do
24 | expect(
25 | eval.apply(parser.parse('f and t'), variables: values)
26 | ).to eq(0)
27 | end
28 | end
29 |
30 | it "multiple operands" do
31 | expect(
32 | eval.apply(parser.parse('f or f or t'), variables: values)
33 | ).to eq(1)
34 | end
35 |
36 | describe "parens" do
37 | it "nested 'or'" do
38 | expect(
39 | eval.apply(parser.parse('(f or t) and t'), variables: values)
40 | ).to eq(1)
41 | end
42 |
43 | it "nested 'and'" do
44 | expect(
45 | eval.apply(parser.parse('(f and t) or f'), variables: values)
46 | ).to eq(0)
47 | end
48 | end
49 | end
50 |
--------------------------------------------------------------------------------
/spec/fixtures/schools/schools.csv:
--------------------------------------------------------------------------------
1 | UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,PREDDEG,UGDS,MENONLY,WOMENONLY,C150_4_POOLED_SUPP,C150_L4_POOLED_SUPP,earn_2002_p10,gt_25k_2006_p6
2 | 1,Normal,AL,1,35762,5,34.7834,-86.5685,Indigo Peak School,639,1,183504,0,0,NULL,0.16,3800,0.61
3 | 2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Warm Thread Beauty College,1218,3,210739,0,0,0.62,NULL,13566,0.10
4 | 3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Arrogant Abyss University,613,1,116967,0,0,NULL,0,1177,0.84
5 | 4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Greasy Marsh Institute,590,1,81254,0,1,NULL,NULL,54146,0.49
6 | 5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Harber Institute of Technology,1355,1,256538,1,0,0,0.91,38553,0.32
7 | 6,Athens,AL,1,35611,5,34.8056,-86.9651,Unsightly Mountain School of Fine Art,1201,1,139899,0,0,NULL,0.87,55899,0.95
8 | 7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Auburn University College,740,3,165974,0,0,0.21,NULL,51608,0.73
9 | 8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Birmingham School,1084,1,224554,0,0,NULL,0.70,29545,0.67
10 | 9,Tanner,AL,1,35671,5,34.6543,-86.9491,Conn Institute of Technology,1171,4,87710,0,0,NULL,0.56,58307,0.63
11 | 10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Hollow Resonance Institute,1058,2,97265,0,0,NULL,0.59,17880,0.36
12 |
--------------------------------------------------------------------------------
/spec/fixtures/schools/data.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | version: Aug6-2015-08-10-23:48-0600
3 | api: fakeschool
4 | index: fakeschool-data
5 | unique:
6 | - id
7 | options:
8 | limit_files: 1
9 | limit_rows: 100
10 |
11 | dictionary:
12 | id:
13 | source: UNITID
14 | type: integer
15 | description: Unit ID for institution
16 | school.name:
17 | source: INSTNM
18 | description: Institution name
19 | school.city:
20 | source: CITY_MAIN
21 | description: City
22 | school.state:
23 | source: STABBR_MAIN
24 | description: State postcode
25 | school.zip:
26 | source: ZIP_MAIN
27 | type: integer
28 | description: ZIP code
29 | completion.rate.lt_four_year:
30 | source: C150_L4_POOLED_SUPP
31 | type: float
32 | description: 150% completion rate for less-than-four-year institutions, pooled in two-year rolling averages and suppressed for small n size
33 | completion.rate.four_year:
34 | source: C150_4_POOLED_SUPP
35 | type: float
36 | description: 150% completion rate for four-year institutions, pooled in two-year rolling averages and suppressed for small n size
37 | completion.rate.overall:
38 | calculate: C150_L4_POOLED_SUPP or C150_4_POOLED_SUPP
39 | type: float
40 | description: 150% completion rate for the institution, independent of degree
41 |
42 | files:
43 | - name: schools.csv
44 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | ruby '2.2.4'
3 |
4 | # Distribute your app as a gem
5 | # gemspec
6 |
7 | # Server requirements
8 | # gem 'thin' # or mongrel
9 | # gem 'trinidad', :platform => 'jruby'
10 |
11 | # Optional JSON codec (faster performance)
12 | # gem 'oj'
13 |
14 | # open-data-maker requirements
15 | gem 'elasticsearch'
16 | gem 'stretchy'
17 | gem 'hashie'
18 | gem 'cf-app-utils'
19 | #gem 'unicorn'
20 | gem 'puma'
21 | gem 'safe_yaml'
22 | gem 'aws-sdk', '~> 2'
23 | gem 'actionview'
24 | gem 'dotenv'
25 | gem 'oj'
26 | gem 'parslet'
27 | gem 'parallel'
28 |
29 | # Project requirements
30 | gem 'rake'
31 |
32 | # Component requirements
33 | gem 'sass'
34 | gem 'liquify'
35 | gem 'liquid', '= 3.0.3'
36 | gem 'erubis'
37 |
38 | # Test requirements
39 | group :test do
40 | gem 'rspec'
41 | gem 'rspec-mocks'
42 | gem 'rack-test', :require => 'rack/test'
43 | end
44 |
45 | group 'dev' do
46 | gem 'google_drive'
47 | gem 'ruby-prof'
48 |
49 | end
50 | # Padrino Stable Gem
51 | gem 'padrino', '0.12.5'
52 |
53 | gem 'pry', :group => ['development', 'test']
54 | gem 'pry-byebug', :group => ['development', 'test']
55 | gem 'newrelic_rpm'
56 |
57 | # Or Padrino Edge
58 | # gem 'padrino', :github => 'padrino/padrino-framework'
59 |
60 | # Or Individual Gems
61 | # %w(core support gen helpers cache mailer admin).each do |g|
62 | # gem 'padrino-' + g, '0.12.5'
63 | # end
64 |
--------------------------------------------------------------------------------
/lib/data_magic/index/output.rb:
--------------------------------------------------------------------------------
1 | module DataMagic
2 | module Index
3 | class Output
4 | attr_reader :row_count, :headers, :skipped
5 |
6 | def initialize
7 | @row_count = 0
8 | @skipped = []
9 | end
10 |
11 | def set_headers(doc)
12 | return if headers
13 | @headers = doc.headers
14 | end
15 |
16 | def skipping(id)
17 | skipped << id
18 | end
19 |
20 | def increment(count = 1)
21 | @row_count += count
22 | end
23 |
24 | def validate!
25 | raise DataMagic::InvalidData, "zero rows" if empty?
26 | end
27 |
28 | def empty?
29 | row_count == 0
30 | end
31 |
32 | def log(doc)
33 | log_0(doc) if empty?
34 | log_marker if row_count % 500 == 0
35 | end
36 |
37 | def log_skips
38 | return if skipped.empty?
39 | logger.info "skipped (missing parent id): #{skipped.join(',')}"
40 | end
41 |
42 | def log_limit
43 | logger.info "done now, limiting rows to #{row_count}"
44 | end
45 |
46 | private
47 |
48 | def log_0(document)
49 | logger.debug "csv parsed"
50 | logger.info "row#{row_count} -> #{document.preview}"
51 | end
52 |
53 | def log_marker
54 | logger.info "indexing rows: #{row_count}..."
55 | end
56 | end
57 | end
58 | end
59 |
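60 | # Rough lifecycle sketch (assumption: the Importer delegates these calls, as
61 | # suggested by RowImporter; `doc` is an illustrative document):
62 | #
63 | #   output = DataMagic::Index::Output.new
64 | #   output.set_headers(doc)  # remembers headers from the first document only
65 | #   output.increment         # counts an indexed row
66 | #   output.skipping(doc.id)  # records an id skipped for a missing parent
67 | #   output.validate!         # raises DataMagic::InvalidData, "zero rows" if empty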
--------------------------------------------------------------------------------
/spec/features/web_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | describe 'app', type: 'feature' do
4 | before do
5 | DataMagic.destroy
6 | ENV['DATA_PATH'] = './spec/fixtures/sample-data'
7 | DataMagic.init(load_now: true)
8 | end
9 |
10 | after do
11 | DataMagic.destroy
12 | end
13 |
14 | it "should load the home page" do
15 | get '/'
16 | expect(last_response).to be_ok
17 | end
18 |
19 | it "should display links to endpoints" do
20 | get '/'
21 | expect(last_response.body).to include 'cities'
22 | end
23 |
24 | it "should display a list of categories" do
25 | get '/'
26 | expect(last_response.body).to include('Browse Data Details by Category')
27 | expect(last_response.body).to include('General') # category name
28 | expect(last_response.body).to include('general information about the city, including standard identifiers')
29 | end
30 |
31 | it "should load the correct category page" do
32 | get '/category/general'
33 | expect(last_response.body).to include('Data Details for the')
34 | expect(last_response.body).to include('category_entry = {"title":"General"')
35 | expect(last_response.body).to include('population') # a field name
36 | expect(last_response.body).to include('The name of the city') # a field description
37 | expect(last_response.body).to include('literal') # field type
38 | end
39 | end
40 |
--------------------------------------------------------------------------------
/script/bootstrap:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 |
5 | fancy_echo() {
6 | local fmt="$1"; shift
7 |
8 | # shellcheck disable=SC2059
9 | printf "\n$fmt\n" "$@"
10 | }
11 |
12 | brew_install_or_upgrade() {
13 | if brew_is_installed "$1"; then
14 | if brew_is_upgradable "$1"; then
15 | fancy_echo "Upgrading %s ..." "$1"
16 | brew upgrade "$@"
17 | else
18 | fancy_echo "Already using the latest version of %s. Skipping ..." "$1"
19 | fi
20 | else
21 | fancy_echo "Installing %s ..." "$1"
22 | brew install "$@"
23 | fi
24 | }
25 |
26 | brew_is_installed() {
27 | brew list -1 | grep -Fqx "$1"
28 | }
29 |
30 | brew_is_upgradable() {
31 | ! brew outdated --quiet "$1" >/dev/null
32 | }
33 |
34 | brew_tap_is_installed() {
35 | brew tap | grep -Fqx "$1"
36 | }
37 |
38 | brew_tap() {
39 | if ! brew_tap_is_installed "$1"; then
40 | fancy_echo "Tapping $1..."
41 | brew tap "$1" 2> /dev/null
42 | fi
43 | }
44 |
45 | echo 'Installing dependencies...'
46 |
47 | if command -v brew >/dev/null; then
48 | brew update
49 |
50 | brew_tap 'homebrew/services'
51 | brew_tap 'homebrew/versions'
52 | brew_install_or_upgrade 'elasticsearch17'
53 |
54 | brew services restart elasticsearch17
55 |
56 | # elasticsearch takes several seconds to load
57 | sleep 10
58 | fi
59 |
60 | gem install bundler --conservative
61 | bundle check || bundle install
62 |
63 | echo "All done!"
64 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | As a work of the United States Government, this project is in the
2 | public domain within the United States.
3 |
4 | Additionally, we waive copyright and related rights in the work
5 | worldwide through the CC0 1.0 Universal public domain dedication.
6 |
7 | ## CC0 1.0 Universal Summary
8 |
9 | This is a human-readable summary of the
10 | [Legal Code (read the full text)](https://creativecommons.org/publicdomain/zero/1.0/legalcode).
11 |
12 | ### No Copyright
13 |
14 | The person who associated a work with this deed has dedicated the work to
15 | the public domain by waiving all of his or her rights to the work worldwide
16 | under copyright law, including all related and neighboring rights, to the
17 | extent allowed by law.
18 |
19 | You can copy, modify, distribute and perform the work, even for commercial
20 | purposes, all without asking permission.
21 |
22 | ### Other Information
23 |
24 | In no way are the patent or trademark rights of any person affected by CC0,
25 | nor are the rights that other persons may have in the work or in how the
26 | work is used, such as publicity or privacy rights.
27 |
28 | Unless expressly stated otherwise, the person who associated a work with
29 | this deed makes no warranties about the work, and disclaims liability for
30 | all uses of the work, to the fullest extent permitted by applicable law.
31 | When using or citing the work, you should not imply endorsement by the
32 | author or the affirmer.
33 |
34 |
--------------------------------------------------------------------------------
/DICTIONARY.md:
--------------------------------------------------------------------------------
1 | # Dictionary Format
2 |
3 | The data dictionary format may optionally be specified in the `data.yaml` file. If unspecified, all columns are imported as strings.
4 |
5 | ## Simple Data Types
6 |
7 | ```
8 | dictionary:
9 | name:
10 | source: COLUMN_NAME
11 | type: integer
12 | description: explanation of where this data comes from and its meaning
13 | ```
14 |
15 | In the above example:
16 | * `source:` is the name of the column in the CSV. (This doesn't have to be all caps; we just find that to be common in government datasets.)
17 | * `type:` may be `integer`, `float`, or `string` (the default); the import code also recognizes `literal`, `name`, `autocomplete`, and `boolean`
18 | * `description:` text description suitable for developer documentation or information provided to data analysts
19 |
20 | ## Calculated columns
21 |
22 | Optionally, you can add "columns" by calculating fields at import time based on multiple CSV columns.
23 |
24 | ```
25 | academics.program.degree.health:
26 | calculate: CIP51ASSOC or CIP51BACHL
27 | type: integer
28 | description: Associate or Bachelor's degree in Health
29 | ```
30 |
31 | Multiple operations are supported. In the following example, suppose the columns `apples`, `oranges`, and `plums` each hold `0` when that fruit is unavailable and `1` when it is available; these values can then be combined with `or` to create a field representing whether any fruit is available.
32 |
33 | ```
34 | fruit:
35 | calculate: apples or oranges or plums
36 | type: integer
37 | description: is there any fruit available?
38 | ```
39 |
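40 | ## Notes on `or` semantics
41 |
42 | As a rough reference (this summarizes behavior covered in `spec/lib/expression_spec.rb`; see `lib/expression` for the implementation): `a or b` evaluates to `a` when `a` is a non-zero value, and otherwise to `b`.
43 | So `1 or nil` yields `1`, `nil or 0` yields `0`, and `0 or nil` yields `nil`: a missing (`nil`) value propagates rather than being treated as `0`.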
--------------------------------------------------------------------------------
/spec/lib/data_magic/name_type_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe "DataMagic name types" do
5 |
6 | before :example do
7 | DataMagic.destroy
8 | ENV['DATA_PATH'] = './spec/fixtures/types'
9 | DataMagic.init(load_now: true)
10 | end
11 | after :example do
12 | DataMagic.destroy
13 | end
14 |
15 | it "can search for one word" do
16 | response = DataMagic.search({'city.name' => 'New'}, fields:['city.name'])
17 | results = response['results'].sort {|a,b| a['city.name'] <=> b['city.name']}
18 | expect(results).to eq(
19 | [{"city.name"=>"New Orleans"}, {"city.name"=>"New York"}])
20 | end
21 |
22 | it "can search for multiple words" do
23 | response = DataMagic.search({'city.name' => 'New York'}, fields:['city.name'])
24 | results = response['results']
25 | expect(results).to eq(
26 | [{"city.name"=>"New York"}])
27 | end
28 |
29 | it "can search for partial words" do
30 | response = DataMagic.search({'city.name' => 'S Fran'}, fields:['city.name'])
31 | results = response['results']
32 | expect(results).to eq(
33 | [{"city.name"=>"San Francisco"}])
34 | end
35 |
36 | it "is not case sensitive" do
37 | response = DataMagic.search({'city.name' => 'nEW'}, fields:['city.name'])
38 | results = response['results'].sort {|a,b| a['city.name'] <=> b['city.name']}
39 | expect(results).to eq(
40 | [{"city.name"=>"New Orleans"}, {"city.name"=>"New York"}])
41 | end
42 | end
43 |
--------------------------------------------------------------------------------
/spec/lib/expression/parser_spec.rb:
--------------------------------------------------------------------------------
1 | require 'expression/parser'
2 |
3 | describe Expression::Parser do
4 |
5 | let(:parser) { Expression::Parser.new }
6 | describe 'vars' do
7 | it "parses one" do
8 | expect(parser.parse('one')).to eq(var: 'one')
9 | end
10 | it "preserves case " do
11 | expect(parser.parse('ONe')).to eq(var: 'ONe')
12 | end
13 | it "consumes trailing white space" do
14 | expect(parser.parse('one ')).to eq(var: 'one')
15 | end
16 | end
17 |
18 | it "parses or expression" do
19 | expect(parser.parse('apples or oranges')).to eq(
20 | {or: {left: {var: "apples"}, right: {var: "oranges"}}}
21 | )
22 | end
23 |
24 | it "parses and expression" do
25 | expect(parser.parse('apples and oranges')).to eq(
26 | {and: {left: {var: "apples"}, right: {var: "oranges"}}}
27 | )
28 | end
29 |
30 | describe "parens" do
31 | it "nested 'or'" do
32 | expect(parser.parse('(apples or cranberries) and nuts')).to eq(
33 | {:and => {
34 | :left=>{:or=>{:left=>{:var=>"apples"}, :right=>{:var=>"cranberries"}}},
35 | :right=>{:var=>"nuts"}}}
36 | )
37 | end
38 | it "nested 'and'" do
39 | expect(parser.parse('(nuts and cranberries) or apples')).to eq(
40 | { or: {
41 | left: { and: { left: {var: "nuts"}, right: {var:"cranberries"}}},
42 | right: { var: "apples" }
43 | }
44 | }
45 | )
46 | end
47 |
48 | end
49 |
50 | end
51 |
--------------------------------------------------------------------------------
/lib/expression/parser.rb:
--------------------------------------------------------------------------------
1 | require 'parslet'
2 | # based on https://github.com/kschiess/parslet/blob/master/example/boolean_algebra.rb
3 | # usage:
4 | # def parse(str)
5 | #   Expression::Parser.new.parse(str)
6 | #
7 | # rescue Parslet::ParseFailed => failure
8 | # puts failure.cause.ascii_tree
9 | # end
10 | #
11 | # tree = Expression::Parser.new.parse("one or two")
12 | # => {:or=>{:left=>{:var=>"one"@0}, :right=>{:var=>"two"@7}}}
13 | # Expression::Eval.new.apply(tree, variables: {"one"=>1, "two"=>2})
14 | #
15 | # Expression::Variables.new.apply(tree)
16 |
17 | class Expression
18 | class Parser < Parslet::Parser
19 | rule(:space) { match[" "].repeat(1) }
20 | rule(:space?) { space.maybe }
21 |
22 | rule(:lparen) { str("(") >> space? }
23 | rule(:rparen) { str(")") >> space? }
24 |
25 | rule(:and_operator) { str("and") >> space? }
26 | rule(:or_operator) { str("or") >> space? }
27 |
28 | rule(:var) { match["[^\s\(\)]"].repeat(1).as(:var) >> space? }
29 |
30 | # The primary rule deals with parentheses.
31 | rule(:primary) { lparen >> or_operation >> rparen | var }
32 |
33 | # Note that following rules are both right-recursive.
34 | rule(:and_operation) {
35 | (primary.as(:left) >> and_operator >>
36 | and_operation.as(:right)).as(:and) |
37 | primary }
38 |
39 | rule(:or_operation) {
40 | (and_operation.as(:left) >> or_operator >>
41 | or_operation.as(:right)).as(:or) |
42 | and_operation }
43 |
44 | # We start at the lowest precedence rule.
45 | root(:or_operation)
46 | end
47 | end
48 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/index/event_logger_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe DataMagic::Index::EventLogger do
5 | let(:event_logger) {
6 | l = DataMagic::Index::EventLogger.new
7 | allow(l).to receive(:logger).and_return(logger)
8 | l
9 | }
10 |
11 | let(:logger) { double('logger') }
12 |
13 | context 'when triggering an event with only a message argument' do
14 | it 'logs the message with the right level' do
15 | expect(logger).to receive(:info).with('hey!')
16 | event_logger.trigger('info', 'hey!')
17 |
18 | expect(logger).to receive(:debug).with('what happened?')
19 | event_logger.trigger('debug', 'what happened?')
20 |
21 | expect(logger).to receive(:warn).with('dude? everything ok?')
22 | event_logger.trigger('warn', 'dude? everything ok?')
23 |
24 | expect(logger).to receive(:error).with('FIRE IN THE HOLE!')
25 | event_logger.trigger('error', 'FIRE IN THE HOLE!')
26 | end
27 | end
28 |
29 | context 'when triggering an event with a message and an object' do
30 | it 'logs as a key value pair with an inspection of the object' do
31 | expect(logger).to receive(:info).with("foo: {:wild=>\"bar\"}")
32 | event_logger.trigger('info', 'foo', {wild: 'bar'})
33 | end
34 |
35 | it 'will shorten the object inspection when provided a limit' do
36 | expect(logger).to receive(:warn).with("foo: {:wild")
37 | event_logger.trigger('warn', 'foo', {wild: 'bar'}, 5)
38 | end
39 | end
40 | end
41 |
--------------------------------------------------------------------------------
/lib/data_magic/index/repository.rb:
--------------------------------------------------------------------------------
1 | module DataMagic
2 | module Index
3 | class Repository
4 | attr_reader :client, :document
5 |
6 | def initialize(client, document)
7 | @client = client
8 | @document = document
9 | end
10 |
11 | def save
12 | @skipped = false
13 | if client.creating?
14 | create
15 | else
16 | update
17 | end
18 | end
19 |
20 | def skipped?
21 | @skipped
22 | end
23 |
24 | private
25 |
26 | def update
27 | if client.allow_skips?
28 | update_with_rescue
29 | else
30 | update_without_rescue
31 | end
32 | end
33 |
34 | def create
35 | client.index({
36 | index: client.index_name,
37 | id: document.id,
38 | type: 'document',
39 | body: document.data
40 | })
41 | end
42 |
43 | def update_without_rescue
44 | client.update({
45 | index: client.index_name,
46 | id: document.id,
47 | type: 'document',
48 | body: {doc: document.data}
49 | })
50 | end
51 |
52 | # NotFound here means the parent document was never created; record a skip
53 | def update_with_rescue
54 | update_without_rescue
55 | rescue Elasticsearch::Transport::Transport::Errors::NotFound
56 | @skipped = true
57 | end
58 | end
59 | end
60 | end
61 |
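62 | # Usage sketch (mirrors spec/lib/data_magic/index/repository_spec.rb):
63 | #
64 | #   repo = DataMagic::Index::Repository.new(client, document)
65 | #   repo.save       # index when client.creating?, otherwise update
66 | #   repo.skipped?   # true when an update hit NotFound and skips are allowed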
--------------------------------------------------------------------------------
/lib/nested_hash.rb:
--------------------------------------------------------------------------------
1 | class NestedHash < Hash
2 |
3 | def initialize(hash = {}, default = nil, &block)
4 | default ? super(default) : super(&block)
5 | self.add(hash)
6 | end
7 |
8 | def add(hash)
9 | hash.each do |full_name, value|
10 | parts = full_name.to_s.split('.')
11 | last = parts.length - 1
12 | add_to = self
13 | parts.each_with_index do |name, index|
14 | if index == last
15 | add_to[name] = value
16 | else
17 | add_to[name] ||= {}
18 | add_to = add_to[name]
19 | end
20 | end
21 | end
22 | self
23 | end
24 |
25 | # generate a flat, non-nested hash
26 | # with keys that have dots representing the hierarchy
27 | def withdotkeys(deep_hash = self, flat_hash = {}, root = '')
28 | deep_hash.each do |key, value|
29 | if deep_hash[key].is_a?(Hash)
30 | flat_hash.merge! withdotkeys(value, flat_hash, root + key + '.')
31 | else
32 | key = "#{root}#{key}" if not root.empty?
33 | flat_hash[key] = value
34 | end
35 | end
36 | flat_hash
37 | end
38 |
39 | # generate a list of the keys with dots representing the hierarchy
40 | def dotkeys(row = self, prefix = '', path = [])
41 | human_names = []
42 | paths = []
43 | row.keys.each do |key|
44 | if row[key].is_a?(Hash)
45 | new_human_names = dotkeys(row[key], prefix + key + '.')
46 | human_names += new_human_names
47 | else
48 | human_names << prefix + key
49 | end
50 | end
51 | human_names
52 | end
53 |
54 | end
55 |
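56 | # Usage sketch (mirrors spec/lib/nested_hash_spec.rb):
57 | #
58 | #   nested = NestedHash.new("loc.x" => 1, "loc.y" => 2)
59 | #   # => {"loc" => {"x" => 1, "y" => 2}}
60 | #   nested.withdotkeys  # => {"loc.x" => 1, "loc.y" => 2}
61 | #   nested.dotkeys      # => ["loc.x", "loc.y"]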
--------------------------------------------------------------------------------
/app/views/home.liquid:
--------------------------------------------------------------------------------
1 |
10 |
11 | API endpoints
12 |
13 |
14 | {% for name in endpoints %}
15 | - {{ name }}
16 | {% endfor %}
17 |
18 |
19 | {% if examples.size > 0 %}
20 | Examples
21 |
22 |
23 | {% for ex in examples %}
24 | - {{ ex.name }} {{ ex.description }}
25 | {% endfor %}
26 |
27 | {% endif %}
28 |
29 | Browse Data Details by Category
30 |
33 |
34 |
53 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/calculated_columns_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe "calculated columns" do
5 |
6 | before :example do
7 | DataMagic.destroy
8 | ENV['DATA_PATH'] = data_path
9 | end
10 | after :example do
11 | DataMagic.destroy
12 | end
13 |
14 | describe "combine into float" do
15 | let(:data_path) { "./spec/fixtures/schools" }
16 | it "can combine two columns" do
17 | DataMagic.config = DataMagic::Config.new
18 | DataMagic.import_with_dictionary
19 | result = DataMagic.search({}, fields: ['id', 'completion.rate.overall'])
20 | results = result['results'].sort_by { |hash| hash['id'] }
21 | expect(results[0]).to eq('id' => 1, 'completion.rate.overall' => 0.16)
22 | expect(results[1]).to eq('id' => 2, 'completion.rate.overall' => 0.62)
23 | expect(results[2]).to eq('id' => 3, 'completion.rate.overall' => nil)
24 | expect(results[3]).to eq('id' => 4, 'completion.rate.overall' => nil)
25 | expect(results[4]).to eq('id' => 5, 'completion.rate.overall' => 0.91)
26 | end
27 | end
28 |
29 | describe "combine into boolean" do
30 | let(:data_path) { "./spec/fixtures/calculated_columns" }
31 | it "can combine multiple columns" do
32 | DataMagic.config = DataMagic::Config.new
33 | DataMagic.import_with_dictionary
34 | result = DataMagic.search({}, fields: %w(id summarybool))
35 | results = result['results'].sort_by { |hash| hash['id'] }
36 | expect(results[0]).to eq('id' => 1, 'summarybool' => true)
37 | expect(results[1]).to eq('id' => 2, 'summarybool' => false)
38 | expect(results[2]).to eq('id' => 3, 'summarybool' => true)
39 | end
40 | end
41 | end
42 |
--------------------------------------------------------------------------------
/spec/lib/expression_spec.rb:
--------------------------------------------------------------------------------
1 | require 'expression/expression'
2 |
3 | describe Expression do
4 | context "simple or expression" do
5 | it "can find variables" do
6 | expr = "ONE or TWO"
7 | expect(Expression.new(expr).variables).to eq(%w(ONE TWO))
8 | end
9 |
10 | it "evaluates: 0 OR 1 to be 1" do
11 | expr = "f or t"
12 | values = {f:0, t:1}
13 | expect(Expression.new(expr).evaluate(values)).to eq(1)
14 | end
15 |
16 | it "evaluates: 1 OR 0 to be 1" do
17 | expr = "t or f"
18 | values = {f:0, t:1}
19 | expect(Expression.new(expr).evaluate(values)).to eq(1)
20 | end
21 |
22 | it "evaluates: 0 OR 0 to be 0" do
23 | expr = "f1 or f2"
24 | values = {f1:0, f2:0}
25 | expect(Expression.new(expr).evaluate(values)).to eq(0)
26 | end
27 |
28 | it "evaluates: 1 OR 1 to be 1" do
29 | expr = "t1 or t2"
30 | values = {t1:1, t2:1}
31 | expect(Expression.new(expr).evaluate(values)).to eq(1)
32 | end
33 |
34 | it "evaluates: 1 OR nil to be 1" do
35 | expr = "t1 or t2"
36 | values = {t1:1, t2:nil}
37 | expect(Expression.new(expr).evaluate(values)).to eq(1)
38 | end
39 |
40 | it "evaluates: 0 OR nil to be nil" do
41 | expr = "t1 or t2"
42 | values = {t1:0, t2:nil}
43 | expect(Expression.new(expr).evaluate(values)).to eq(nil)
44 | end
45 |
46 | it "evaluates: nil OR 0 to be 0" do
47 | expr = "t1 or t2"
48 | values = {t1:nil, t2:0}
49 | expect(Expression.new(expr).evaluate(values)).to eq(0)
50 | end
51 |
52 | it "evaluates: nil OR nil to be nil" do
53 | expr = "t1 or t2"
54 | values = {t1:nil, t2:nil}
55 | expect(Expression.new(expr).evaluate(values)).to eq(nil)
56 | end
57 | end
58 | end
59 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/create_index_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe "DataMagic #init" do
5 | before (:all) do
6 | ENV['DATA_PATH'] = './spec/fixtures/import_with_dictionary'
7 | end
8 |
9 | after(:each) do
10 | DataMagic.destroy
11 | end
12 |
13 | context "with no options" do
14 | it "creates index only once" do
15 | expect(DataMagic).to receive(:create_index).once
16 | DataMagic.init
17 | end
18 |
19 | it "creates index" do
20 | DataMagic.init
21 | expect(DataMagic.config.index_exists?).to be true
22 | end
23 |
24 | it "does not re-create index with subsequent call to #import_with_dictionary" do
25 | expect(DataMagic).to receive(:create_index).once
26 | DataMagic.init
27 | DataMagic.import_with_dictionary
28 | end
29 | end
30 |
31 |
32 | context "with load_now: false" do
33 | it "does not call #create_index" do
34 | expect(DataMagic).not_to receive(:create_index)
35 | DataMagic.init(load_now: false)
36 | end
37 |
38 | it "does not create index" do
39 | DataMagic.init(load_now: false)
40 | expect(DataMagic.config.index_exists?).to be false
41 | end
42 |
43 | it "creates index with subsequent call to #import_with_dictionary" do
44 | DataMagic.init(load_now: false)
45 | DataMagic.import_with_dictionary
46 | expect(DataMagic.config.index_exists?).to be true
47 | end
48 |
49 | it "creates index with subsequent call to #import_csv" do
50 | ENV['DATA_PATH'] = './spec/fixtures/minimal'
51 | DataMagic.init(load_now: false)
52 | data_str = <<-eos
53 | a,b
54 | 1,2
55 | 3,4
56 | eos
57 | data = StringIO.new(data_str)
58 | DataMagic.import_csv(data)
59 | expect(DataMagic.config.index_exists?).to be true
60 | end
61 | end
62 | end
--------------------------------------------------------------------------------
/spec/lib/nested_hash_spec.rb:
--------------------------------------------------------------------------------
1 | require 'nested_hash'
2 |
3 | describe NestedHash do
4 | let(:input) { {"loc.x" => 1, "loc.y" => 2, "foo.a" => 10, "foo.b" => 20, "loc.z" => 3}}
5 | let(:expected) {{"loc" => {"x" => 1, "y" => 2, "z" => 3}, "foo" => {"a" => 10, "b" => 20}}}
6 |
7 | let(:symbol_keys) { {x:1, y:2}}
8 | let(:symbol_keys_result) { {'x' => 1, 'y' => 2}}
9 |
10 |
11 | it ".add created nested hash elements for string keys with '.'" do
12 | result = NestedHash.new.add(input)
13 | expect(result).to eq(expected)
14 | end
15 |
16 | it "does no harm when initialized with an already nested hash" do
17 | expect(NestedHash.new(expected)).to eq(expected)
18 | end
19 |
20 | context "methods" do
21 | let (:result) { NestedHash.new(input) }
22 | it "can initialize with another Hash" do
23 | expect(result).to eq(expected)
24 | end
25 |
26 | it "can generate dotkeys" do
27 | expect(result.dotkeys.sort).to eq(input.keys.sort)
28 | end
29 |
30 | it "withdotkeys generates keys with '.'" do
31 | expect(result.withdotkeys).to eq(input)
32 | end
33 |
34 | it "dotkeys and withdotkeys have same order" do
35 | expect(result.withdotkeys.keys).to eq(result.dotkeys)
36 | end
37 | end
38 |
39 |
40 | it "turns symbol keys into simple strings" do
41 | result = NestedHash.new.add(symbol_keys)
42 | expect(result).to eq(symbol_keys_result)
43 | end
44 |
45 | context "deeply nested" do
46 | let(:input) { {"info.loc.x" => 0.11, "info.loc.y" => 0.222, "foo.a" => 10, "foo.b" => 20}}
47 | let(:expected) { {"info" => {"loc" => {"x" => 0.11, "y" => 0.222}}, "foo" => {"a" => 10, "b" => 20}}}
48 |
49 | it "creates nested hash elements for string keys with '.'" do
50 | result = NestedHash.new.add(input)
51 | expect(result).to eq(expected)
52 | end
53 |
54 | end
55 |
56 | end
57 |
--------------------------------------------------------------------------------
/app/views/category.liquid:
--------------------------------------------------------------------------------
1 |
10 |
11 | Data Details for the Category
12 |
13 |
17 |
18 |
19 | Back to the list of Categories
20 |
21 |
55 |
--------------------------------------------------------------------------------
/config/apps.rb:
--------------------------------------------------------------------------------
1 | ##
2 | # This file mounts each app in the Padrino project to a specified sub-uri.
3 | # You can mount additional applications using any of these commands below:
4 | #
5 | # Padrino.mount('blog').to('/blog')
6 | # Padrino.mount('blog', :app_class => 'BlogApp').to('/blog')
7 | # Padrino.mount('blog', :app_file => 'path/to/blog/app.rb').to('/blog')
8 | #
9 | # You can also map apps to a specified host:
10 | #
11 | # Padrino.mount('Admin').host('admin.example.org')
12 | # Padrino.mount('WebSite').host(/.*\.?example.org/)
13 | # Padrino.mount('Foo').to('/foo').host('bar.example.org')
14 | #
15 | # Note 1: Mounted apps (by default) should be placed into the project root at '/app_name'.
16 | # Note 2: If you use the host matching remember to respect the order of the rules.
17 | #
18 | # By default, this file mounts the primary app which was generated with this project.
19 | # However, the mounted app can be modified as needed:
20 | #
21 | # Padrino.mount('AppName', :app_file => 'path/to/file', :app_class => 'BlogApp').to('/')
22 | #
23 |
24 | ##
25 | # Setup global project settings for your apps. These settings are inherited by every subapp. You can
26 | # override these settings in the subapps as needed.
27 | #
28 | Padrino.configure_apps do
29 | # enable :sessions
30 | set :session_secret, 'ffb8bfc2d71e2ad938950169de2757ab7b73b1cd5fbf91b4b912ae493dc5b70f'
31 | set :protection, :except => :path_traversal
32 | set :protect_from_csrf, true
33 |
34 | set :allow_origin, :any
35 |
36 | end
37 |
38 | # If needed, mount the app that does indexing
39 | if ENV['INDEX_APP'] == "enable"
40 | puts "mounting index app"
41 | Padrino.mount('OpenDataMaker::IndexApp', :app_file => Padrino.root('app/index_app.rb')).to('/index')
42 | end
43 |
44 | # Mounts the core application for this project
45 | Padrino.mount('OpenDataMaker::App', :app_file => Padrino.root('app/app.rb')).to('/')
46 |
--------------------------------------------------------------------------------
/lib/data_magic/index/row_importer.rb:
--------------------------------------------------------------------------------
1 | require 'forwardable'
2 |
3 | module DataMagic
4 | module Index
5 | class RowImporter
6 | attr_reader :row, :importer
7 |
8 | def initialize(row, importer)
9 | @row = row
10 | @importer = importer
11 | end
12 |
13 | def process
14 | log_row_start
15 | before_save
16 | save
17 | after_save
18 | log_row_end
19 | end
20 |
21 | def document
22 | @document ||= DocumentBuilder.create(row, importer.builder_data, config)
23 | end
24 |
25 | def repository
26 | @repository ||= Repository.new(importer.client, document)
27 | end
28 |
29 | private
30 |
31 | def log_row_start
32 | trigger("debug", "csv parsed") if importer.empty?
33 | trigger("info", "row #{importer.row_count}", document, 500) if importer.row_count % 500 == 0
34 | #trigger("info", "id", document.id)
35 | if document.id_empty?
36 | trigger("warn", "blank id")
37 | trigger("warn", "unique", config.data["unique"])
38 | trigger("warn", "in row", document, 255)
39 | end
40 | end
41 |
42 | def before_save
43 | importer.set_headers(document)
44 | end
45 |
46 | def save
47 | repository.save
48 | end
49 |
50 | def after_save
51 | importer.skipping(document.id) if repository.skipped?
52 | importer.increment
53 | end
54 |
55 | def log_row_end
56 | return unless importer.at_limit?
57 | trigger("info", "done now, limiting rows to #{importer.row_count}")
58 | end
59 |
60 | def config
61 | DataMagic.config
62 | end
63 |
64 | extend Forwardable
65 |
66 | def_delegators :importer, :trigger
67 |
68 | def self.process(*args)
69 | new(*args).process
70 | end
71 | end
72 | end
73 | end
74 |
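75 | # Usage sketch (assumption: this is called once per CSV row by the Importer):
76 | #
77 | #   DataMagic::Index::RowImporter.process(row, importer)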
--------------------------------------------------------------------------------
/spec/lib/data_magic/index/document_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe DataMagic::Index::Document do
5 | before do
6 | allow(DataMagic).to receive(:config).and_return(config)
7 | end
8 |
9 | let(:document) { DataMagic::Index::Document.new(data) }
10 | let(:config) { DataMagic::Config.new() }
11 | let(:data) { {} }
12 |
13 | context 'when configured without any unique keys' do
14 | before do
15 | config.data['unique'] = []
16 | end
17 |
18 | it 'id should be nil' do
19 | expect(document.id).to be(nil)
20 | end
21 |
22 | it 'id should not be empty though' do
23 | expect(document.id_empty?).to be_falsey
24 | end
25 | end
26 |
27 | context 'when configured with the default keys' do
28 | context 'and there is no data' do
29 | it 'id should be an empty string' do
30 | expect(document.id).to eq('')
31 | end
32 |
33 | it 'id should be considered empty' do
34 | expect(document.id_empty?).to be_truthy
35 | end
36 | end
37 |
38 | context 'when there is data' do
39 | let(:data) {
40 | {"name" => "foo", "state"=>"MA"}
41 | }
42 |
43 | it 'id should be the value for the name key' do
44 | expect(document.id).to eq('foo')
45 | end
46 |
47 | it 'id should not be considered empty' do
48 | expect(document.id_empty?).to be_falsey
49 | end
50 | end
51 | end
52 |
53 | context 'with custom id configuration' do
54 | let(:data) {
55 | {"name" => "foo", "state"=>"MA"}
56 | }
57 |
58 | before do
59 | config.data['unique'] = ['name', 'state']
60 | end
61 |
62 | it 'id should build the right id for the data' do
63 | expect(document.id).to eq('foo:MA')
64 | end
65 |
66 | it 'id should not be considered empty' do
67 | expect(document.id_empty?).to be_falsey
68 | end
69 | end
70 | end
71 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/import_with_nested_files_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe "unique key(s)" do
5 |
6 | before :example do
7 | DataMagic.destroy
8 | ENV['DATA_PATH'] = './spec/fixtures/nested_files'
9 | DataMagic.config = DataMagic::Config.new
10 | DataMagic.import_with_dictionary
11 | end
12 | after :example do
13 | DataMagic.destroy
14 | end
15 | let(:query) { {} }
16 | let(:sort) { nil }
17 | let(:result) { DataMagic.search(query, sort: sort) }
18 | let(:first) { result['results'].first }
19 | let(:id_one) { result['results'].find { |item| item['id'] == '1' } }
20 | let(:total) { result['metadata']['total'] }
21 |
22 | it "creates one document per unique id" do
23 | expect(total).to eq(11)
24 | end
25 |
26 | it "nests documents per unique id" do
27 | expect(id_one['id']).to eq('1')
28 | expect(id_one['2013']).to_not be_nil
29 | end
30 |
31 | it "root document contains special 'only' fields" do
32 | expect(id_one['id']).to eq('1')
33 | expect(id_one['name']).to eq('Reichert University')
34 | expect(id_one['city']).to eq('Normal')
35 | expect(id_one['state']).to eq('AL')
36 | end
37 |
38 | context "can import a subset of fields" do
39 | context "and when searching for a field value" do
40 | let(:query) { {zipcode: "35762"} }
41 | it "and doesn't find column" do
42 | expect(total).to eq(0)
43 | end
44 | end
45 | it "and doesn't include extra field" do
46 | expect(first['zipcode']).to be(nil)
47 | end
48 | end
49 |
50 | context "when searching on a nested field" do
51 | let(:query) { { '2013.earnings.6_yrs_after_entry.median' => 26318 } }
52 | it "can find the correct results" do
53 | expect(total).to eq(1)
54 | expect(first['2013']['earnings']['6_yrs_after_entry']).to eq({"percent_gt_25k"=>0.53, "median"=>26318})
55 | end
56 | end
57 |
58 | context "when sorting by a nested field" do
59 | let(:sort) { '2013.earnings.6_yrs_after_entry.median' }
60 | it "can find the right first result" do
61 | expect(total).to eq(11)
62 | expect(first['2013']['earnings']['6_yrs_after_entry']).to eq({"percent_gt_25k"=>0.09, "median"=>1836})
63 | end
64 | end
65 | end
66 |
--------------------------------------------------------------------------------
/app/stylesheets/application.sass:
--------------------------------------------------------------------------------
1 | body
2 | -webkit-font-smoothing: antialiased
3 | font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif
4 | font-size: 1em
5 | line-height: 1.5
6 | color: #333
7 |
8 | h1, h2, h3, h4, h5, h6
9 | font-family: "Raleway", "Helvetica Neue", Helvetica, Arial, sans-serif
10 | line-height: 1.1em
11 | margin: 0
12 | text-rendering: optimizeLegibility
13 |
14 | p
15 | margin: 0 0 0.75em
16 |
17 | hr
18 | border-bottom: 1px solid silver
19 | border-left: none
20 | border-right: none
21 | border-top: none
22 | margin: 1em 0
23 |
24 | img
25 | -webkit-user-select: none
26 | cursor: zoom-in
27 | margin: 0
28 | max-width: 50%
29 |
30 | .logo
31 | height: 150px
32 | width: 150px
33 | top: 50px
34 | left: 50px
35 | z-index: 20
36 |
37 | @media screen and (max-width: 995px)
38 | .logo
39 | height: 100px
40 | width: 100px
41 | top: 40px
42 | left: 20px
43 |
44 | @media screen and (max-width: 785px)
45 | .logo
46 | height: 75px
47 | width: 75px
48 |
49 | @media screen and (max-width: 590px)
50 | .logo
51 | top: 73px
52 |
53 | @media screen and (max-width: 480px)
54 | .logo
55 | top: 16px
56 | left: 0px
57 |
58 | .bottom-margin
59 | margin-bottom: 0.5em
60 | color: #c00
61 |
62 | .title
63 | text-align: center
64 | font-family: "Raleway", "Helvetica Neue", Helvetica, Arial, sans-serif
65 | font-size: 2em
66 | line-height: 2em
67 |
68 | .header
69 | background-color: #9cf
70 |
71 | .categories .category
72 | margin: 5px
73 | padding: 15px
74 | border: solid 1px silver
75 | word-wrap: break-word
76 | display: inline-block
77 | width: 92%
78 | background-color: #ffc
79 | a
80 | color: black
81 | text-decoration: none
82 | &:visited
83 | color: black
84 |
85 | .categories__column
86 | display: inline-block
87 | width: 100%
88 | vertical-align: top
89 | -webkit-column-count: 2
90 | -moz-column-count: 2
91 | column-count: 2
92 | column-gap: .2em
93 | -webkit-column-gap: .2em
94 | -moz-column-gap: .2em
95 |
96 | .category__name
97 | font-size: 18px
98 | font-weight: bold
99 | margin-bottom: 5px
100 | color: #c00
101 |
102 | .category__fields
103 | list-style: none
104 | padding: 0
105 |
106 | .category__field-name
107 | font-size: 15px
108 | font-weight: bold
109 | margin-bottom: 2px
110 | color: #c00
111 | width: 80%
112 |
113 | .category__field-type
114 | font-size: 15px
115 | font-weight: bold
116 | color: #c00
117 | width: 10%
118 | float: right
119 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/index/repository_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe DataMagic::Index::Repository do
5 | let(:repository) { DataMagic::Index::Repository.new(super_client, document) }
6 |
7 | let(:super_client) { double('super client', index_name: 'index') }
8 | let(:document) { double('document', {id: 'id', data: 'data'}) }
9 |
10 | context 'when super client is creating' do
11 | before do
12 | allow(super_client).to receive(:creating?).and_return(true)
13 | allow(super_client).to receive(:index)
14 | end
15 |
16 | it '#save creates an index' do
17 | expect(super_client).to receive(:index).with({
18 | index: 'index',
19 | id: 'id',
20 | type: 'document',
21 | body: 'data'
22 | })
23 | repository.save
24 | end
25 |
26 | it '#save will not be skipped when successful' do
27 | repository.save
28 | expect(repository.skipped?).to be_falsey
29 | end
30 | end
31 |
32 | context 'when super client is not creating' do
33 | before do
34 | allow(super_client).to receive(:creating?).and_return(false)
35 | allow(super_client).to receive(:allow_skips?)
36 | allow(super_client).to receive(:update)
37 | end
38 |
39 | it '#save updates an index' do
40 | expect(super_client).to receive(:update).with({
41 | index: 'index',
42 | id: 'id',
43 | type: 'document',
44 | body: {doc: 'data'}
45 | })
46 | repository.save
47 | end
48 |
49 | it '#save will not be skipped when successful' do
50 | repository.save
51 | expect(repository.skipped?).to be_falsey
52 | end
53 | end
54 |
55 | context 'when super client is not creating, not skipping and an error is raised' do
56 | before do
57 | allow(super_client).to receive(:creating?).and_return(false)
58 | allow(super_client).to receive(:allow_skips?).and_return(false)
59 | end
60 |
61 | it '#save raises an error' do
62 | allow(super_client).to receive(:update).and_raise(Elasticsearch::Transport::Transport::Errors::NotFound)
63 | expect {
64 | repository.save
65 | }.to raise_error(Elasticsearch::Transport::Transport::Errors::NotFound)
66 | end
67 | end
68 |
69 | context 'when super client is not creating, skipping and an error is raised' do
70 | before do
71 | allow(super_client).to receive(:creating?).and_return(false)
72 | allow(super_client).to receive(:allow_skips?).and_return(true)
73 | end
74 |
75 | it '#save marks the repository as skipped' do
76 | allow(super_client).to receive(:update).and_raise(Elasticsearch::Transport::Transport::Errors::NotFound)
77 | expect {
78 | repository.save
79 | }.not_to raise_error
80 | expect(repository.skipped?).to eq(true)
81 | end
82 | end
83 | end
84 |
--------------------------------------------------------------------------------
/lib/data_magic/index.rb:
--------------------------------------------------------------------------------
1 | require 'forwardable'
2 |
3 | require_relative 'config'
4 | require_relative 'index/builder_data'
5 | require_relative 'index/event_logger'
6 | require_relative 'index/document'
7 | require_relative 'index/document_builder'
8 | require_relative 'index/importer'
9 | require_relative 'index/output'
10 | require_relative 'index/repository'
11 | require_relative 'index/row_importer'
12 | require_relative 'index/super_client'
13 |
14 | require 'action_view' # for distance_of_time_in_words (logging time)
15 | include ActionView::Helpers::DateHelper # for distance_of_time_in_words (logging time)
16 |
17 | module DataMagic
18 | # data could be a String or an io stream
19 | def self.import_csv(data, options={})
20 | Index::Importer.process(data, options)
21 | end
22 |
23 | # pre-condition: index is already created w/ config
24 | def self.index_with_dictionary(options = {})
25 | start_time = Time.now
26 | Config.logger.debug "--- index_with_dictionary, starting at #{start_time}"
27 |
28 | logger.info "files: #{self.config.files}"
29 | config.files.each_with_index do |filepath, index|
30 | fname = filepath.split('/').last
31 | logger.debug "indexing #{fname} #{index} file config:#{config.additional_data_for_file(index).inspect}"
32 | options[:add_data] = config.additional_data_for_file(index)
33 | options[:only] = config.info_for_file(index, :only)
34 | options[:nest] = config.info_for_file(index, :nest)
35 | begin
36 | logger.debug "*"*40
37 | logger.debug "* #{filepath}"
38 | logger.debug "*"*40
39 | file_start = Time.now
40 | data = config.read_path(filepath)
41 | rows, _ = DataMagic.import_csv(data, options)
42 | file_end = Time.now
43 | logger.debug "imported #{rows} rows in #{distance_of_time_in_words(file_end, file_start)}, ms: #{file_end - file_start}"
44 | rescue DataMagic::InvalidData => e
45 | Config.logger.debug "Error: skipping #{filepath}, #{e.message}"
46 | end
47 | end
48 | end_time = Time.now
49 | logger.debug "indexing complete: #{distance_of_time_in_words(end_time, start_time)}"
50 | logger.debug "duration: #{end_time - start_time}"
51 | end
52 |
53 | def self.import_with_dictionary(options = {})
54 | #logger.debug("field_mapping: #{field_mapping.inspect}")
55 | options[:mapping] = config.field_mapping
56 | options = options.merge(config.options)
57 |
58 | es_index_name = self.config.load_datayaml(options[:data_path])
59 | unless config.index_exists?(es_index_name)
60 | logger.info "creating #{es_index_name}" # TO DO: fix #14
61 | create_index es_index_name, config.field_types
62 | end
63 |
64 | index_with_dictionary(options)
65 |
66 | end # import_with_dictionary
67 |
68 | private # NOTE: no effect on `def self.` methods, so valid_types remains public
69 | def self.valid_types
70 | %w[integer float string literal name autocomplete boolean]
71 | end
72 |
73 | end # module DataMagic
74 |
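75 | # Usage sketch (mirrors the setup used throughout spec/lib/data_magic;
76 | # the fixture path is illustrative):
77 | #
78 | #   ENV['DATA_PATH'] = './spec/fixtures/schools'
79 | #   DataMagic.config = DataMagic::Config.new
80 | #   DataMagic.import_with_dictionary  # creates the index if needed, then imports each file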
--------------------------------------------------------------------------------
/spec/lib/data_magic/import_without_data_yaml_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 |
4 | describe "DataMagic #import_without_data_yaml" do
5 | describe "without ALLOW_MISSING_YML" do
6 | it "not found locally raises error" do
7 | ENV['DATA_PATH'] = './spec/fixtures/cities_without_yml'
8 | expect {
9 | DataMagic.init(load_now: true)
10 | }.to raise_error(IOError, "No data.y?ml found at ./spec/fixtures/cities_without_yml. Did you mean to define ALLOW_MISSING_YML environment variable?")
11 | end
12 | it "not found on s3 raises error" do
13 | ENV['DATA_PATH'] = 's3://mybucket'
14 | fake_s3 = Aws::S3::Client.new(stub_responses: true)
15 | fake_s3.stub_responses(:get_object, Aws::S3::Errors::NoSuchKey.new(Seahorse::Client::RequestContext, 'Fake Error'))
16 | expect {
17 | config = DataMagic::Config.new(s3: fake_s3)
18 | }.to raise_error(IOError, "No data.y?ml found at s3://mybucket. Did you mean to define ALLOW_MISSING_YML environment variable?")
19 | end
20 |
21 | end
22 | describe "with ALLOW_MISSING_YML" do
23 | let (:expected) do
24 | {
25 | "metadata" => {
26 | "total" => 1,
27 | "page" => 0,
28 | "per_page" => DataMagic::DEFAULT_PAGE_SIZE
29 | },
30 | "results" => []
31 | }
32 | end
33 |
34 | before(:all) do
35 | DataMagic.destroy
36 | ENV['ALLOW_MISSING_YML'] = 'allow'
37 | ENV['DATA_PATH'] = './spec/fixtures/cities_without_yml'
38 | DataMagic.init(load_now: true)
39 | end
40 | after(:all) do
41 | DataMagic.destroy
42 | ENV['ALLOW_MISSING_YML'] = ''
43 | end
44 |
45 | it "can get list of imported csv files" do
46 | file_list = [
47 | "./spec/fixtures/cities_without_yml/cities50.csv",
48 | "./spec/fixtures/cities_without_yml/cities51-100.csv",
49 | "./spec/fixtures/cities_without_yml/more.csv",
50 | ]
51 | expect(DataMagic.config.files.sort).to eq(file_list)
52 | end
53 |
54 | it "can get index name from api endpoint" do
55 | expect(DataMagic.config.find_index_for('cities-without-yml')).to eq('cities-without-yml')
56 | end
57 |
58 | it "indexes files with yaml mapping" do
59 | result = DataMagic.search({NAME: "Chicago"}, api: 'cities-without-yml')
60 | expected["results"] = [
61 | {
62 | "USPS"=>"IL",
63 | "GEOID"=>"1714000",
64 | "ANSICODE"=>"00428803",
65 | "NAME"=>"Chicago",
66 | "LSAD"=>"25",
67 | "FUNCSTAT"=>"A",
68 | "POP10"=>"2695598",
69 | "HU10"=>"1194337",
70 | "ALAND"=>"589571105",
71 | "AWATER"=>"16781658",
72 | "ALAND_SQMI"=>"227.635",
73 | "AWATER_SQMI"=>"6.479",
74 | "INTPTLAT"=>"41.837551",
75 | "INTPTLONG"=>"-87.681844",
76 | }
77 | ]
78 | expect(result).to eq(expected)
79 | end
80 | end
81 | end
82 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/config_field_types_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | describe 'DataMagic::Config #field_types' do
4 | let(:config) { DataMagic::Config.new(load_datayaml: false) }
5 |
6 | it "returns empty if dictionary is empty" do
7 | allow(config).to receive(:file_config).and_return([{'name' => 'one.csv'}])
8 | allow(config).to receive(:dictionary).and_return({})
9 | expect(config.field_types).to eq({})
10 | end
11 |
12 | context "when no type is given" do
13 | before do
14 | allow(config).to receive(:file_config).and_return([{'name' => 'one.csv'}])
15 | allow(config).to receive(:dictionary).and_return({
16 | 'name' => {source:'NAME_COLUMN'}
17 | })
18 | end
19 |
20 | it "defaults to string" do
21 | expect(config.field_types).to eq({
22 | 'name' => 'string'
23 | })
24 | end
25 | end
26 |
27 | it "supports integers" do
28 | allow(config).to receive(:file_config).and_return([{'name' => 'one.csv'}])
29 | allow(config).to receive(:dictionary).and_return(
30 | IndifferentHash.new count:
31 | {source:'COUNT_COLUMN', type: 'integer'}
32 | )
33 | expect(config.field_types).to eq({'count' => 'integer'})
34 | end
35 |
36 | context "with float type" do
37 | it "sets float mapping" do
38 | allow(config).to receive(:file_config).and_return([{'name' => 'one.csv'}])
39 | allow(config).to receive(:dictionary).and_return(
40 | IndifferentHash.new percent:
41 | {source:'PERCENT_COLUMN', type: 'float'}
42 | )
43 | expect(config.field_types).to eq({'percent' => 'float'})
44 | end
45 |
46 | it "can be excluded" do
47 | allow(config).to receive(:dictionary).and_return(
48 | IndifferentHash.new id: {source:'ID', type: 'integer'},
49 | percent: {source:'PERCENT', type: 'float'}
50 | )
51 | allow(config).to receive(:file_config).and_return([
52 | IndifferentHash.new({ name:'one.csv', only: ['id'] })
53 | ])
54 | expect(config.field_types).to eq({'id' => 'integer'})
55 | end
56 |
57 | it "can be nested" do
58 | allow(config).to receive(:dictionary).and_return(
59 | IndifferentHash.new id: {source:'ID', type: 'integer'},
60 | percent: {source:'PERCENT', type: 'float'}
61 | )
62 | allow(config).to receive(:file_config).and_return([
63 | IndifferentHash.new({name:'one.csv',
64 | only: ['id']}),
65 | IndifferentHash.new({name:'two.csv',
66 | nest: {key: '2012', contents: ['percent']}})
67 | ])
68 | expect(config.field_types).to eq({
69 | 'id' => 'integer',
70 | '2012.percent' => 'float'
71 | })
72 | end
73 | end
74 |
75 | it "supports special case for location fields as nil" do
76 | # special case for location in create_index
77 | allow(config).to receive(:dictionary).and_return(
78 | IndifferentHash.new 'location.lat': {source:'LAT_COLUMN'},
79 | 'location.lon': {source:'LON_COLUMN'}
80 |
81 | )
82 | expect(config.field_types).to eq({})
83 | end
84 | end
85 |
--------------------------------------------------------------------------------
/spec/fixtures/sample-data/data.yaml:
--------------------------------------------------------------------------------
1 | version: cities100-2010
2 | # cities100.txt
3 | # National Places Gazetteer Files, from US Census 2010
4 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html
5 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t') > results.txt
6 | # head -n 101 results.txt > cities100.txt
7 | # then converted to csv and removed " city" from after each city name
8 | index: city-data
9 | api: cities
10 | unique: ['name']
11 | options:
12 | search: dictionary_only
13 |
14 | dictionary:
15 | id:
16 | source: GEOID
17 | description: >
18 | Geographic Identifier - fully concatenated geographic code (State FIPS and
19 | County FIPS). The Census Bureau and other state and federal agencies are
20 | responsible for assigning geographic identifiers, or GEOIDs, to geographic
21 | entities to facilitate the organization, presentation, and
22 | exchange of geographic and statistical data. GEOIDs are numeric codes that
23 | uniquely identify all administrative/legal and statistical geographic areas for
24 | which the Census Bureau tabulates data. From Alaska, the largest state,
25 | to the smallest census block in New York City, every geographic area
26 | has a unique GEOID. Data users rely on GEOIDs to join the appropriate
27 | demographic data from censuses and surveys, such as the
28 | American Community Survey (ACS), to various levels of geography for data
29 | analysis, interpretation and mapping.
30 | code:
31 | source: ANSICODE
32 | description: >
33 | American National Standards Institute codes (ANSI codes)
34 | are standardized numeric or alphabetic codes issued by the American
35 | National Standards Institute (ANSI) to ensure uniform identification of
36 | geographic entities through all federal government agencies.
37 | name:
38 | source: NAME
39 | description: The name of the city
40 | type: literal
41 | state:
42 | source: USPS
43 | description: Two letter state abbreviation
44 | population:
45 | source: POP10
46 | description: City population from 2010 Census data
47 | type: integer
48 | location.lat: INTPTLAT
49 | location.lon: INTPTLONG
50 | area.land:
51 | description: Land Area (square miles)
52 | source: ALAND_SQMI
53 | type: float
54 | area.water:
55 | description: Water Area (square miles)
56 | source: AWATER_SQMI
57 | type: float
58 |
59 | categories:
60 | general:
61 | title: General
62 | description: >
63 | general information about the city, including standard
64 | identifiers and actual census summary data about the population of the city.
65 | fields: [id, code, name, state, population]
66 | geographic:
67 | title: Geographic
68 | description: >
69 | Geographic characteristics of the area. These are created for
70 | statistical purposes only. Depiction and designation for statistical
71 | purposes does not constitute a determination of jurisdictional authority
72 | or rights of ownership or entitlement.
73 | fields: [location, area.land, area.water]
74 |
75 | files:
76 | - name: cities100.csv
77 |
--------------------------------------------------------------------------------
/notes.txt:
--------------------------------------------------------------------------------
1 | commit eabfb903751cc5b7bc9ae0affeb15ad020e1d783
2 | Merge: 0b017e5 a5c5a18
3 | Author: Yoz Grahame
4 | Date: Tue Sep 8 17:53:20 2015 -0700
5 |
6 | Merge pull request #198 from 18F/source-false
7 |
8 | Use `_source: false` to limit JSON coming back from ES
9 |
10 | commit a5c5a18381214d3a51ab54152b463727bb3f79bc
11 | Author: Sarah Allen
12 | Date: Tue Sep 8 17:41:21 2015 -0700
13 |
14 | exclude fields starting with _
15 |
16 | when the whole source is returned
17 | when we’re not specifying fields
18 | we need to explicitly exclude _names
19 |
20 | commit 94bef492dca538f3e27f20824712f47793f8ab8c
21 | Author: Yoz (Jeremy) Grahame
22 | Date: Tue Sep 8 17:05:28 2015 -0700
23 |
24 | Also record "took" MS value from ES result
25 |
26 | commit 2205dce984c52a50382030d37edeefbcd4316873
27 | Merge: df7f98b 0b017e5
28 | Author: Yoz (Jeremy) Grahame
29 | Date: Tue Sep 8 17:02:09 2015 -0700
30 |
31 | Merge branch 'dev' into source-false
32 |
33 | commit df7f98b216163eb1f27a038a535817ac95aea742
34 | Author: Yoz (Jeremy) Grahame
35 | Date: Tue Sep 8 17:01:20 2015 -0700
36 |
37 | Use `_source: false` for proper field exclusion
38 |
39 | also the "oj" gem for faster JSON
40 |
41 | commit 0b017e56e3a6ed7c9549214a52ed6c2b568c4746
42 | Merge: 9761906 e4f6dfd
43 | Author: Sarah Allen
44 | Date: Tue Sep 8 17:00:48 2015 -0700
45 |
46 | Merge pull request #197 from 18F/log-query-time
47 |
48 | Log ES query time, and show with "debug" option
49 |
50 | commit e4f6dfd219b514a3d4523d836ad083005848186c
51 | Author: Yoz (Jeremy) Grahame
52 | Date: Tue Sep 8 16:58:55 2015 -0700
53 |
54 | No, THAT's how you test a hash value
55 |
56 | commit 97619061da0badde70890ed1f26912c0355c5245
57 | Merge: 9c1e04e 63ead03
58 | Author: Sarah Allen
59 | Date: Tue Sep 8 16:25:19 2015 -0700
60 |
61 | Merge pull request #196 from 18F/max-page-size
62 |
63 | Only allow up to MAX_PAGE_SIZE per page
64 |
65 | commit 63ead033b40c619ac143065d36be3a4427458d68
66 | Author: Yoz (Jeremy) Grahame
67 | Date: Tue Sep 8 16:01:54 2015 -0700
68 |
69 | String-to-int bugfix
70 |
71 | commit f85c981da6622d8d8925037325822ada4c4bfce1
72 | Author: Yoz (Jeremy) Grahame
73 | Date: Tue Sep 8 15:32:57 2015 -0700
74 |
75 | Added MAX_PAGE_SIZE test
76 |
77 | commit 4ba1cedcf73a8ff7e6415ab54dcbdf80ff6d211c
78 | Author: Yoz (Jeremy) Grahame
79 | Date: Tue Sep 8 15:25:21 2015 -0700
80 |
81 | Only allow up to MAX_PAGE_SIZE per page
82 |
83 | commit 226fb43f73268c91e4044118f84ceca707cceadb
84 | Author: Yoz (Jeremy) Grahame
85 | Date: Tue Sep 8 15:04:03 2015 -0700
86 |
87 | Log ES query time, and show with "debug" option
88 |
89 | commit 9c1e04ecf2d25c505941c55d71868c2894102983
90 | Merge: 6461581 99a490b
91 | Author: Yoz Grahame
92 | Date: Tue Sep 8 09:13:40 2015 -0700
93 |
94 | Merge pull request #194 from 18F/dev-sort
95 |
96 | autocomplete type alpha sort
97 |
98 | commit 99a490b1b12046895b49f9b3001c05f30b2dfa82
99 | Author: Sarah Allen
100 | Date: Tue Sep 8 01:51:27 2015 -0700
101 |
102 | autocomplete type alpha sort
103 |
--------------------------------------------------------------------------------
/lib/data_magic/index/importer.rb:
--------------------------------------------------------------------------------
1 | require 'forwardable'
2 |
3 | module DataMagic
4 | module Index
5 | class Importer
6 | attr_reader :raw_data, :options
7 |
8 | def initialize(raw_data, options)
9 | @raw_data = raw_data
10 | @options = options
11 | end
12 |
13 | def process
14 | setup
15 | parse_and_log
16 | finish!
17 |
18 | [row_count, headers]
19 | end
20 |
21 | def client
22 | @client ||= SuperClient.new(es_client, options)
23 | end
24 |
25 | def builder_data
26 | @builder_data ||= BuilderData.new(raw_data, options)
27 | end
28 |
29 | def output
30 | @output ||= Output.new
31 | end
32 |
33 | def parse_and_log
34 | parse_csv
35 | rescue InvalidData => e
36 | trigger("error", e.message)
37 | raise InvalidData, "invalid file format" if empty?
38 | end
39 |
40 | def chunk_size
41 | (ENV['CHUNK_SIZE'] || 100).to_i
42 | end
43 |
44 | def nprocs
45 | (ENV['NPROCS'] || 1).to_i
46 | end
47 |
48 | def parse_csv
49 | if nprocs == 1
50 | parse_csv_whole
51 | else
52 | parse_csv_chunked
53 | end
54 | data.close
55 | end
56 |
57 | def parse_csv_whole
58 | CSV.new(
59 | data,
60 | headers: true,
61 | header_converters: lambda { |str| str.strip.to_sym }
62 | ).each do |row|
63 | RowImporter.process(row, self)
64 | break if at_limit?
65 | end
66 | end
67 |
68 | def parse_csv_chunked
69 | CSV.new(
70 | data,
71 | headers: true,
72 | header_converters: lambda { |str| str.strip.to_sym }
73 | ).each.each_slice(chunk_size) do |chunk|
74 | break if at_limit?
75 | chunks_per_proc = (chunk.size / nprocs.to_f).ceil
76 | Parallel.each(chunk.each_slice(chunks_per_proc)) do |rows|
77 | rows.each_with_index do |row, idx|
78 | RowImporter.process(row, self)
79 | end
80 | end
81 | if !headers
82 | single_document = DocumentBuilder.create(chunk.first, builder_data, DataMagic.config)
83 | set_headers(single_document)
84 | end
85 | increment(chunk.size)
86 | end
87 | end
88 |
89 | def setup
90 | client.create_index
91 | log_setup
92 | end
93 |
94 | def finish!
95 | validate!
96 | refresh_index
97 | log_finish
98 | end
99 |
100 | def log_setup
101 | opts = options.reject { |k,v| k == :mapping }
102 | trigger("info", "options", opts)
103 | trigger("info", "new_field_names", new_field_names)
104 | trigger("info", "additional_data", additional_data)
105 | end
106 |
107 | def log_finish
108 | trigger("info", "skipped (missing parent id)", output.skipped) if !output.skipped.empty?
109 | trigger('info', "done #{row_count} rows")
110 | end
111 |
112 | def event_logger
113 | @event_logger ||= EventLogger.new
114 | end
115 |
116 | def at_limit?
117 | options[:limit_rows] && row_count == options[:limit_rows]
118 | end
119 |
120 | extend Forwardable
121 |
122 | def_delegators :output, :set_headers, :skipping, :skipped, :increment, :row_count, :log_limit,
123 | :empty?, :validate!, :headers
124 | def_delegators :builder_data, :data, :new_field_names, :additional_data
125 | def_delegators :client, :refresh_index
126 | def_delegators :event_logger, :trigger
127 |
128 | def self.process(*args)
129 | new(*args).process
130 | end
131 |
132 | private
133 |
134 | def es_client
135 | DataMagic.client
136 | end
137 | end
138 | end
139 | end
140 |
--------------------------------------------------------------------------------
/sample-data/data.yaml:
--------------------------------------------------------------------------------
1 | version: cities100-2010
2 | # cities100.txt
3 | # National Places Gazetteer Files, from US Census 2010
4 | # https://www.census.gov/geo/maps-data/data/gazetteer2010.html
5 | # (head -n 1 source.txt && tail -n +2 source.txt | LC_ALL=C sort -k7rn,7 -t$'\t') > results.txt
6 | # head -n 101 results.txt > cities100.txt
7 | # then converted to csv and removed " city" from after each city name
8 | index: city-data
9 | api: cities
10 | unique: ['name']
11 |
12 | options:
13 | search: dictionary_only # API provides error when requesting fields not in dictionary
14 |
15 | dictionary:
16 | id:
17 | source: GEOID
18 | description: >
19 | Geographic Identifier - fully concatenated geographic code (State FIPS and
20 | County FIPS). The Census Bureau and other state and federal agencies are
21 | responsible for assigning geographic identifiers, or GEOIDs, to geographic
22 | entities to facilitate the organization, presentation, and
23 | exchange of geographic and statistical data. GEOIDs are numeric codes that
24 | uniquely identify all administrative/legal and statistical geographic areas for
25 | which the Census Bureau tabulates data. From Alaska, the largest state,
26 | to the smallest census block in New York City, every geographic area
27 | has a unique GEOID. Data users rely on GEOIDs to join the appropriate
28 | demographic data from censuses and surveys, such as the
29 | American Community Survey (ACS), to various levels of geography for data
30 | analysis, interpretation and mapping.
31 | code:
32 | source: ANSICODE
33 | description: >
34 | American National Standards Institute codes (ANSI codes)
35 | are standardized numeric or alphabetic codes issued by the American
36 | National Standards Institute (ANSI) to ensure uniform identification of
37 | geographic entities through all federal government agencies.
38 | name:
39 | source: NAME
40 | description: The name of the city
41 | type: literal
42 | state:
43 | source: USPS
44 | description: Two letter state abbreviation
45 | population:
46 | source: POP10
47 | description: City population from 2010 Census data
48 | type: integer
49 | location.lat: INTPTLAT
50 | location.lon: INTPTLONG
51 | land_area:
52 | source: ALAND_SQMI
53 | description: Land Area (square miles)
54 |
55 | type: float
56 | area.water:
57 | description: Water Area (square miles)
58 | source: AWATER_SQMI
59 | type: float
60 |
61 | categories:
62 | general:
63 | title: General
64 | description: >
65 | general information about the city, including standard
66 | identifiers and actual census summary data about the population of the city.
67 | fields: [id, code, name, state, population]
68 | geographic:
69 | title: Geographic
70 | description: >
71 | Geographic characteristics of the area. These are created for
72 | statistical purposes only. Depiction and designation for statistical
73 | purposes does not constitute a determination of jurisdictional authority
74 | or rights of ownership or entitlement.
75 | fields: [location, land_area, area.water]
76 | general2:
77 | title: General2
78 | description: >
79 | general information about the city, including standard
80 | identifiers and actual census summary data about the population of the city.
81 | fields: [id, code, name, state, population]
82 | general3:
83 | title: General3
84 | description: >
85 | short
86 | fields: [id, code, name, state, population]
87 | general4:
88 | title: General4
89 | description: >
90 | short
91 | fields: [id, code, name, state, population]
92 | general5:
93 | title: General5
94 | description: >
95 | general information about the city, including standard
96 | identifiers and actual census summary data about the population of the city.
97 | fields: [id, code, name, state, population]
98 |
99 | files:
100 | - name: cities100.csv
101 |
--------------------------------------------------------------------------------
/public/javascripts/jquery-ujs.js:
--------------------------------------------------------------------------------
1 | /*
2 | * Padrino Javascript Jquery Adapter
3 | * Created for use with Padrino Ruby Web Framework (http://www.padrinorb.com)
4 | **/
5 |
6 | /* Remote Form Support
7 | * form_for @user, '/user', :remote => true
8 | **/
9 |
10 | $(function(){
11 | $('form').on('submit', function(e) {
12 | var element = $(this), message = element.data('confirm');
13 | if (message && !confirm(message)) { return false; }
14 | if (element.data('remote') == true) {
15 | e.preventDefault(); e.stopped = true;
16 | JSAdapter.sendRequest(element, {
17 | verb: element.data('method') || element.attr('method') || 'post',
18 | url: element.attr('action'),
19 | dataType: element.data('type') || ($.ajaxSettings && $.ajaxSettings.dataType) || 'script',
20 | params: element.serializeArray()
21 | });
22 | }
23 | });
24 |
25 | /* Confirmation Support
26 | * link_to 'sign out', '/logout', :confirm => 'Log out?'
27 | **/
28 |
29 | $(document).on('click', 'a[data-confirm]', function(e) {
30 | var message = $(this).data('confirm');
31 | if (!confirm(message)) { e.preventDefault(); e.stopped = true; }
32 | });
33 |
34 | /*
35 | * Link Remote Support
36 | * link_to 'add item', '/create', :remote => true
37 | **/
38 |
39 | $(document).on('click', 'a[data-remote=true]', function(e) {
40 | var element = $(this);
41 | if (e.stopped) return;
42 | e.preventDefault(); e.stopped = true;
43 | JSAdapter.sendRequest(element, {
44 | verb: element.data('method') || 'get',
45 | url: element.attr('href')
46 | });
47 | });
48 |
49 | /*
50 | * Link Method Support
51 | * link_to 'delete item', '/destroy', :method => :delete
52 | **/
53 |
54 | $(document).on('click', 'a[data-method]:not([data-remote])', function(e) {
55 | if (e.stopped) return;
56 | JSAdapter.sendMethod($(this));
57 | e.preventDefault(); e.stopped = true;
58 | });
59 |
60 | /* JSAdapter */
61 | var JSAdapter = {
62 | // Sends an xhr request to the specified url with given verb and params
63 | // JSAdapter.sendRequest(element, { verb: 'put', url : '...', params: {} });
64 | sendRequest: function(element, options) {
65 | var verb = options.verb, url = options.url, params = options.params, dataType = options.dataType;
66 | var event = element.trigger('ajax:before');
67 | if (event.stopped) return false;
68 | $.ajax({
69 | url: url,
70 | type: verb.toUpperCase() || 'POST',
71 | data: params || [],
72 | dataType: dataType,
73 |
74 | beforeSend: function(request) { element.trigger('ajax:loading', [ request ]); },
75 | complete: function(request) { element.trigger('ajax:complete', [ request ]); },
76 | success: function(request) { element.trigger('ajax:success', [ request ]); },
77 | error: function(request) { element.trigger('ajax:failure', [ request ]); }
78 | });
79 | element.trigger('ajax:after');
80 | },
81 | // Triggers a particular method verb to be triggered in a form posting to the url
82 | // JSAdapter.sendMethod(element);
83 | sendMethod: function(element) {
84 | var verb = element.data('method');
85 | var url = element.attr('href');
86 | var form = $('<form method="post" action="' + url + '"></form>');
87 | var csrf_token = $('meta[name=csrf-token]').attr('content');
88 | var csrf_param = $('meta[name=csrf-param]').attr('content');
89 | form.hide().appendTo('body');
90 | if (verb !== 'post') {
91 | var field = '<input type="hidden" name="_method" value="' + verb + '" />';
92 | form.append(field);
93 | }
94 | if (csrf_param !== undefined && csrf_token !== undefined) {
95 | var field = '<input type="hidden" name="' + csrf_param + '" value="' + csrf_token + '" />';
96 | form.append(field);
97 | }
98 | form.submit();
99 | }
100 | };
101 |
102 | // Every xhr request is sent along with the CSRF token.
103 | $.ajaxPrefilter(function(options, originalOptions, xhr) {
104 | if (options.type !== 'GET') {
105 | var token = $('meta[name="csrf-token"]').attr('content');
106 | if (token) xhr.setRequestHeader('X-CSRF-Token', token);
107 | }
108 | });
109 | });
110 |
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
1 | # Running Open Data Maker on your computer
2 |
3 | If you just want to install and run, then you can just download a
4 | [zip file](https://github.com/18F/open-data-maker/archive/master.zip).
5 |
6 | You will still need the dependencies below, but you don't need to
7 | clone the git repo for the source code.
8 |
9 | ## Install Prerequisites
10 |
11 | You can run our bootstrap script to make sure you have all the dependencies.
12 | It will also install and start up Elasticsearch:
13 |
14 | ```
15 | script/bootstrap
16 | ```
17 |
18 | To run Open Data Maker, you will need to have the following software installed on your computer:
19 | * [Elasticsearch] 1.7.3
20 | * [Ruby] 2.2.2
21 |
22 | **NOTE: Open Data Maker does not currently work with Elasticsearch versions 2.x and above.**
23 | You can follow or assist our progress towards 2.x compatibility [at this GitHub issue](https://github.com/18F/open-data-maker/issues/248).
24 |
25 | ### Mac OS X
26 |
27 | On a Mac, we recommend installing Ruby 2.2.2 via [RVM], and Elasticsearch 1.7.3 via
28 | [Homebrew]. If you don't want to use the bootstrap script above, you can install
29 | elasticsearch 1.7 with brew using the following command:
30 |
31 | ```
32 | brew install elasticsearch17
33 | ```
34 |
35 | If you are contributing to development, you will also need [Git].
36 | If you don't already have these tools, the 18F [laptop] script will install
37 | them for you.
38 |
39 | ## Get the Source Code
40 |
41 | For development, [fork](http://help.github.com/fork-a-repo/) the repo
42 | first, then clone your fork.
43 |
44 | ```
45 | git clone https://github.com/<your-username>/open-data-maker.git
46 | cd open-data-maker
47 | ```
48 |
49 | ## Run the App
50 |
51 | ### Make sure Elasticsearch is up and running
52 | If you just ran `script/bootstrap`, then Elasticsearch should already be
53 | running. But if you stopped it or restarted your computer, you'll need to
54 | start it back up. Assuming you installed Elasticsearch via our `bootstrap`
55 | script, you can restart it with this command:
56 |
57 | ```brew services restart elasticsearch```
58 |
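59 | If you want to confirm that Elasticsearch is up, it responds on port 9200 by
60 | default (this assumes the default configuration):
61 |
62 | ```curl http://localhost:9200```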
59 |
60 | ### Import the data
61 |
62 | To get started, you can import sample data with:
63 |
64 | `rake import`
65 |
66 | ### Start the app
67 |
68 | ```
69 | padrino start
70 | ```
71 | Go to: http://127.0.0.1:3000/
72 |
73 | and you should see the text `Welcome to Open Data Maker` with a link to
74 | the API created by the [sample data](sample-data).
75 |
76 | You can verify that the import was successful by visiting
77 | http://127.0.0.1:3000/v1/cities?name=Cleveland. You should see something like:
78 |
79 | ```json
80 | {
81 | "state": "OH",
82 | "name": "Cleveland",
83 | "population": 396815,
84 | "land_area": 77.697,
85 | "location": {
86 | "lat": 41.478138,
87 | "lon": -81.679486
88 | }
89 | }
90 | ```
90 |
91 | ### Custom Datasets
92 |
93 | While the app is running (or at any time) you can run `rake import`. For instance, if you had a `presidents/data.yaml` file, you would import
94 | it with:
95 |
96 | ```sh
97 | export DATA_PATH=presidents
98 | rake import
99 | # or, more succinctly:
100 | DATA_PATH=presidents rake import
101 | ```
102 |
103 | To clear the data, assuming the data set has an index named "president-data":
104 |
105 | ```
106 | rake es:delete[president-data]
107 | ```
108 |
109 | You may alternatively delete all the indices (which could affect other apps
110 | if they are using your local Elasticsearch):
111 |
112 | ```
113 | rake es:delete[_all]
114 | ```
115 |
116 | The data directory can optionally include a file called `data.yaml` (see [the sample one](sample-data/data.yaml) for its schema) that references one or more `.csv` files and specifies data types,
117 | field name mapping, and other support data.
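118 |
119 | For example, a minimal `data.yaml` might look like the sketch below (the
120 | `presidents.csv` file and its `NAME`/`PARTY` columns are hypothetical; the
121 | schema keys match the [sample data.yaml](sample-data/data.yaml)):
122 |
123 | ```yaml
124 | index: president-data
125 | api: presidents
126 | dictionary:
127 |   name:
128 |     source: NAME
129 |   party:
130 |     source: PARTY
131 | files:
132 |   - name: presidents.csv
133 | ```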
118 |
119 | ## Experimental web UI for indexing
120 |
121 | Optionally, you can enable indexing from the web app, but this option is still experimental:
122 | * `export INDEX_APP=enable`
123 | * in your browser, go to /index/reindex
124 |
125 | The old index (if present) will be deleted and re-created from source files at DATA_PATH.
126 |
127 | ## Want to help?
128 |
129 | See [Contribution Guide](CONTRIBUTING.md)
130 |
131 | Read additional [implementation notes](NOTES.md)
132 |
133 | [Elasticsearch]: https://www.elastic.co/products/elasticsearch
134 | [Homebrew]: http://brew.sh/
135 | [RVM]: https://github.com/wayneeseguin/rvm
136 | [rbenv]: https://github.com/sstephenson/rbenv
137 | [Ruby]: https://www.ruby-lang.org/en/
138 | [Git]: https://git-scm.com/
139 | [laptop]: https://github.com/18F/laptop
140 |
--------------------------------------------------------------------------------
/app/controllers.rb:
--------------------------------------------------------------------------------
1 | # Main front page
2 | OpenDataMaker::App.controllers do
3 | get :index do
4 | render :home, layout: true, locals: {
5 | 'title' => 'Open Data Maker',
6 | 'endpoints' => DataMagic.config.api_endpoint_names,
7 | 'examples' => DataMagic.config.examples,
8 | 'categories' => DataMagic.config.categories.to_json
9 | }
10 | end
11 |
12 | get :category, :with => :id do
13 | category_entry = DataMagic.config.category_by_id(params[:id])
14 | render :category, layout: true, locals: {
15 | 'title' => 'Open Data Maker',
16 | 'category_entry' => category_entry.to_json,
17 | 'field_details' => category_entry['field_details'].to_json
18 | }
19 | end
20 | end
21 |
22 | CACHE_TTL = 300
23 |
24 | # All API requests are prefixed by the API version
25 | # in this case, "v1" - e.g. "/vi/endpoints" etc.
26 | OpenDataMaker::App.controllers :v1 do
27 | before do
28 | content_type :json
29 | headers 'Access-Control-Allow-Origin' => '*',
30 | 'Access-Control-Allow-Methods' => ['GET'],
31 | 'Surrogate-Control' => "max-age=#{CACHE_TTL}"
32 | cache_control :public, max_age: CACHE_TTL
33 | end
34 |
35 | get :endpoints do
36 | endpoints = DataMagic.config.api_endpoints.keys.map do |key|
37 | {
38 | name: key,
39 | url: url_for(:v1, :index, endpoint: key)
40 | }
41 | end
42 | return { endpoints: endpoints }.to_json
43 | end
44 |
45 | get '/data.json' do
46 | data = DataMagic.config.data
47 | data.to_json
48 | end
49 |
50 | get :index, with: ':endpoint/:command', provides: [:json] do
51 | process_params
52 | end
53 |
54 | get :index, with: ':endpoint', provides: [:json, :csv] do
55 | process_params
56 | end
57 | end
58 |
59 | def process_params
60 | options = get_search_args_from_params(params)
61 | DataMagic.logger.debug "-----> APP GET #{params.inspect} with options #{options.inspect}"
62 |
63 | check_endpoint!(options)
64 | set_content_type(options)
65 | search_and_respond(options)
66 | end
67 |
68 | def search_and_respond(options)
69 | data = DataMagic.search(params, options)
70 | halt 400, data.to_json if data.key?(:errors)
71 |
72 | if content_type == :csv
73 | output_data_as_csv(data['results'])
74 | else
75 | data.to_json
76 | end
77 | end
78 |
79 | def check_endpoint!(options)
80 | unless DataMagic.config.api_endpoints.keys.include? options[:endpoint]
81 | halt 404, {
82 | error: 404,
83 | message: "#{options[:endpoint]} not found. Available endpoints: #{DataMagic.config.api_endpoints.keys.join(',')}"
84 | }.to_json
85 | end
86 | end
87 |
88 | def set_content_type(options)
89 | if options[:command] == 'stats'
90 | content_type :json
91 | else
92 | content_type(options[:format].nil? ? :json : options[:format].to_sym)
93 | end
94 | end
95 |
96 | # TODO: Use of non-underscore-prefixed option parameters is still
97 | # supported but deprecated, and should be removed at some point soon -
98 | # see comment in method body
99 | def get_search_args_from_params(params)
100 | options = {}
101 | %w(metrics sort fields zip distance page per_page debug).each do |opt|
102 | options[opt.to_sym] = params.delete("_#{opt}")
103 | # TODO: remove next line to end support for un-prefixed option parameters
104 | options[opt.to_sym] ||= params.delete(opt)
105 | end
106 | options[:endpoint] = params.delete("endpoint") # these two params are
107 | options[:format] = params.delete("format") # supplied by Padrino
108 | options[:fields] = (options[:fields] || "").split(',')
109 | options[:command] = params.delete("command")
110 |
111 | options[:metrics] = options[:metrics].split(/\s*,\s*/) if options[:metrics]
112 | options
113 | end
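114 | # A sketch of the mapping above (hypothetical request):
115 | #   GET /v1/cities?name=Chicago&_fields=name,state
116 | # leaves params as {"name" => "Chicago"} and returns options including
117 | #   endpoint: "cities", fields: ["name", "state"]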
114 |
115 | def output_data_as_csv(results)
116 | # We assume all rows have the same keys
117 | if results.empty?
118 | ''
119 | else
120 | CSV.generate(force_quotes: true, headers: true) do |csv|
121 | results.each_with_index do |row, row_num|
122 | row = NestedHash.new(row).withdotkeys
123 | # make the order match data.yaml order
124 | output = DataMagic.config.field_types.each_with_object({}) do |(name, type), output|
125 | output[name] = row[name] unless row[name].nil?
126 | if name == "location"
127 | output["location.lat"] = row["location.lat"] unless row["location.lat"].nil?
128 | output["location.lon"] = row["location.lon"] unless row["location.lon"].nil?
129 | end
130 | end
131 | csv << output.keys if row_num == 0
132 | csv << output
133 | end
134 | end
135 | end
136 | end
137 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/search_name_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'data_magic'
3 | require 'csv'
4 |
5 | describe "DataMagic intuitive search" do
6 |
7 | before :example do
8 | DataMagic.destroy
9 | ENV['DATA_PATH'] = './spec/fixtures/school_names'
10 | DataMagic.init(load_now: true)
11 | end
12 | after :example do
13 | DataMagic.destroy
14 | end
15 |
16 | RSpec.configure do |c|
17 | c.alias_it_should_behave_like_to :it_correctly, 'correctly:'
18 | end
19 |
20 | let(:expected_meta) {{"metadata"=>{"total"=>1, "page"=>0, "per_page"=>20}}}
21 | let(:expected_match) { "" }
22 | let(:response) { DataMagic.search(
23 | {'school.name' => subject}, fields:['school.name']) }
24 |
25 | context "full request" do
26 | let(:response) { DataMagic.search({id: 1}) }
27 | let(:expected_match) { [{"id"=>"1", "school"=>{"state"=>"AL", "name"=>"Stillman College"}}]}
28 | it "provides expected document" do
29 | expect(response['results']).to eql expected_match
30 | end
31 | end
32 |
33 | context "sort" do
34 | shared_examples "returns" do
35 | it "sorted results " do
36 | expect(response['results'].map { |i| i['school.name'] })
37 | .to eql expected_match
38 | end
39 | end
40 |
41 | context "with list of names" do
42 | let(:response) { DataMagic.search({}, fields:['school.name'],
43 | sort: 'school.name') }
44 | # fields:['name'],
45 | let(:expected_match) {
46 | csv_path = File.expand_path("../../fixtures/school_names/school_names.csv", __dir__)
47 | data = CSV.read(csv_path).slice(1..-1)
48 | data.map { |row| row[2] }
49 | .sort.slice(0,20)
50 | }
51 | it_correctly "returns"
52 | end
53 |
54 | end
55 |
56 | context "basic search" do
57 | shared_examples "finds" do
58 | it "correct results " do
59 | expect(response['results']
60 | .map { |i| i['school.name'] }
61 | .sort )
62 | .to eql expected_match
63 | end
64 | it "correct metadata" do
65 | expect(response.reject { |k, _| k == 'results' }).to eql expected_meta
66 | end
67 | end
68 |
69 | context "for exact match" do
70 | subject { 'New York University' }
71 | let(:expected_match) { ['New York University'] }
72 | it_correctly "finds"
73 | end
74 | context "for exact match (case insensitive)" do
75 | subject { 'new YORK UniverSity' }
76 | let(:expected_match) { ['New York University'] }
77 | it_correctly "finds"
78 | end
79 |
80 | context "for exact match (case insensitive)" do
81 | subject { 'new YORK UniverSity' }
82 | let(:expected_match) { ['New York University'] }
83 | it_correctly "finds"
84 | end
85 |
86 | context "by prefix" do
87 | subject { 'Still' }
88 | let(:expected_match) { ['Stillman College'] }
89 | it_correctly "finds"
90 | end
91 |
92 | context "by prefix (case insensitive)" do
93 | subject { 'still' }
94 | let(:expected_match) { ['Stillman College'] }
95 | it_correctly "finds"
96 | end
97 |
98 | context "by prefix in the middle of the name" do
99 | subject { 'Phoenix' }
100 | let(:expected_meta) {{"metadata"=>{"total"=>3, "page"=>0, "per_page"=>20}}}
101 | let(:expected_match) { ['Phoenix College',
102 | 'University of Phoenix-Online Campus',
103 | "University of Phoenix-Phoenix Campus"] }
104 | it_correctly "finds"
105 | end
106 |
107 | context "with words in the wrong order" do
108 | subject { 'University New York' }
109 | let(:expected_match) { ['New York University'] }
110 | it_correctly "finds"
111 | end
112 |
113 | context "partial word after dash" do
114 | subject { 'berk' }
115 | let(:expected_meta) {{"metadata"=>{"total"=>3, "page"=>0, "per_page"=>20}}}
116 | let(:expected_match) { ['Berk Trade and Business School',
117 | 'Berklee College of Music',
118 | 'University of California-Berkeley'] }
119 | it_correctly "finds"
120 | end
121 |
122 | context "words separated by dash" do
123 | subject { 'phoenix online' }
124 | let(:expected_match) { ['University of Phoenix-Online Campus'] }
125 | it_correctly "finds"
126 | end
127 | end
128 | # TO DO
129 | # "pheonix" (mis-spelling) should probably work
130 | # "phoenix college" should also probably return "university of phoenix" --- since college is a synonym for unversity
131 |
132 | end
133 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Open Data Maker
2 | [](https://circleci.com/gh/18F/open-data-maker/tree/dev)
3 |
4 | The goal of this project is to make it easy to turn a lot of potentially large
5 | csv files into open data, available via an API and as smaller csv files that
6 | people can download with a subset of the data.
7 |
8 | Preliminary research suggests that open data users (journalists and others)
9 | actually know how to work with spreadsheets really well, but a lot of the
10 | data sets that we have in government are huge.
11 |
12 | The first version of this project will allow us to host a website for an
13 | agency with a specific set of csv files, which are deployed with the app.
14 | This will allow us to deploy more quickly, since there will be a lower-risk
15 | security profile than if an agency could upload the CSV files (which might
16 | be a nice longer-term feature).
17 |
18 |
19 | ## Install and Run the App (as a developer)
20 |
21 | See our [Installation Guide](INSTALL.md)
22 |
23 | ## How this works
24 |
25 | By default, data will be loaded from /sample-data when you run `rake import`
26 |
27 | * [cities100.csv](sample-data/cities100.csv) - dataset of 100 most populous cities in the US
28 | * [data.yaml](sample-data/data.yaml) - configuration for
29 | * index name *city-data*
30 | * api endpoint name *cities*
31 | * how columns are mapped to fields in json output
32 | * data types
33 | * unique columns *name*
34 |
35 | When you run the app, you can query the dataset via json API, like: /v1/cities?name=Chicago
36 |
37 | * http://localhost:3000/v1/cities?name=Chicago
38 | * http://localhost:3000/v1/cities?name=Chicago&state=IL
39 | * http://localhost:3000/v1/cities?state=NY,MA
40 | * http://localhost:3000/v1/cities?state=CA&fields=name,population
41 |
42 | To use your own data, you can set a different directory, for example:
43 |
44 | ```
45 | export DATA_PATH='./data'
46 | ```
47 |
48 | 1. Put csv files into /data
49 | 1. Import files from /data: ```rake import``` (or restart the app)
50 | 1. There can be multiple files (must end in .csv)
51 | 1. Optional [data.yaml](sample-data/data.yaml) file that specifies index name, API endpoint, file list, and a dictionary of column -> field name mapping and types
52 | 1. Optionally import all the columns, not just ones specified in dictionary (see example: [import: all](spec/fixtures/import_with_options/data.yaml))
53 | 1. If data.yaml is not provided, all files and fields will be imported, with the folder or bucket name used as the API endpoint (the name is 'slugified', with dashes replacing spaces)
54 | 1. Query the API endpoint to get the data: /v1/<endpoint>?field_or_column_name=value
55 |
56 | ## More Configuration Options
57 |
58 | Often while you are developing an API and data dictionary,
59 | it is helpful to include all the columns in the csv. If you add the following to
60 | data.yaml, the field names and types from the dictionary will be used and any
61 | unspecified columns will simply use the column name as the field name.
62 |
63 | ```
64 | options:
65 | columns: all
66 | ```
67 |
68 | You can use the dictionary to provide nice errors to developers who use the API.
69 | This can be used in conjunction with the above ```columns: all```: columns that
70 | are not referenced in the dictionary will still be imported, but they won't be
71 | searchable, and requests that reference unspecified fields will cause errors to
72 | be reported.
73 |
74 | ```
75 | options:
76 | search: dictionary_only
77 | ```
78 |
79 | Also for debugging, you can limit the number of files that will be imported. This is helpful when the import process is time consuming because you have many, many files and you want to test format changes with a subset of them.
80 |
81 | ```
82 | options:
83 | limit: 4
84 | ```
85 |
86 |
87 |
88 | ## Help Wanted
89 |
90 | 1. Try out importing multiple data sets with different endpoints and data.yaml configuration
91 | 2. Take a look at our [open issues](https://github.com/18F/open-data-maker/issues) and our [Contribution Guide](CONTRIBUTING.md)
92 |
93 | ## More Info
94 |
95 | Here's how it might look in the future:
96 |
97 | 
98 |
99 |
100 | 
101 |
102 | ### Acknowledgements
103 | Zipcode latitude and longitude provided by [GeoNames](http://www.geonames.org/) under a [Creative Commons Attribution 3.0 License](http://creativecommons.org/licenses/by/3.0/).
104 |
105 | ### Public domain
106 |
107 | Except as noted above, this project is in the worldwide [public domain](LICENSE.md). As stated in [CONTRIBUTING](CONTRIBUTING.md):
108 |
109 | > This project is in the public domain within the United States, and copyright and related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/).
110 | >
111 | > All contributions to this project will be released under the CC0 dedication. By submitting a pull request, you are agreeing to comply with this waiver of copyright interest.
112 |
--------------------------------------------------------------------------------
/lib/data_magic/error_checker.rb:
--------------------------------------------------------------------------------
1 | module DataMagic
2 | module ErrorChecker
3 | class << self
4 | def check(params, options, config)
5 | report_required_params_absent(options) +
6 | report_nonexistent_params(params, config) +
7 | report_nonexistent_operators(params) +
8 | report_nonexistent_fields(options[:fields], config) +
9 | report_bad_range_argument(params) +
10 | report_wrong_field_type(params, config) +
11 | report_wrong_zip(options) +
12 | report_distance_requires_zip(options)
13 | end
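14 |
15 | # A sketch of typical output (hypothetical input): passing a parameter like
16 | # "population__gt" would include an 'operator_not_found' error in the result,
17 | # since only the range/ne/not operator suffixes are recognized.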
14 |
15 | private
16 |
17 | def report_required_params_absent(options)
18 | if options[:command] == 'stats' && options[:fields].length == 0
19 | [build_error(error: 'invalid_or_incomplete_parameters', input: options[:command])]
20 | else
21 | []
22 | end
23 | end
24 |
25 | def report_distance_requires_zip(params)
26 | # if distance, must have zip
27 | return [] if (params[:distance] && params[:zip]) || (!params[:distance])
28 | [build_error(
29 | error: 'distance_error'
30 | )]
31 | end
32 |
33 | def report_wrong_zip(params)
34 | return [] if !params[:zip] || Zipcode.valid?(params[:zip])
35 | [build_error(
36 | error: 'zipcode_error',
37 | parameter: :zip,
38 | input: params[:zip].to_s
39 | )]
40 | end
41 |
42 | def report_nonexistent_params(params, config)
43 | return [] unless config.dictionary_only_search?
44 | params.keys.reject { |p| config.field_type(strip_op(p)) }.
45 | map { |p| build_error(error: 'parameter_not_found', input: strip_op(p)) }
46 | end
47 |
48 | def report_nonexistent_operators(params)
49 | params.keys.select { |p| p =~ /__(\w+)$/ && $1 !~ /range|not|ne/i }.
50 | map do |p|
51 | (param, op) = p.match(/^(.*)__(\w+)$/).captures
52 | build_error(error: 'operator_not_found', parameter: param, input: op)
53 | end
54 | end
55 |
56 | def report_nonexistent_fields(fields, config)
57 | if fields && !fields.empty? && config.dictionary_only_search?
58 | fields.reject { |f| config.field_type(f.to_s) }.
59 | map { |f| build_error(error: 'field_not_found', input: f.to_s) }
60 | else
61 | []
62 | end
63 | end
64 |
65 | def report_bad_range_argument(params)
66 | ranges = params.select do |p,v|
67 | p =~ /__range$/ and
68 | v !~ / ^(\d+(\.\d+)?)? # optional starting number
69 | \.\. # range dots
70 | (\d+(\.\d+)?)? # optional ending number
71 | (,(\d+(\.\d+)?)?\.\.(\d+(\.\d+)?)?)* # and more, with commas
72 | $/x
73 | end
74 | ranges.map do |p,v|
75 | build_error(error: 'range_format_error', parameter: strip_op(p), input: v)
76 | end
77 | end
78 |
79 | def report_wrong_field_type(params, config)
80 | bad_fields = params.select do |p, v|
81 | next false if p =~ /__range$/
82 | param_type = config.field_type(strip_op(p))
83 | value_type = guess_value_type(v)
84 | (param_type == "float" && value_type != "float" && value_type != "integer") or
85 | (param_type == "integer" && value_type != "integer")
86 | end
87 | bad_fields.map do |p, v|
88 | build_error(error: 'parameter_type_error', parameter: p, input: v,
89 | expected_type: config.field_type(strip_op(p)),
90 | input_type: guess_value_type(v))
91 | end
92 | end
93 |
94 | def build_error(opts)
95 | opts[:message] =
96 | case opts[:error]
97 | when 'invalid_or_incomplete_parameters'
98 | "The command #{opts[:input]} requires a fields parameter."
99 | when 'parameter_not_found'
100 | "The input parameter '#{opts[:input]}' is not known in this dataset."
101 | when 'field_not_found'
102 | "The input field '#{opts[:input]}' (in the fields parameter) is not a field in this dataset."
103 | when 'operator_not_found'
104 | "The input operator '#{opts[:input]}' (appended to the parameter '#{opts[:parameter]}') is not known or supported. (Known operators: range, ne, not)"
105 | when 'parameter_type_error'
106 | "The parameter '#{opts[:parameter]}' expects a value of type #{opts[:expected_type]}, but received '#{opts[:input]}' which is a value of type #{opts[:input_type]}."
107 | when 'range_format_error'
108 | "The range '#{opts[:input]}' supplied to parameter '#{opts[:parameter]}' isn't in the correct format."
109 | when 'zipcode_error'
110 | "The provided zipcode, '#{opts[:input]}', is not valid."
111 | when 'distance_error'
112 | "Use of the 'distance' parameter also requires a 'zip' parameter."
113 | end
114 | opts
115 | end
116 |
117 | def guess_value_type(value)
118 | case value.to_s
119 | when /^-?\d+$/
120 | "integer"
121 | when /^(-?\d+,?)+$/ # list of integers
122 | "integer"
123 | when /^-?\d+\.\d+$/
124 | "float"
125 | else
126 | "string"
127 | end
128 | end
129 |
130 | def strip_op(param)
131 | param.sub(/__\w+$/, '')
132 | end
133 | end
134 | end
135 | end
136 |
--------------------------------------------------------------------------------
/lib/data_magic/query_builder.rb:
--------------------------------------------------------------------------------
1 | module DataMagic
2 | module QueryBuilder
3 | class << self
4 | # Creates query from parameters passed into endpoint
5 | def from_params(params, options, config)
6 | per_page = (options[:per_page] || config.page_size || DataMagic::DEFAULT_PAGE_SIZE).to_i
7 | page = options[:page].to_i # nil.to_i is 0, so a missing page defaults to 0
8 | per_page = DataMagic::MAX_PAGE_SIZE if per_page > DataMagic::MAX_PAGE_SIZE
9 | query_hash = {
10 | from: page * per_page,
11 | size: per_page,
12 | }
13 |
14 | query_hash[:query] = generate_squery(params, options, config).to_search
15 |
16 | if options[:command] == 'stats'
17 | query_hash.merge! add_aggregations(params, options, config)
18 | end
19 |
20 | if options[:fields] && !options[:fields].empty?
21 | query_hash[:fields] = get_restrict_fields(options)
22 | query_hash[:_source] = false
23 | else
24 | query_hash[:_source] = {
25 | exclude: ["_*"]
26 | }
27 | end
28 | query_hash[:sort] = get_sort_order(options[:sort], config) if options[:sort] && !options[:sort].empty?
29 | query_hash
30 | end
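31 |
32 | # Example result shape (hypothetical options): with page "2", per_page "10",
33 | # and no :fields option, from_params returns something like
34 | #   { from: 20, size: 10, query: {...}, _source: { exclude: ["_*"] } }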
31 |
32 | private
33 |
34 | def generate_squery(params, options, config)
35 | squery = Stretchy.query(type: 'document')
36 | squery = search_location(squery, options)
37 | search_fields_and_ranges(squery, params, config)
38 | end
39 |
40 | # Wrapper for Stretchy aggregation clause builder (which wraps ElasticSearch (ES) :aggs parameter)
41 | # Extracts all extended_stats aggregations from ES, to be filtered later
42 | # Is a no-op if no fields are specified, or none of them are numeric
43 | def add_aggregations(params, options, config)
44 | agg_hash = options[:fields].inject({}) do |memo, f|
45 | if config.column_field_types[f.to_s] && ["integer", "float"].include?(config.column_field_types[f.to_s])
46 | memo[f.to_s] = { extended_stats: { "field" => f.to_s } }
47 | end
48 | memo
49 | end
50 |
51 | agg_hash.empty? ? {} : { aggs: agg_hash }
52 | end
53 |
54 | def get_restrict_fields(options)
55 | options[:fields].map(&:to_s)
56 | end
57 |
58 | # @description turns a string like "state,population:desc" into [{'state' => {order: 'asc'}},{ "population" => {order: "desc"} }]
59 | # @param [String] sort_param
60 | # @return [Array]
61 | def get_sort_order(sort_param, config)
62 | sort_param.to_s.scan(/(\w+[\.\w]*):?(\w*)/).map do |field_name, direction|
63 | direction = 'asc' if direction.empty?
64 | type = config.field_type(field_name)
65 | # for 'autocomplete' search on lowercase not analyzed indexed in _name
66 | field_name = "_#{field_name}" if type == 'autocomplete'
67 | { field_name => { order: direction } }
68 | end
69 | end
70 |
71 | def to_number(value)
72 | value =~ /\./ ? value.to_f : value.to_i
73 | end
74 |
75 | def search_fields_and_ranges(squery, params, config)
76 | params.each do |param, value|
77 | field_type = config.field_type(param)
78 | if field_type == "name"
79 | squery = include_name_query(squery, param, value)
80 | elsif field_type == "autocomplete"
81 | squery = autocomplete_query(squery, param, value)
82 | elsif match = /(.+)__(range|ne|not)\z/.match(param)
83 | field, operator = match.captures.map(&:to_sym)
84 | squery = range_query(squery, operator, field, value)
85 | elsif field_type == "integer" && value.is_a?(String) && /,/.match(value) # list of integers
86 | squery = integer_list_query(squery, param, value)
87 | else # field equality
88 | squery = squery.where(param => value)
89 | end
90 | end
91 | squery
92 | end
93 |
94 | def include_name_query(squery, field, value)
95 | value = value.split(' ').map { |word| "#{word}*"}.join(' ')
96 | squery.match.query(
97 | # we store lowercase name in field with prefix _
98 | "wildcard": { "_#{field}" => { "value": value.downcase } }
99 | )
100 | end
101 |
102 | def range_query(squery, operator, field, value)
103 | if operator == :ne or operator == :not # field negation
104 | squery.where.not(field => value)
105 | else # field range
106 | squery.filter(
107 | or: build_ranges(field, value.split(','))
108 | )
109 | end
110 | end
111 |
112 | def autocomplete_query(squery, field, value)
113 | squery.match.query(
114 | common: {
115 | field => {
116 | query: value,
117 | cutoff_frequency: 0.001,
118 | low_freq_operator: "and"
119 | }
120 | })
121 | end
122 |
123 | def integer_list_query(squery, field, value)
124 | squery.filter(
125 | terms: {
126 | field => value.split(',').map(&:to_i) }
127 | )
128 | end
129 |
130 | def build_ranges(field, range_strings)
131 | range_strings.map do |range|
132 | min, max = range.split('..')
133 | values = {}
134 | values[:gte] = to_number(min) unless min.empty?
135 | values[:lte] = to_number(max) if max
136 | {
137 | range: { field => values }
138 | }
139 | end
140 | end
141 |
142 | # Adds a geo distance filter around the lat/lon of the given zip code
143 | def search_location(squery, options)
144 | distance = options[:distance]
145 | location = Zipcode.latlon(options[:zip])
146 |
147 | if distance && !distance.empty?
148 | # default to miles if no distance given
149 | unit = distance[-2..-1]
150 | distance = "#{distance}mi" if unit != "km" and unit != "mi"
151 |
152 | squery = squery.geo('location', distance: distance, lat: location[:lat], lng: location[:lon])
153 | end
154 | squery
155 | end
156 | end
157 | end
158 | end
159 |
--------------------------------------------------------------------------------
/spec/lib/data_magic/config_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | describe DataMagic::Config do
4 | before(:all) do
5 | ENV['DATA_PATH'] = './spec/fixtures/import_with_dictionary'
6 | end
7 |
8 | it "detects data.yml files" do
9 | ENV['DATA_PATH'] = './spec/fixtures/cities_with_yml'
10 | config = DataMagic::Config.new
11 | expect(config.data["api"]).to eq("cities")
12 | end
13 |
14 | describe 'slugification' do
15 | it 'slugifies local paths' do
16 | config = DataMagic::Config.new
17 | slugified = config.clean_index('path/to/my_directory')
18 | expect(slugified).to eq('my-directory')
19 | end
20 |
21 | it 'slugifies s3 bucket names' do
22 | config = DataMagic::Config.new
23 | slugified = config.clean_index('s3://user:pass@my_bucket')
24 | expect(slugified).to eq('my-bucket')
25 | end
26 | end
27 |
28 | context "s3" do
29 | it "detects data.yaml" do
30 | ENV['DATA_PATH'] = 's3://mybucket'
31 | fake_s3 = class_spy("Fake Aws::S3::Client")
32 | fake_get_object_response = double(
33 | "S3 response",
34 | body: StringIO.new({ 'index' => 'fake-index' }.to_yaml),
35 | isOK: true,
36 | status: 200
37 | )
38 | allow(fake_s3).to receive(:get_object)
39 | .with(bucket: 'mybucket', key: 'data.yaml', response_target: duck_type(:read))
40 | .and_return(fake_get_object_response)
41 | config = DataMagic::Config.new(s3: fake_s3)
42 | expect(config.s3).to eq(fake_s3)
43 | expect(config.data["index"]).to eq("fake-index")
44 | end
45 |
46 | it "raises error if s3 errors" do
47 | ENV['DATA_PATH'] = 's3://mybucket'
48 | fake_s3 = class_spy("Fake Aws::S3::Client")
49 |
50 | allow(fake_s3).to receive(:get_object)
51 | .with(bucket: 'mybucket', key: 'data.yaml', response_target: duck_type(:read))
52 | .and_raise(RuntimeError)
53 | expect {
54 | DataMagic::Config.new(s3: fake_s3)
55 | }.to raise_error(RuntimeError)
56 | end
57 |
58 | end
59 |
60 | context "create" do
61 | it "works with zero args" do
62 | expect(DataMagic::Config.new).to_not be_nil
63 | end
64 | it "can set s3 client" do
65 | # TODO: mock s3
66 | s3_client = "s3 client"
67 | config = DataMagic::Config.new(s3: s3_client)
68 | expect(config.s3).to eq(s3_client)
69 | end
70 | end
71 |
72 | context "when loaded" do
73 | let(:config) { DataMagic::Config.new }
74 |
75 | after do
76 | config.clear_all
77 | end
78 |
79 | context "#scoped_index_name" do
80 | it "includes environment prefix" do
81 | expect(config.scoped_index_name).to eq('test-city-data')
82 | end
83 | end
84 |
85 | it "has config data" do
86 | default_config = {
87 | "version" => "cities100-2010",
88 | "index" => "city-data", "api" => "cities",
89 | "files" => [{ "name" => "cities100.csv" }],
90 | "options" => {:search=>"dictionary_only"},
91 | "unique" => ["name"],
92 | "data_path" => "./sample-data"
93 | }
94 | expect(config.data.keys).to include('dictionary')
95 | dictionary = config.data.delete 'dictionary'
96 |
97 | expect(dictionary.keys.sort).to eq %w(id code name state population
98 | location.lat location.lon land_area area.water).sort
99 | categories = config.data.delete 'categories'
100 | expect(categories.keys.sort).to eq %w(general general2 general3 general4 general5 geographic).sort
101 | expect(config.data).to eq(default_config)
102 | end
103 |
104 | it "has default page size" do
105 | expect(DataMagic::DEFAULT_PAGE_SIZE).to_not be_nil
106 | expect(config.page_size).to eq(DataMagic::DEFAULT_PAGE_SIZE)
107 | end
108 |
109 | describe "#update_indexed_config" do # rename ... or do this in load_config or something
110 | context "after loading config" do
111 | let(:fixture_path) { "./spec/fixtures/import_with_dictionary" }
112 | before do
113 | config.load_datayaml(fixture_path)
114 | end
115 | it "should be true" do
116 | expect(config.update_indexed_config).to be true
117 | end
118 | it "should set new data_path" do
119 | expect(config.data_path).to eq(fixture_path)
120 | end
121 |
122 | it "twice should be false" do
123 | config.update_indexed_config
124 | expect(config.update_indexed_config).to be false
125 | end
126 | end
127 | end
128 |
129 | describe "when has a custom null_value" do
130 | it 'should have a default null value' do
131 | expect(config.null_value).to eq('NULL')
132 | end
133 |
134 | it 'should set null value field' do
135 | config.load_datayaml("./spec/fixtures/import_with_null_value")
136 | expect(config.null_value).to eq('abc123')
137 | end
138 | end
139 | end
140 |
141 | context ".calculated_field_list" do
142 | let(:config) { DataMagic::Config.new(load_datayaml: false) }
143 | it "finds fields with 'calculate' property" do
144 | allow(config).to receive(:dictionary).and_return(
145 | {
146 | one: {
147 | source: 'column1',
148 | type: 'float'
149 | },
150 | two: {
151 | source: 'column2',
152 | type: 'float'
153 | },
154 | all: {
155 | calculate: 'column1 or column2',
156 | type: 'float',
157 | description: 'something'
158 | }
159 | }
160 | )
161 | expect(config.calculated_field_list).to eq(['all'])
162 | end
163 | end
164 |
165 | context ".only_field_list" do
166 | let(:config) { DataMagic::Config.new(load_datayaml: false) }
167 | let(:simple_fields) do
168 | { 'one' => 'column1', 'two' => 'column2', 'three' => 'column3' }
169 | end
170 | let(:fields_with_dots) do
171 | { 'one' => 'column1', 'two.a' => 'column2a', 'two.b' => 'column2b' }
172 | end
173 |
174 | it "selects a subset" do
175 | expect(config.only_field_list(%w(one two), simple_fields)).to eq(
176 | 'one' => 'column1', 'two' => 'column2'
177 | )
178 | end
179 |
180 | it "selects fields with dots" do
181 | expect(config.only_field_list(%w(two), fields_with_dots)).to eq(
182 | 'two.a' => 'column2a', 'two.b' => 'column2b'
183 | )
184 | end
185 | end
186 | end
187 |
--------------------------------------------------------------------------------
/spec/fixtures/cities_with_yml/cities51-100.csv:
--------------------------------------------------------------------------------
1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG"
2 | "TX","4804000","02409731","Arlington","25","A","365438","144805","248332497","9690024","95.882","3.741","32.700708","-97.124691"
3 | "CA","0603526","02409774","Bakersfield","25","A","347483","120725","368204317","3741691","142.164","1.445","35.321213","-119.018291"
4 | "LA","2255000","00545142","New Orleans","25","A","343829","189896","438803381","468240430","169.423","180.789","30.068636","-89.939007"
5 | "HI","1571550","02630783","Urban Honolulu CDP","57","S","337256","143173","156748036","20484151","60.521","7.909","21.325852","-157.845315"
6 | "CA","0602000","02409704","Anaheim","25","A","336265","104237","129073275","2526668","49.835","0.976","33.855497","-117.760071"
7 | "FL","1271000","02405568","Tampa","25","A","335709","157130","293727878","160127838","113.409","61.826","27.970086","-82.479673"
8 | "CO","0804000","02409757","Aurora","25","A","325078","131040","400759192","1806832","154.734","0.698","39.688002","-104.689740"
9 | "CA","0669000","02411814","Santa Ana","25","A","324528","76896","70627761","643479","27.270","0.248","33.736478","-117.882593"
10 | "MO","2965000","00767557","St. Louis","25","A","319294","176002","160343174","10683076","61.909","4.125","38.635699","-90.244582"
11 | "PA","4261000","01214818","Pittsburgh","25","A","305704","156165","143399923","7693613","55.367","2.971","40.439753","-79.976592"
12 | "TX","4817000","02410234","Corpus Christi","25","A","305215","125469","415982136","852055055","160.612","328.980","27.754252","-97.173385"
13 | "CA","0662000","02410965","Riverside","25","A","303871","98444","210152356","788400","81.140","0.304","33.938143","-117.393168"
14 | "OH","3915000","01086201","Cincinnati","25","A","296943","161095","201869928","4155439","77.942","1.604","39.139902","-84.506446"
15 | "KY","2146027","02405089","Lexington-Fayette urban county","UC","A","295803","135160","734648526","4922803","283.649","1.901","38.040157","-84.458443"
16 | "AK","0203000","02419025","Anchorage municipality","37","A","291826","113032","4415108963","663860984","1704.683","256.318","61.177549","-149.274354"
17 | "CA","0675000","02411987","Stockton","25","A","291707","99637","159723404","7984682","61.670","3.083","37.976342","-121.313304"
18 | "OH","3977000","01086537","Toledo","25","A","287208","138039","208991246","8889079","80.692","3.432","41.664071","-83.581861"
19 | "MN","2758000","02396511","St. Paul","25","A","285068","120795","134623737","10875208","51.979","4.199","44.948869","-93.103855"
20 | "NJ","3451000","00885317","Newark","25","A","277140","109520","62643850","4972876","24.187","1.920","40.724220","-74.172574"
21 | "NC","3728000","02403745","Greensboro","25","A","269666","124074","327673360","13690607","126.515","5.286","36.096483","-79.827108"
22 | "NY","3611000","00978764","Buffalo","25","A","261310","133444","104594197","31364094","40.384","12.110","42.892492","-78.859686"
23 | "TX","4858016","02411437","Plano","25","A","259841","103672","185394655","937663","71.581","0.362","33.050769","-96.747944"
24 | "NE","3128000","02395713","Lincoln","25","A","258379","110546","230804010","3229386","89.114","1.247","40.808957","-96.680354"
25 | "NV","3231900","02410741","Henderson","25","A","257729","113586","279023542","0","107.732","0.000","36.012233","-115.037462"
26 | "IN","1825000","02394798","Fort Wayne","25","A","253691","113541","286500436","553423","110.618","0.214","41.088173","-85.143880"
27 | "NJ","3436000","00885264","Jersey","25","A","247597","108720","38315542","16280557","14.794","6.286","40.711417","-74.064760"
28 | "FL","1263000","02405401","St. Petersburg","25","A","244769","129401","159909751","196473878","61.742","75.859","27.761976","-82.644055"
29 | "CA","0613392","02409461","Chula Vista","25","A","243916","79416","128544675","6380068","49.631","2.463","32.627670","-117.015170"
30 | "VA","5157000","01498557","Norfolk","25","A","242803","95018","140171293","109376999","54.120","42.231","36.923015","-76.244641"
31 | "FL","1253000","02404443","Orlando","25","A","238300","121254","265203107","21469603","102.395","8.289","28.415886","-81.298750"
32 | "AZ","0412000","02409433","Chandler","25","A","236123","94404","166828220","289715","64.413","0.112","33.282874","-111.854943"
33 | "TX","4841464","02411626","Laredo","25","A","236091","68610","230271380","3754983","88.908","1.450","27.547681","-99.486931"
34 | "WI","5548000","01583625","Madison","25","A","233209","108843","198882058","44658619","76.789","17.243","43.087806","-89.430121"
35 | "NC","3775000","02405771","Winston-Salem","25","A","229617","103974","343041264","3228612","132.449","1.247","36.103262","-80.260578"
36 | "TX","4845000","02410892","Lubbock","25","A","229573","95926","317041399","2962034","122.410","1.144","33.566479","-101.886677"
37 | "LA","2205000","02403821","Baton Rouge","25","B","229493","100801","199291656","5588234","76.947","2.158","30.448454","-91.125899"
38 | "NC","3719000","02403521","Durham","25","A","228330","103221","278087581","2357401","107.370","0.910","35.980964","-78.905647"
39 | "TX","4829000","02410572","Garland","25","A","226876","80834","147848881","340126","57.085","0.131","32.909826","-96.630357"
40 | "AZ","0427820","02410596","Glendale","25","A","226721","90505","155337275","401624","59.976","0.155","33.533111","-112.189901"
41 | "NV","3260600","02410923","Reno","25","A","225221","102582","266792840","7423507","103.009","2.866","39.474487","-119.776538"
42 | "FL","1230000","02404689","Hialeah","25","A","224669","74067","55554697","3599730","21.450","1.390","25.869941","-80.302865"
43 | "NV","3254600","02409023","Paradise CDP","57","S","223167","114296","120996826","0","46.717","0.000","36.080689","-115.136839"
44 | "VA","5116000","01498558","Chesapeake","25","A","222209","83196","882669156","26052854","340.800","10.059","36.679376","-76.301788"
45 | "AZ","0465000","02411845","Scottsdale","25","A","217385","124001","476350341","1231086","183.920","0.475","33.668727","-111.823682"
46 | "NV","3251800","02411273","North Las Vegas","25","A","216961","76073","262483131","112001","101.345","0.043","36.282974","-115.089262"
47 | "TX","4837000","02410117","Irving","25","A","216290","91128","173573892","2594600","67.017","1.002","32.857748","-96.970022"
48 | "CA","0626000","02410545","Fremont","25","A","214089","73989","200617968","26291598","77.459","10.151","37.494373","-121.941117"
49 | "CA","0636770","02410116","Irvine","25","A","212375","83899","171214072","900908","66.106","0.348","33.678399","-117.771254"
50 | "AL","0107000","02403868","Birmingham","25","A","212237","108981","378310927","6590665","146.067","2.545","33.527444","-86.799047"
51 | "NY","3663000","00979426","Rochester","25","A","210565","97158","92671789","3558427","35.781","1.374","43.169927","-77.616891"
52 |
--------------------------------------------------------------------------------
/spec/fixtures/cities_without_yml/cities51-100.csv:
--------------------------------------------------------------------------------
1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG"
2 | "TX","4804000","02409731","Arlington","25","A","365438","144805","248332497","9690024","95.882","3.741","32.700708","-97.124691"
3 | "CA","0603526","02409774","Bakersfield","25","A","347483","120725","368204317","3741691","142.164","1.445","35.321213","-119.018291"
4 | "LA","2255000","00545142","New Orleans","25","A","343829","189896","438803381","468240430","169.423","180.789","30.068636","-89.939007"
5 | "HI","1571550","02630783","Urban Honolulu CDP","57","S","337256","143173","156748036","20484151","60.521","7.909","21.325852","-157.845315"
6 | "CA","0602000","02409704","Anaheim","25","A","336265","104237","129073275","2526668","49.835","0.976","33.855497","-117.760071"
7 | "FL","1271000","02405568","Tampa","25","A","335709","157130","293727878","160127838","113.409","61.826","27.970086","-82.479673"
8 | "CO","0804000","02409757","Aurora","25","A","325078","131040","400759192","1806832","154.734","0.698","39.688002","-104.689740"
9 | "CA","0669000","02411814","Santa Ana","25","A","324528","76896","70627761","643479","27.270","0.248","33.736478","-117.882593"
10 | "MO","2965000","00767557","St. Louis","25","A","319294","176002","160343174","10683076","61.909","4.125","38.635699","-90.244582"
11 | "PA","4261000","01214818","Pittsburgh","25","A","305704","156165","143399923","7693613","55.367","2.971","40.439753","-79.976592"
12 | "TX","4817000","02410234","Corpus Christi","25","A","305215","125469","415982136","852055055","160.612","328.980","27.754252","-97.173385"
13 | "CA","0662000","02410965","Riverside","25","A","303871","98444","210152356","788400","81.140","0.304","33.938143","-117.393168"
14 | "OH","3915000","01086201","Cincinnati","25","A","296943","161095","201869928","4155439","77.942","1.604","39.139902","-84.506446"
15 | "KY","2146027","02405089","Lexington-Fayette urban county","UC","A","295803","135160","734648526","4922803","283.649","1.901","38.040157","-84.458443"
16 | "AK","0203000","02419025","Anchorage municipality","37","A","291826","113032","4415108963","663860984","1704.683","256.318","61.177549","-149.274354"
17 | "CA","0675000","02411987","Stockton","25","A","291707","99637","159723404","7984682","61.670","3.083","37.976342","-121.313304"
18 | "OH","3977000","01086537","Toledo","25","A","287208","138039","208991246","8889079","80.692","3.432","41.664071","-83.581861"
19 | "MN","2758000","02396511","St. Paul","25","A","285068","120795","134623737","10875208","51.979","4.199","44.948869","-93.103855"
20 | "NJ","3451000","00885317","Newark","25","A","277140","109520","62643850","4972876","24.187","1.920","40.724220","-74.172574"
21 | "NC","3728000","02403745","Greensboro","25","A","269666","124074","327673360","13690607","126.515","5.286","36.096483","-79.827108"
22 | "NY","3611000","00978764","Buffalo","25","A","261310","133444","104594197","31364094","40.384","12.110","42.892492","-78.859686"
23 | "TX","4858016","02411437","Plano","25","A","259841","103672","185394655","937663","71.581","0.362","33.050769","-96.747944"
24 | "NE","3128000","02395713","Lincoln","25","A","258379","110546","230804010","3229386","89.114","1.247","40.808957","-96.680354"
25 | "NV","3231900","02410741","Henderson","25","A","257729","113586","279023542","0","107.732","0.000","36.012233","-115.037462"
26 | "IN","1825000","02394798","Fort Wayne","25","A","253691","113541","286500436","553423","110.618","0.214","41.088173","-85.143880"
27 | "NJ","3436000","00885264","Jersey","25","A","247597","108720","38315542","16280557","14.794","6.286","40.711417","-74.064760"
28 | "FL","1263000","02405401","St. Petersburg","25","A","244769","129401","159909751","196473878","61.742","75.859","27.761976","-82.644055"
29 | "CA","0613392","02409461","Chula Vista","25","A","243916","79416","128544675","6380068","49.631","2.463","32.627670","-117.015170"
30 | "VA","5157000","01498557","Norfolk","25","A","242803","95018","140171293","109376999","54.120","42.231","36.923015","-76.244641"
31 | "FL","1253000","02404443","Orlando","25","A","238300","121254","265203107","21469603","102.395","8.289","28.415886","-81.298750"
32 | "AZ","0412000","02409433","Chandler","25","A","236123","94404","166828220","289715","64.413","0.112","33.282874","-111.854943"
33 | "TX","4841464","02411626","Laredo","25","A","236091","68610","230271380","3754983","88.908","1.450","27.547681","-99.486931"
34 | "WI","5548000","01583625","Madison","25","A","233209","108843","198882058","44658619","76.789","17.243","43.087806","-89.430121"
35 | "NC","3775000","02405771","Winston-Salem","25","A","229617","103974","343041264","3228612","132.449","1.247","36.103262","-80.260578"
36 | "TX","4845000","02410892","Lubbock","25","A","229573","95926","317041399","2962034","122.410","1.144","33.566479","-101.886677"
37 | "LA","2205000","02403821","Baton Rouge","25","B","229493","100801","199291656","5588234","76.947","2.158","30.448454","-91.125899"
38 | "NC","3719000","02403521","Durham","25","A","228330","103221","278087581","2357401","107.370","0.910","35.980964","-78.905647"
39 | "TX","4829000","02410572","Garland","25","A","226876","80834","147848881","340126","57.085","0.131","32.909826","-96.630357"
40 | "AZ","0427820","02410596","Glendale","25","A","226721","90505","155337275","401624","59.976","0.155","33.533111","-112.189901"
41 | "NV","3260600","02410923","Reno","25","A","225221","102582","266792840","7423507","103.009","2.866","39.474487","-119.776538"
42 | "FL","1230000","02404689","Hialeah","25","A","224669","74067","55554697","3599730","21.450","1.390","25.869941","-80.302865"
43 | "NV","3254600","02409023","Paradise CDP","57","S","223167","114296","120996826","0","46.717","0.000","36.080689","-115.136839"
44 | "VA","5116000","01498558","Chesapeake","25","A","222209","83196","882669156","26052854","340.800","10.059","36.679376","-76.301788"
45 | "AZ","0465000","02411845","Scottsdale","25","A","217385","124001","476350341","1231086","183.920","0.475","33.668727","-111.823682"
46 | "NV","3251800","02411273","North Las Vegas","25","A","216961","76073","262483131","112001","101.345","0.043","36.282974","-115.089262"
47 | "TX","4837000","02410117","Irving","25","A","216290","91128","173573892","2594600","67.017","1.002","32.857748","-96.970022"
48 | "CA","0626000","02410545","Fremont","25","A","214089","73989","200617968","26291598","77.459","10.151","37.494373","-121.941117"
49 | "CA","0636770","02410116","Irvine","25","A","212375","83899","171214072","900908","66.106","0.348","33.678399","-117.771254"
50 | "AL","0107000","02403868","Birmingham","25","A","212237","108981","378310927","6590665","146.067","2.545","33.527444","-86.799047"
51 | "NY","3663000","00979426","Rochester","25","A","210565","97158","92671789","3558427","35.781","1.374","43.169927","-77.616891"
52 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_dictionary/cities51-100.csv:
--------------------------------------------------------------------------------
1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG"
2 | "TX","4804000","02409731","Arlington","25","A","365438","144805","248332497","9690024","95.882","3.741","32.700708","-97.124691"
3 | "CA","0603526","02409774","Bakersfield","25","A","347483","120725","368204317","3741691","142.164","1.445","35.321213","-119.018291"
4 | "LA","2255000","00545142","New Orleans","25","A","343829","189896","438803381","468240430","169.423","180.789","30.068636","-89.939007"
5 | "HI","1571550","02630783","Urban Honolulu CDP","57","S","337256","143173","156748036","20484151","60.521","7.909","21.325852","-157.845315"
6 | "CA","0602000","02409704","Anaheim","25","A","336265","104237","129073275","2526668","49.835","0.976","33.855497","-117.760071"
7 | "FL","1271000","02405568","Tampa","25","A","335709","157130","293727878","160127838","113.409","61.826","27.970086","-82.479673"
8 | "CO","0804000","02409757","Aurora","25","A","325078","131040","400759192","1806832","154.734","0.698","39.688002","-104.689740"
9 | "CA","0669000","02411814","Santa Ana","25","A","324528","76896","70627761","643479","27.270","0.248","33.736478","-117.882593"
10 | "MO","2965000","00767557","St. Louis","25","A","319294","176002","160343174","10683076","61.909","4.125","38.635699","-90.244582"
11 | "PA","4261000","01214818","Pittsburgh","25","A","305704","156165","143399923","7693613","55.367","2.971","40.439753","-79.976592"
12 | "TX","4817000","02410234","Corpus Christi","25","A","305215","125469","415982136","852055055","160.612","328.980","27.754252","-97.173385"
13 | "CA","0662000","02410965","Riverside","25","A","303871","98444","210152356","788400","81.140","0.304","33.938143","-117.393168"
14 | "OH","3915000","01086201","Cincinnati","25","A","296943","161095","201869928","4155439","77.942","1.604","39.139902","-84.506446"
15 | "KY","2146027","02405089","Lexington-Fayette urban county","UC","A","295803","135160","734648526","4922803","283.649","1.901","38.040157","-84.458443"
16 | "AK","0203000","02419025","Anchorage municipality","37","A","291826","113032","4415108963","663860984","1704.683","256.318","61.177549","-149.274354"
17 | "CA","0675000","02411987","Stockton","25","A","291707","99637","159723404","7984682","61.670","3.083","37.976342","-121.313304"
18 | "OH","3977000","01086537","Toledo","25","A","287208","138039","208991246","8889079","80.692","3.432","41.664071","-83.581861"
19 | "MN","2758000","02396511","St. Paul","25","A","285068","120795","134623737","10875208","51.979","4.199","44.948869","-93.103855"
20 | "NJ","3451000","00885317","Newark","25","A","277140","109520","62643850","4972876","24.187","1.920","40.724220","-74.172574"
21 | "NC","3728000","02403745","Greensboro","25","A","269666","124074","327673360","13690607","126.515","5.286","36.096483","-79.827108"
22 | "NY","3611000","00978764","Buffalo","25","A","261310","133444","104594197","31364094","40.384","12.110","42.892492","-78.859686"
23 | "TX","4858016","02411437","Plano","25","A","259841","103672","185394655","937663","71.581","0.362","33.050769","-96.747944"
24 | "NE","3128000","02395713","Lincoln","25","A","258379","110546","230804010","3229386","89.114","1.247","40.808957","-96.680354"
25 | "NV","3231900","02410741","Henderson","25","A","257729","113586","279023542","0","107.732","0.000","36.012233","-115.037462"
26 | "IN","1825000","02394798","Fort Wayne","25","A","253691","113541","286500436","553423","110.618","0.214","41.088173","-85.143880"
27 | "NJ","3436000","00885264","Jersey","25","A","247597","108720","38315542","16280557","14.794","6.286","40.711417","-74.064760"
28 | "FL","1263000","02405401","St. Petersburg","25","A","244769","129401","159909751","196473878","61.742","75.859","27.761976","-82.644055"
29 | "CA","0613392","02409461","Chula Vista","25","A","243916","79416","128544675","6380068","49.631","2.463","32.627670","-117.015170"
30 | "VA","5157000","01498557","Norfolk","25","A","242803","95018","140171293","109376999","54.120","42.231","36.923015","-76.244641"
31 | "FL","1253000","02404443","Orlando","25","A","238300","121254","265203107","21469603","102.395","8.289","28.415886","-81.298750"
32 | "AZ","0412000","02409433","Chandler","25","A","236123","94404","166828220","289715","64.413","0.112","33.282874","-111.854943"
33 | "TX","4841464","02411626","Laredo","25","A","236091","68610","230271380","3754983","88.908","1.450","27.547681","-99.486931"
34 | "WI","5548000","01583625","Madison","25","A","233209","108843","198882058","44658619","76.789","17.243","43.087806","-89.430121"
35 | "NC","3775000","02405771","Winston-Salem","25","A","229617","103974","343041264","3228612","132.449","1.247","36.103262","-80.260578"
36 | "TX","4845000","02410892","Lubbock","25","A","229573","95926","317041399","2962034","122.410","1.144","33.566479","-101.886677"
37 | "LA","2205000","02403821","Baton Rouge","25","B","229493","100801","199291656","5588234","76.947","2.158","30.448454","-91.125899"
38 | "NC","3719000","02403521","Durham","25","A","228330","103221","278087581","2357401","107.370","0.910","35.980964","-78.905647"
39 | "TX","4829000","02410572","Garland","25","A","226876","80834","147848881","340126","57.085","0.131","32.909826","-96.630357"
40 | "AZ","0427820","02410596","Glendale","25","A","226721","90505","155337275","401624","59.976","0.155","33.533111","-112.189901"
41 | "NV","3260600","02410923","Reno","25","A","225221","102582","266792840","7423507","103.009","2.866","39.474487","-119.776538"
42 | "FL","1230000","02404689","Hialeah","25","A","224669","74067","55554697","3599730","21.450","1.390","25.869941","-80.302865"
43 | "NV","3254600","02409023","Paradise CDP","57","S","223167","114296","120996826","0","46.717","0.000","36.080689","-115.136839"
44 | "VA","5116000","01498558","Chesapeake","25","A","222209","83196","882669156","26052854","340.800","10.059","36.679376","-76.301788"
45 | "AZ","0465000","02411845","Scottsdale","25","A","217385","124001","476350341","1231086","183.920","0.475","33.668727","-111.823682"
46 | "NV","3251800","02411273","North Las Vegas","25","A","216961","76073","262483131","112001","101.345","0.043","36.282974","-115.089262"
47 | "TX","4837000","02410117","Irving","25","A","216290","91128","173573892","2594600","67.017","1.002","32.857748","-96.970022"
48 | "CA","0626000","02410545","Fremont","25","A","214089","73989","200617968","26291598","77.459","10.151","37.494373","-121.941117"
49 | "CA","0636770","02410116","Irvine","25","A","212375","83899","171214072","900908","66.106","0.348","33.678399","-117.771254"
50 | "AL","0107000","02403868","Birmingham","25","A","212237","108981","378310927","6590665","146.067","2.545","33.527444","-86.799047"
51 | "NY","3663000","00979426","Rochester","25","A","210565","97158","92671789","3558427","35.781","1.374","43.169927","-77.616891"
52 |
--------------------------------------------------------------------------------
/spec/fixtures/cities_with_yml/cities50.csv:
--------------------------------------------------------------------------------
1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG"
2 | "NY","3651000","02395220","New York","25","A","8175133","3371062","783842402","429527437","302.643","165.841","40.664274","-73.938500"
3 | "CA","0644000","02410877","Los Angeles","25","A","3792621","1413995","1213850147","88119442","468.670","34.023","34.019394","-118.410825"
4 | "IL","1714000","00428803","Chicago","25","A","2695598","1194337","589571105","16781658","227.635","6.479","41.837551","-87.681844"
5 | "TX","4835000","02410796","Houston","25","A","2099451","892646","1552929379","72277296","599.589","27.906","29.780472","-95.386342"
6 | "PA","4260000","01215531","Philadelphia","25","A","1526006","670171","347321129","22289408","134.101","8.606","40.009376","-75.133346"
7 | "AZ","0455000","02411414","Phoenix","25","A","1445632","590149","1338256106","3221362","516.704","1.244","33.572162","-112.087966"
8 | "TX","4865000","02411774","San Antonio","25","A","1327407","524246","1193811736","14968532","460.933","5.779","29.472403","-98.525142"
9 | "CA","0666000","02411782","San Diego","25","A","1307402","516033","842233444","122273028","325.188","47.210","32.815300","-117.134993"
10 | "TX","4819000","02410288","Dallas","25","A","1197816","516639","881939036","117394747","340.519","45.326","32.794176","-96.765503"
11 | "CA","0668000","02411790","San Jose","25","A","945942","314038","457201438","8907829","176.526","3.439","37.296867","-121.819306"
12 | "FL","1235000","02404783","Jacksonville","25","A","821784","366273","1934729255","330550113","747.003","127.626","30.337019","-81.661302"
13 | "IN","1836003","02395424","Indianapolis (balance)","00","F","820445","379856","936107645","17072644","361.433","6.592","39.776664","-86.145935"
14 | "CA","0667000","02411786","San Francisco","25","A","805235","376942","121399963","479190317","46.873","185.016","37.727239","-123.032229"
15 | "TX","4805000","02409761","Austin","25","A","790390","354241","771546901","18560605","297.896","7.166","30.307182","-97.755996"
16 | "OH","3918000","01086101","Columbus","25","A","787033","370965","562466164","15373425","217.169","5.936","39.984799","-82.985044"
17 | "TX","4827000","02410531","Fort Worth","25","A","741206","291086","880127783","20832365","339.819","8.043","32.779542","-97.346335"
18 | "NC","3712000","02404032","Charlotte","25","A","731424","319918","770983559","5148533","297.678","1.988","35.208707","-80.830739"
19 | "MI","2622000","01626181","Detroit","25","A","713777","349170","359361795","10667148","138.750","4.119","42.383037","-83.102237"
20 | "TX","4824000","02410414","El Paso","25","A","649121","227605","661056631","2642037","255.235","1.020","31.848360","-106.426979"
21 | "TN","4748000","02405068","Memphis","25","A","646889","291883","815988030","23176952","315.055","8.949","35.103543","-89.978498"
22 | "MD","2404000","01702381","Baltimore","25","A","620961","296685","209643241","28768302","80.944","11.108","39.300213","-76.610516"
23 | "MA","2507000","00619463","Boston","25","A","617594","272481","125037462","107130299","48.277","41.363","42.331960","-71.020173"
24 | "WA","5363000","02411856","Seattle","25","A","608660","308516","217410345","152055857","83.943","58.709","47.620499","-122.350876"
25 | "DC","1150000","02390665","Washington","25","N","601723","296719","158114680","18884970","61.048","7.292","38.904149","-77.017094"
26 | "TN","4752006","02405092","Nashville-Davidson metropolitan government (balance)","00","F","601222","272622","1230569857","56261385","475.126","21.723","36.171800","-86.785002"
27 | "CO","0820000","02410324","Denver","25","A","600158","285797","396268588","4225359","153.000","1.631","39.761849","-104.880625"
28 | "KY","2148006","01967434","Louisville/Jefferson County metro government (balance)","00","F","597337","270928","842389102","43938966","325.248","16.965","38.178077","-85.666708"
29 | "WI","5553000","01583724","Milwaukee","25","A","594833","255569","248955419","1755108","96.122","0.678","43.063348","-87.966695"
30 | "OR","4159000","02411471","Portland","25","A","583776","265439","345573252","30196598","133.427","11.659","45.536951","-122.649971"
31 | "NV","3240000","02411630","Las Vegas","25","A","583756","243701","351758506","133986","135.815","0.052","36.227712","-115.264045"
32 | "OK","4055000","02411311","Oklahoma","25","A","579999","256930","1570595903","37324163","606.410","14.411","35.467079","-97.513657"
33 | "NM","3502000","02409678","Albuquerque","25","A","545852","239166","486218256","4702696","187.730","1.816","35.105552","-106.647388"
34 | "AZ","0477000","02412104","Tucson","25","A","520116","229762","587173307","842214","226.709","0.325","32.154289","-110.871062"
35 | "CA","0627000","02410546","Fresno","25","A","494665","171288","289966538","910373","111.957","0.351","36.782674","-119.794492"
36 | "CA","0664000","02411751","Sacramento","25","A","466488","190911","253599699","5673097","97.915","2.190","38.566592","-121.468632"
37 | "CA","0643000","02410866","Long Beach","25","A","462257","176032","130259313","2963688","50.293","1.144","33.809102","-118.155327"
38 | "MO","2938000","02395492","Kansas","25","A","459787","221860","815717156","10575319","314.950","4.083","39.125212","-94.551136"
39 | "AZ","0446000","02411087","Mesa","25","A","439041","201173","353409966","1580758","136.452","0.610","33.401926","-111.717379"
40 | "VA","5182000","01498559","Virginia Beach","25","A","437994","177879","644948896","643150238","249.016","248.322","36.779322","-76.024020"
41 | "GA","1304000","02403126","Atlanta","25","A","420003","224573","344861307","2219186","133.152","0.857","33.762909","-84.422675"
42 | "CO","0816000","02410198","Colorado Springs","25","A","416427","179607","503857525","947180","194.540","0.366","38.867255","-104.760749"
43 | "NE","3137000","02396064","Omaha","25","A","408958","177518","329157346","9036153","127.088","3.489","41.264675","-96.041927"
44 | "NC","3755000","02404590","Raleigh","25","A","403892","176124","370117231","2846320","142.903","1.099","35.830204","-78.641439"
45 | "FL","1245000","02404247","Miami","25","A","399457","183994","92905577","52298641","35.871","20.193","25.775163","-80.208615"
46 | "OH","3916000","01085963","Cleveland","25","A","396815","207536","201234202","12351419","77.697","4.769","41.478138","-81.679486"
47 | "OK","4075000","02412110","Tulsa","25","A","391906","185127","509590364","10967865","196.754","4.235","36.127949","-95.902316"
48 | "CA","0653000","02411292","Oakland","25","A","390724","169710","144484543","57539591","55.786","22.216","37.769857","-122.225640"
49 | "MN","2743000","02395345","Minneapolis","25","A","382578","178287","139789184","9052448","53.973","3.495","44.963323","-93.268284"
50 | "KS","2079000","00485662","Wichita","25","A","382368","167310","412571486","11136472","159.295","4.300","37.690694","-97.342678"
51 | "PR","7276770","02414943","San Juan zona urbana","62","S","381931","194316","102305007","17914425","39.500","6.917","18.406409","-66.064004"
52 |
--------------------------------------------------------------------------------
/spec/fixtures/cities_without_yml/cities50.csv:
--------------------------------------------------------------------------------
1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG"
2 | "NY","3651000","02395220","New York","25","A","8175133","3371062","783842402","429527437","302.643","165.841","40.664274","-73.938500"
3 | "CA","0644000","02410877","Los Angeles","25","A","3792621","1413995","1213850147","88119442","468.670","34.023","34.019394","-118.410825"
4 | "IL","1714000","00428803","Chicago","25","A","2695598","1194337","589571105","16781658","227.635","6.479","41.837551","-87.681844"
5 | "TX","4835000","02410796","Houston","25","A","2099451","892646","1552929379","72277296","599.589","27.906","29.780472","-95.386342"
6 | "PA","4260000","01215531","Philadelphia","25","A","1526006","670171","347321129","22289408","134.101","8.606","40.009376","-75.133346"
7 | "AZ","0455000","02411414","Phoenix","25","A","1445632","590149","1338256106","3221362","516.704","1.244","33.572162","-112.087966"
8 | "TX","4865000","02411774","San Antonio","25","A","1327407","524246","1193811736","14968532","460.933","5.779","29.472403","-98.525142"
9 | "CA","0666000","02411782","San Diego","25","A","1307402","516033","842233444","122273028","325.188","47.210","32.815300","-117.134993"
10 | "TX","4819000","02410288","Dallas","25","A","1197816","516639","881939036","117394747","340.519","45.326","32.794176","-96.765503"
11 | "CA","0668000","02411790","San Jose","25","A","945942","314038","457201438","8907829","176.526","3.439","37.296867","-121.819306"
12 | "FL","1235000","02404783","Jacksonville","25","A","821784","366273","1934729255","330550113","747.003","127.626","30.337019","-81.661302"
13 | "IN","1836003","02395424","Indianapolis (balance)","00","F","820445","379856","936107645","17072644","361.433","6.592","39.776664","-86.145935"
14 | "CA","0667000","02411786","San Francisco","25","A","805235","376942","121399963","479190317","46.873","185.016","37.727239","-123.032229"
15 | "TX","4805000","02409761","Austin","25","A","790390","354241","771546901","18560605","297.896","7.166","30.307182","-97.755996"
16 | "OH","3918000","01086101","Columbus","25","A","787033","370965","562466164","15373425","217.169","5.936","39.984799","-82.985044"
17 | "TX","4827000","02410531","Fort Worth","25","A","741206","291086","880127783","20832365","339.819","8.043","32.779542","-97.346335"
18 | "NC","3712000","02404032","Charlotte","25","A","731424","319918","770983559","5148533","297.678","1.988","35.208707","-80.830739"
19 | "MI","2622000","01626181","Detroit","25","A","713777","349170","359361795","10667148","138.750","4.119","42.383037","-83.102237"
20 | "TX","4824000","02410414","El Paso","25","A","649121","227605","661056631","2642037","255.235","1.020","31.848360","-106.426979"
21 | "TN","4748000","02405068","Memphis","25","A","646889","291883","815988030","23176952","315.055","8.949","35.103543","-89.978498"
22 | "MD","2404000","01702381","Baltimore","25","A","620961","296685","209643241","28768302","80.944","11.108","39.300213","-76.610516"
23 | "MA","2507000","00619463","Boston","25","A","617594","272481","125037462","107130299","48.277","41.363","42.331960","-71.020173"
24 | "WA","5363000","02411856","Seattle","25","A","608660","308516","217410345","152055857","83.943","58.709","47.620499","-122.350876"
25 | "DC","1150000","02390665","Washington","25","N","601723","296719","158114680","18884970","61.048","7.292","38.904149","-77.017094"
26 | "TN","4752006","02405092","Nashville-Davidson metropolitan government (balance)","00","F","601222","272622","1230569857","56261385","475.126","21.723","36.171800","-86.785002"
27 | "CO","0820000","02410324","Denver","25","A","600158","285797","396268588","4225359","153.000","1.631","39.761849","-104.880625"
28 | "KY","2148006","01967434","Louisville/Jefferson County metro government (balance)","00","F","597337","270928","842389102","43938966","325.248","16.965","38.178077","-85.666708"
29 | "WI","5553000","01583724","Milwaukee","25","A","594833","255569","248955419","1755108","96.122","0.678","43.063348","-87.966695"
30 | "OR","4159000","02411471","Portland","25","A","583776","265439","345573252","30196598","133.427","11.659","45.536951","-122.649971"
31 | "NV","3240000","02411630","Las Vegas","25","A","583756","243701","351758506","133986","135.815","0.052","36.227712","-115.264045"
32 | "OK","4055000","02411311","Oklahoma","25","A","579999","256930","1570595903","37324163","606.410","14.411","35.467079","-97.513657"
33 | "NM","3502000","02409678","Albuquerque","25","A","545852","239166","486218256","4702696","187.730","1.816","35.105552","-106.647388"
34 | "AZ","0477000","02412104","Tucson","25","A","520116","229762","587173307","842214","226.709","0.325","32.154289","-110.871062"
35 | "CA","0627000","02410546","Fresno","25","A","494665","171288","289966538","910373","111.957","0.351","36.782674","-119.794492"
36 | "CA","0664000","02411751","Sacramento","25","A","466488","190911","253599699","5673097","97.915","2.190","38.566592","-121.468632"
37 | "CA","0643000","02410866","Long Beach","25","A","462257","176032","130259313","2963688","50.293","1.144","33.809102","-118.155327"
38 | "MO","2938000","02395492","Kansas","25","A","459787","221860","815717156","10575319","314.950","4.083","39.125212","-94.551136"
39 | "AZ","0446000","02411087","Mesa","25","A","439041","201173","353409966","1580758","136.452","0.610","33.401926","-111.717379"
40 | "VA","5182000","01498559","Virginia Beach","25","A","437994","177879","644948896","643150238","249.016","248.322","36.779322","-76.024020"
41 | "GA","1304000","02403126","Atlanta","25","A","420003","224573","344861307","2219186","133.152","0.857","33.762909","-84.422675"
42 | "CO","0816000","02410198","Colorado Springs","25","A","416427","179607","503857525","947180","194.540","0.366","38.867255","-104.760749"
43 | "NE","3137000","02396064","Omaha","25","A","408958","177518","329157346","9036153","127.088","3.489","41.264675","-96.041927"
44 | "NC","3755000","02404590","Raleigh","25","A","403892","176124","370117231","2846320","142.903","1.099","35.830204","-78.641439"
45 | "FL","1245000","02404247","Miami","25","A","399457","183994","92905577","52298641","35.871","20.193","25.775163","-80.208615"
46 | "OH","3916000","01085963","Cleveland","25","A","396815","207536","201234202","12351419","77.697","4.769","41.478138","-81.679486"
47 | "OK","4075000","02412110","Tulsa","25","A","391906","185127","509590364","10967865","196.754","4.235","36.127949","-95.902316"
48 | "CA","0653000","02411292","Oakland","25","A","390724","169710","144484543","57539591","55.786","22.216","37.769857","-122.225640"
49 | "MN","2743000","02395345","Minneapolis","25","A","382578","178287","139789184","9052448","53.973","3.495","44.963323","-93.268284"
50 | "KS","2079000","00485662","Wichita","25","A","382368","167310","412571486","11136472","159.295","4.300","37.690694","-97.342678"
51 | "PR","7276770","02414943","San Juan zona urbana","62","S","381931","194316","102305007","17914425","39.500","6.917","18.406409","-66.064004"
52 |
--------------------------------------------------------------------------------
/spec/fixtures/import_with_dictionary/cities50.csv:
--------------------------------------------------------------------------------
1 | "USPS","GEOID","ANSICODE","NAME","LSAD","FUNCSTAT","POP10","HU10","ALAND","AWATER","ALAND_SQMI","AWATER_SQMI","INTPTLAT","INTPTLONG"
2 | "NY","3651000","02395220","New York","25","A","8175133","3371062","783842402","429527437","302.643","165.841","40.664274","-73.938500"
3 | "CA","0644000","02410877","Los Angeles","25","A","3792621","1413995","1213850147","88119442","468.670","34.023","34.019394","-118.410825"
4 | "IL","1714000","00428803","Chicago","25","A","2695598","1194337","589571105","16781658","227.635","6.479","41.837551","-87.681844"
5 | "TX","4835000","02410796","Houston","25","A","2099451","892646","1552929379","72277296","599.589","27.906","29.780472","-95.386342"
6 | "PA","4260000","01215531","Philadelphia","25","A","1526006","670171","347321129","22289408","134.101","8.606","40.009376","-75.133346"
7 | "AZ","0455000","02411414","Phoenix","25","A","1445632","590149","1338256106","3221362","516.704","1.244","33.572162","-112.087966"
8 | "TX","4865000","02411774","San Antonio","25","A","1327407","524246","1193811736","14968532","460.933","5.779","29.472403","-98.525142"
9 | "CA","0666000","02411782","San Diego","25","A","1307402","516033","842233444","122273028","325.188","47.210","32.815300","-117.134993"
10 | "TX","4819000","02410288","Dallas","25","A","1197816","516639","881939036","117394747","340.519","45.326","32.794176","-96.765503"
11 | "CA","0668000","02411790","San Jose","25","A","945942","314038","457201438","8907829","176.526","3.439","37.296867","-121.819306"
12 | "FL","1235000","02404783","Jacksonville","25","A","821784","366273","1934729255","330550113","747.003","127.626","30.337019","-81.661302"
13 | "IN","1836003","02395424","Indianapolis (balance)","00","F","820445","379856","936107645","17072644","361.433","6.592","39.776664","-86.145935"
14 | "CA","0667000","02411786","San Francisco","25","A","805235","376942","121399963","479190317","46.873","185.016","37.727239","-123.032229"
15 | "TX","4805000","02409761","Austin","25","A","790390","354241","771546901","18560605","297.896","7.166","30.307182","-97.755996"
16 | "OH","3918000","01086101","Columbus","25","A","787033","370965","562466164","15373425","217.169","5.936","39.984799","-82.985044"
17 | "TX","4827000","02410531","Fort Worth","25","A","741206","291086","880127783","20832365","339.819","8.043","32.779542","-97.346335"
18 | "NC","3712000","02404032","Charlotte","25","A","731424","319918","770983559","5148533","297.678","1.988","35.208707","-80.830739"
19 | "MI","2622000","01626181","Detroit","25","A","713777","349170","359361795","10667148","138.750","4.119","42.383037","-83.102237"
20 | "TX","4824000","02410414","El Paso","25","A","649121","227605","661056631","2642037","255.235","1.020","31.848360","-106.426979"
21 | "TN","4748000","02405068","Memphis","25","A","646889","291883","815988030","23176952","315.055","8.949","35.103543","-89.978498"
22 | "MD","2404000","01702381","Baltimore","25","A","620961","296685","209643241","28768302","80.944","11.108","39.300213","-76.610516"
23 | "MA","2507000","00619463","Boston","25","A","617594","272481","125037462","107130299","48.277","41.363","42.331960","-71.020173"
24 | "WA","5363000","02411856","Seattle","25","A","608660","308516","217410345","152055857","83.943","58.709","47.620499","-122.350876"
25 | "DC","1150000","02390665","Washington","25","N","601723","296719","158114680","18884970","61.048","7.292","38.904149","-77.017094"
26 | "TN","4752006","02405092","Nashville-Davidson metropolitan government (balance)","00","F","601222","272622","1230569857","56261385","475.126","21.723","36.171800","-86.785002"
27 | "CO","0820000","02410324","Denver","25","A","600158","285797","396268588","4225359","153.000","1.631","39.761849","-104.880625"
28 | "KY","2148006","01967434","Louisville/Jefferson County metro government (balance)","00","F","597337","270928","842389102","43938966","325.248","16.965","38.178077","-85.666708"
29 | "WI","5553000","01583724","Milwaukee","25","A","594833","255569","248955419","1755108","96.122","0.678","43.063348","-87.966695"
30 | "OR","4159000","02411471","Portland","25","A","583776","265439","345573252","30196598","133.427","11.659","45.536951","-122.649971"
31 | "NV","3240000","02411630","Las Vegas","25","A","583756","243701","351758506","133986","135.815","0.052","36.227712","-115.264045"
32 | "OK","4055000","02411311","Oklahoma","25","A","579999","256930","1570595903","37324163","606.410","14.411","35.467079","-97.513657"
33 | "NM","3502000","02409678","Albuquerque","25","A","545852","239166","486218256","4702696","187.730","1.816","35.105552","-106.647388"
34 | "AZ","0477000","02412104","Tucson","25","A","520116","229762","587173307","842214","226.709","0.325","32.154289","-110.871062"
35 | "CA","0627000","02410546","Fresno","25","A","494665","171288","289966538","910373","111.957","0.351","36.782674","-119.794492"
36 | "CA","0664000","02411751","Sacramento","25","A","466488","190911","253599699","5673097","97.915","2.190","38.566592","-121.468632"
37 | "CA","0643000","02410866","Long Beach","25","A","462257","176032","130259313","2963688","50.293","1.144","33.809102","-118.155327"
38 | "MO","2938000","02395492","Kansas","25","A","459787","221860","815717156","10575319","314.950","4.083","39.125212","-94.551136"
39 | "AZ","0446000","02411087","Mesa","25","A","439041","201173","353409966","1580758","136.452","0.610","33.401926","-111.717379"
40 | "VA","5182000","01498559","Virginia Beach","25","A","437994","177879","644948896","643150238","249.016","248.322","36.779322","-76.024020"
41 | "GA","1304000","02403126","Atlanta","25","A","420003","224573","344861307","2219186","133.152","0.857","33.762909","-84.422675"
42 | "CO","0816000","02410198","Colorado Springs","25","A","416427","179607","503857525","947180","194.540","0.366","38.867255","-104.760749"
43 | "NE","3137000","02396064","Omaha","25","A","408958","177518","329157346","9036153","127.088","3.489","41.264675","-96.041927"
44 | "NC","3755000","02404590","Raleigh","25","A","403892","176124","370117231","2846320","142.903","1.099","35.830204","-78.641439"
45 | "FL","1245000","02404247","Miami","25","A","399457","183994","92905577","52298641","35.871","20.193","25.775163","-80.208615"
46 | "OH","3916000","01085963","Cleveland","25","A","396815","207536","201234202","12351419","77.697","4.769","41.478138","-81.679486"
47 | "OK","4075000","02412110","Tulsa","25","A","391906","185127","509590364","10967865","196.754","4.235","36.127949","-95.902316"
48 | "CA","0653000","02411292","Oakland","25","A","390724","169710","144484543","57539591","55.786","22.216","37.769857","-122.225640"
49 | "MN","2743000","02395345","Minneapolis","25","A","382578","178287","139789184","9052448","53.973","3.495","44.963323","-93.268284"
50 | "KS","2079000","00485662","Wichita","25","A","382368","167310","412571486","11136472","159.295","4.300","37.690694","-97.342678"
51 | "PR","7276770","02414943","San Juan zona urbana","62","S","381931","194316","102305007","17914425","39.500","6.917","18.406409","-66.064004"
52 |
--------------------------------------------------------------------------------
/lib/data_magic/index/document_builder.rb:
--------------------------------------------------------------------------------
1 | require './lib/expression/expression'
2 |
3 | module DataMagic
4 | module Index
5 | module DocumentBuilder
6 | class << self
7 | def logger
8 | DataMagic::Config.logger
9 | end
10 |
11 | # build a nested json document from a csv row
12 | # row: a hash { column_name => value }
13 | # where all column_names and values are strings
14 | # fields: column_name => field_name
15 | # config: DataMagic::Config instance for the dictionary, column types, and NULL value
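#
# Illustrative sketch of the mapping (column and field names here are
# hypothetical, not taken from this repo's fixtures): given a dictionary
# mapping NAME => school.name and POP10 => population (type integer), a row
#   { 'NAME' => 'Chicago', 'POP10' => '2695598' }
# would be built into the nested document
#   { 'school' => { 'name' => 'Chicago' }, 'population' => 2695598 }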
16 | def build(row, builder_data, config)
17 | fields = builder_data.new_field_names
18 | options = builder_data.options
19 | additional = builder_data.additional_data
20 | csv_row = map_column_types(row.to_hash, config)
21 | if fields.empty?
22 | field_values = csv_row
23 | else
24 | field_values = map_field_names(csv_row, fields, options)
25 | end
26 | field_values.merge!(calculated_fields(csv_row, config))
27 | field_values.merge!(lowercase_columns(field_values, config.column_field_types))
28 | field_values.merge!(additional) if additional
29 | doc = NestedHash.new.add(field_values)
30 | doc = parse_nested(doc, options) if options[:nest]
31 | doc = select_only_fields(doc, options[:only]) unless options[:only].nil?
32 | doc
33 | end
34 |
35 | def create(*args)
36 | Document.new(
37 | build(*args)
38 | )
39 | end
40 |
41 | private
42 |
43 | def calculated_fields(row, config)
44 | result = {}
45 | config.calculated_field_list.each do |name|
46 | result[name] = calculate(name, row, config.dictionary)
47 | end
48 | result
49 | end
50 |
51 | # row: a hash (keys may be strings or symbols)
52 | # config: supplies valid_types (an array of allowed types) and the
53 | # per-column type (float, integer, string, ...) via csv_column_type
54 | # returns a hash where values have been coerced to their declared type
55 | # TODO: move type validation to config load time instead
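#
# For example (hypothetical config): if csv_column_type('POP10') is
# 'integer', csv_column_type('NOTES') is 'string', and null_value is 'NULL':
#   map_column_types({ 'POP10' => '2695598', 'NOTES' => 'NULL' }, config)
#   # => { 'POP10' => 2695598, 'NOTES' => nil }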
56 | def map_column_types(row, config)
57 | valid_types = config.valid_types
58 | null_value = config.null_value || 'NULL'
59 |
60 | mapped = {}
61 | row.each do |key, value|
62 | if value == null_value
63 | mapped[key] = nil
64 | else
65 | type = config.csv_column_type(key)
66 | if valid_types.include? type
67 | mapped[key] = fix_field_type(type, value, key)
68 | else
69 | fail InvalidDictionary, "unexpected type '#{type.inspect}' for field '#{key}'"
70 | end
71 | end
72 | end
73 | mapped
74 | end
75 |
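# adds a lowercased shadow column (prefixed with '_') for every field
# whose declared type is "name" or "autocomplete", e.g. (hypothetical):
#   lowercase_columns({ 'city' => 'St. Paul' }, city: 'name')
#   # => { '_city' => 'st. paul' }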
76 | def lowercase_columns(row, field_types = {})
77 | new_columns = {}
78 | row.each do |key, value|
79 | type = field_types[key.to_sym] || field_types[key.to_s]
80 | new_columns["_#{key}"] = value.downcase if type == "name" || type == "autocomplete"
81 | end
82 | new_columns
83 | end
84 |
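# sketch of the nest option shape this method expects (hypothetical values):
#   options[:nest] = { 'key' => 'latest', 'contents' => ['population'] }
# would turn { 'id' => 1, 'population' => 5 } into
#   { 'latest' => { 'population' => 5 }, 'id' => 1 }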
85 | def parse_nested(document, options)
86 | new_doc = {}
87 | nest_options = options[:nest]
88 | if nest_options
89 | key = nest_options['key']
90 | new_doc[key] = {}
91 | new_doc['id'] = document['id'] unless document['id'].nil?
92 | nest_options['contents'].each do |item_key|
93 | new_doc[key][item_key] = document[item_key]
94 | end
95 | end
96 | new_doc
97 | end
98 |
99 | def fix_field_type(type, value, key=nil)
100 | return value if value.nil?
101 |
102 | new_value = case type
103 | when "float"
104 | value.to_f
105 | when "integer"
106 | value.to_i
107 | when "lowercase_name"
108 | value.to_s.downcase
109 | when "boolean"
110 | parse_boolean(value)
111 | else # "string"
112 | value.to_s
113 | end
114 | new_value = value.to_f if key && key.to_s.include?("location")
115 | new_value
116 | end
117 |
118 | def parse_boolean(value)
119 | case value
120 | when "true"
121 | true
122 | when "false"
123 | false
124 | when 0, "0" # CSV values arrive as strings, so treat "0" as false too
125 | false
126 | else
127 | !!value
128 | end
129 | end
130 |
131 | # currently we just support 'or' operations on two columns
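# e.g. a dictionary entry like the following (hypothetical field and
# column names):
#   under_investigation:
#     calculate: FLAG1 or FLAG2
#     type: integer
# evaluates 'FLAG1 or FLAG2' against the row's values and stores the
# type-coerced result under 'under_investigation'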
132 | def calculate(field_name, row, dictionary)
133 | item = dictionary[field_name.to_s] || dictionary[field_name.to_sym]
134 | fail "calculate: field not found in dictionary #{field_name.inspect}" if item.nil?
135 | type = item['type'] || item[:type]
136 | expr = item['calculate'] || item[:calculate]
137 | fail ArgumentError, "expected to calculate #{field_name}" if expr.nil?
138 | e = Expression.find_or_create(expr)
139 | vars = {}
140 | e.variables.each do |name|
141 | vars[name] = fix_field_type(type, row[name.to_sym])
142 | end
143 | fix_field_type(type, e.evaluate(vars))
144 | end
145 |
146 | # row: a hash (keys may be strings or symbols)
147 | # new_fields: hash current_name : new_name
148 | # returns a hash (which may be a subset of row) where keys are new_name
149 | # with value of corresponding row[current_name]
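#
# sketch (hypothetical names):
#   map_field_names({ 'NAME' => 'Chicago', 'GEOID' => '1714000' },
#                   { NAME: 'city' })
#   # => { 'city' => 'Chicago' }  -- GEOID is dropped unless
#   # options[:columns] == 'all', which passes unmapped columns through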
150 | def map_field_names(row, new_fields, options = {})
151 | mapped = {}
152 | row.each do |key, value|
153 | fail ArgumentError, "column header missing for: #{value}" if key.nil?
154 | new_key = new_fields[key.to_sym] || new_fields[key.to_s]
155 | if new_key
156 | value = value.to_f if new_key.include? "location"
157 | mapped[new_key] = value
158 | elsif options[:columns] == 'all'
159 | mapped[key] = value
160 | end
161 | end
162 | mapped
163 | end
164 |
165 | # select top-level fields from a hash
166 | # if there are name types, also select _name
167 | # doc: hash with string keys
168 | # only_keys: array of keys
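#
# sketch:
#   select_only_fields({ 'name' => 'x', '_name' => 'x', 'id' => 9 }, ['name'])
#   # => { 'name' => 'x', '_name' => 'x' }  ('id' is filtered out)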
169 | def select_only_fields(doc, only_keys)
170 | doc.select do |key, _value|
171 | key = key.to_s
172 | # if key has _ prefix, select if key present without _
173 | key = key[1..-1] if key[0] == '_'
174 | only_keys.include?(key)
175 | end
176 | end
177 |
178 | end # class methods
179 | end # module DocumentBuilder
180 | end # module Index
181 | end # module DataMagic
182 |
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: https://rubygems.org/
3 | specs:
4 | actionview (4.2.3)
5 | activesupport (= 4.2.3)
6 | builder (~> 3.1)
7 | erubis (~> 2.7.0)
8 | rails-dom-testing (~> 1.0, >= 1.0.5)
9 | rails-html-sanitizer (~> 1.0, >= 1.0.2)
10 | activesupport (4.2.3)
11 | i18n (~> 0.7)
12 | json (~> 1.7, >= 1.7.7)
13 | minitest (~> 5.1)
14 | thread_safe (~> 0.3, >= 0.3.4)
15 | tzinfo (~> 1.1)
16 | addressable (2.3.8)
17 | autoparse (0.3.3)
18 | addressable (>= 2.3.1)
19 | extlib (>= 0.9.15)
20 | multi_json (>= 1.0.0)
21 | aws-sdk (2.1.11)
22 | aws-sdk-resources (= 2.1.11)
23 | aws-sdk-core (2.1.11)
24 | jmespath (~> 1.0)
25 | aws-sdk-resources (2.1.11)
26 | aws-sdk-core (= 2.1.11)
27 | axiom-types (0.1.1)
28 | descendants_tracker (~> 0.0.4)
29 | ice_nine (~> 0.11.0)
30 | thread_safe (~> 0.3, >= 0.3.1)
31 | blankslate (2.1.2.4)
32 | builder (3.2.2)
33 | byebug (5.0.0)
34 | columnize (= 0.9.0)
35 | cf-app-utils (0.4)
36 | coderay (1.1.0)
37 | coercible (1.0.0)
38 | descendants_tracker (~> 0.0.1)
39 | columnize (0.9.0)
40 | descendants_tracker (0.0.4)
41 | thread_safe (~> 0.3, >= 0.3.1)
42 | diff-lcs (1.2.5)
43 | dotenv (2.0.2)
44 | elasticsearch (1.0.12)
45 | elasticsearch-api (= 1.0.12)
46 | elasticsearch-transport (= 1.0.12)
47 | elasticsearch-api (1.0.12)
48 | multi_json
49 | elasticsearch-transport (1.0.12)
50 | faraday
51 | multi_json
52 | equalizer (0.0.11)
53 | erubis (2.7.0)
54 | excon (0.45.4)
55 | extlib (0.9.16)
56 | faraday (0.9.1)
57 | multipart-post (>= 1.2, < 3)
58 | google-api-client (0.8.2)
59 | activesupport (>= 3.2)
60 | addressable (~> 2.3)
61 | autoparse (~> 0.3)
62 | extlib (~> 0.9)
63 | faraday (~> 0.9)
64 | launchy (~> 2.4)
65 | multi_json (~> 1.10)
66 | retriable (~> 1.4)
67 | signet (~> 0.6)
68 | google_drive (1.0.1)
69 | google-api-client (>= 0.7.0)
70 | nokogiri (>= 1.4.4, != 1.5.2, != 1.5.1)
71 | oauth (>= 0.3.6)
72 | oauth2 (>= 0.5.0)
73 | hashie (3.4.2)
74 | http_router (0.11.2)
75 | rack (>= 1.0.0)
76 | url_mount (~> 0.2.1)
77 | i18n (0.7.0)
78 | ice_nine (0.11.1)
79 | jmespath (1.0.2)
80 | multi_json (~> 1.0)
81 | json (1.8.3)
82 | jwt (1.5.1)
83 | launchy (2.4.3)
84 | addressable (~> 2.3)
85 | liquid (3.0.3)
86 | liquify (0.2.7)
87 | liquid (>= 2.2.2)
88 | loofah (2.0.3)
89 | nokogiri (>= 1.5.9)
90 | mail (2.5.4)
91 | mime-types (~> 1.16)
92 | treetop (~> 1.4.8)
93 | method_source (0.8.2)
94 | mime-types (1.25.1)
95 | mini_portile2 (2.1.0)
96 | minitest (5.8.0)
97 | moneta (0.7.20)
98 | multi_json (1.11.2)
99 | multi_xml (0.5.5)
100 | multipart-post (2.0.0)
101 | newrelic_rpm (3.14.2.312)
102 | nokogiri (1.6.8)
103 | mini_portile2 (~> 2.1.0)
104 | pkg-config (~> 1.1.7)
105 | oauth (0.4.7)
106 | oauth2 (1.0.0)
107 | faraday (>= 0.8, < 0.10)
108 | jwt (~> 1.0)
109 | multi_json (~> 1.3)
110 | multi_xml (~> 0.5)
111 | rack (~> 1.2)
112 | oj (2.12.13)
113 | padrino (0.12.5)
114 | padrino-admin (= 0.12.5)
115 | padrino-cache (= 0.12.5)
116 | padrino-core (= 0.12.5)
117 | padrino-gen (= 0.12.5)
118 | padrino-helpers (= 0.12.5)
119 | padrino-mailer (= 0.12.5)
120 | padrino-support (= 0.12.5)
121 | padrino-admin (0.12.5)
122 | padrino-core (= 0.12.5)
123 | padrino-helpers (= 0.12.5)
124 | padrino-cache (0.12.5)
125 | moneta (~> 0.7.0)
126 | padrino-core (= 0.12.5)
127 | padrino-helpers (= 0.12.5)
128 | padrino-core (0.12.5)
129 | activesupport (>= 3.1)
130 | http_router (~> 0.11.0)
131 | padrino-support (= 0.12.5)
132 | rack (< 1.6.0)
133 | rack-protection (>= 1.5.0)
134 | sinatra (~> 1.4.2)
135 | thor (~> 0.18)
136 | padrino-gen (0.12.5)
137 | bundler (~> 1.0)
138 | padrino-core (= 0.12.5)
139 | padrino-helpers (0.12.5)
140 | i18n (~> 0.6, >= 0.6.7)
141 | padrino-support (= 0.12.5)
142 | tilt (~> 1.4.1)
143 | padrino-mailer (0.12.5)
144 | mail (~> 2.5.3)
145 | padrino-core (= 0.12.5)
146 | padrino-support (0.12.5)
147 | activesupport (>= 3.1)
148 | parallel (1.6.1)
149 | parslet (1.7.1)
150 | blankslate (>= 2.0, <= 4.0)
151 | pkg-config (1.1.7)
152 | polyglot (0.3.5)
153 | pry (0.10.1)
154 | coderay (~> 1.1.0)
155 | method_source (~> 0.8.1)
156 | slop (~> 3.4)
157 | pry-byebug (3.2.0)
158 | byebug (~> 5.0)
159 | pry (~> 0.10)
160 | puma (2.15.3)
161 | rack (1.5.5)
162 | rack-protection (1.5.3)
163 | rack
164 | rack-test (0.6.3)
165 | rack (>= 1.0)
166 | rails-deprecated_sanitizer (1.0.3)
167 | activesupport (>= 4.2.0.alpha)
168 | rails-dom-testing (1.0.6)
169 | activesupport (>= 4.2.0.beta, < 5.0)
170 | nokogiri (~> 1.6.0)
171 | rails-deprecated_sanitizer (>= 1.0.1)
172 | rails-html-sanitizer (1.0.3)
173 | loofah (~> 2.0)
174 | rake (10.4.2)
175 | retriable (1.4.1)
176 | rspec (3.3.0)
177 | rspec-core (~> 3.3.0)
178 | rspec-expectations (~> 3.3.0)
179 | rspec-mocks (~> 3.3.0)
180 | rspec-core (3.3.2)
181 | rspec-support (~> 3.3.0)
182 | rspec-expectations (3.3.1)
183 | diff-lcs (>= 1.2.0, < 2.0)
184 | rspec-support (~> 3.3.0)
185 | rspec-mocks (3.3.2)
186 | diff-lcs (>= 1.2.0, < 2.0)
187 | rspec-support (~> 3.3.0)
188 | rspec-support (3.3.0)
189 | ruby-prof (0.15.9)
190 | safe_yaml (1.0.4)
191 | sass (3.4.16)
192 | signet (0.6.1)
193 | addressable (~> 2.3)
194 | extlib (~> 0.9)
195 | faraday (~> 0.9)
196 | jwt (~> 1.5)
197 | multi_json (~> 1.10)
198 | sinatra (1.4.6)
199 | rack (~> 1.4)
200 | rack-protection (~> 1.4)
201 | tilt (>= 1.3, < 3)
202 | slop (3.6.0)
203 | stretchy (0.4.7)
204 | elasticsearch (~> 1.0)
205 | excon (~> 0.45)
206 | valid (~> 0.5)
207 | virtus (~> 1.0)
208 | thor (0.19.1)
209 | thread_safe (0.3.5)
210 | tilt (1.4.1)
211 | treetop (1.4.15)
212 | polyglot
213 | polyglot (>= 0.3.1)
214 | tzinfo (1.2.2)
215 | thread_safe (~> 0.1)
216 | url_mount (0.2.1)
217 | rack
218 | valid (0.5.0)
219 | virtus (1.0.5)
220 | axiom-types (~> 0.1)
221 | coercible (~> 1.0)
222 | descendants_tracker (~> 0.0, >= 0.0.3)
223 | equalizer (~> 0.0, >= 0.0.9)
224 |
225 | PLATFORMS
226 | ruby
227 |
228 | DEPENDENCIES
229 | actionview
230 | aws-sdk (~> 2)
231 | cf-app-utils
232 | dotenv
233 | elasticsearch
234 | erubis
235 | google_drive
236 | hashie
237 | liquid (= 3.0.3)
238 | liquify
239 | newrelic_rpm
240 | oj
241 | padrino (= 0.12.5)
242 | parallel
243 | parslet
244 | pry
245 | pry-byebug
246 | puma
247 | rack-test
248 | rake
249 | rspec
250 | rspec-mocks
251 | ruby-prof
252 | safe_yaml
253 | sass
254 | stretchy
255 |
256 | BUNDLED WITH
257 | 1.11.2
258 |
--------------------------------------------------------------------------------