├── .gitignore
├── Dockerfile
├── Drakefile
├── LICENSE
├── README.md
├── postprocess.sh
├── state.txt
└── tiger_create.sql

/.gitignore:
--------------------------------------------------------------------------------
.*
!.gitignore
*.log
data
generated
psql
default_profile

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:14.04

ENV SHELL="/bin/bash"

RUN apt-get update && \
    apt-get install -y default-jdk \
                       git \
                       postgresql \
                       unzip \
                       wget

# for gdal
RUN apt-get install -y software-properties-common && \
    add-apt-repository ppa:ubuntugis/ppa && \
    apt-get update && \
    apt-get install -y gdal-bin

# install a pinned version of drake
RUN wget -O /bin/drake https://raw.githubusercontent.com/Factual/drake/3659c116790f1796261d6d23373de8bba1b663be/bin/drake && \
    chmod 755 /bin/drake

# fetch this repository and copy in the user's profile
WORKDIR /
RUN wget https://github.com/dssg/acs2pgsql/archive/master.zip && \
    unzip master.zip
COPY default_profile /acs2pgsql-master/

WORKDIR /acs2pgsql-master
ENTRYPOINT ["/bin/drake", "--auto"]

--------------------------------------------------------------------------------
/Drakefile:
--------------------------------------------------------------------------------
; run input SQL file and touch output
psql()
    psql -v ON_ERROR_STOP=1 -f $INPUT && mkdir -p $(dirname $OUTPUT) && touch $OUTPUT

; drake method to download and unzip a zip file
; the URL of the zip file should be in the URL variable
; the first output is the name of the downloaded zip file
; the second output is the name of a file in the unzipped archive
wget_unzip()
    mkdir -p $(dirname $OUTPUT0)
    wget --output-document="$OUTPUT0" "$URL"
    unzip -o "$OUTPUT0" -d $(dirname $OUTPUT1)

; load a census tract shapefile into the acs_tiger schema and touch output
shp2psql()
    ogr2ogr -f PostgreSQL PG:"host="$PGHOST" dbname="$PGDATABASE"" "$INPUT" -nlt MULTIPOLYGON25D -nln acs_tiger.census_tract_20"$y" && mkdir -p $(dirname $OUTPUT) && touch $OUTPUT

; American Community Survey data gets imported into its own db;
; connection settings and the target state come from default_profile
%include default_profile

; the default location of sql success files
SQL_DIR:=psql/

data/census-postgres/ <- [-timecheck]
    git clone https://github.com/censusreporter/census-postgres.git $OUTPUT

$(for y in {2009..2016}; do

# download and unzip the acs tract and block group data
echo "URL=\"http://www2.census.gov/programs-surveys/acs/summary_file/"$y"/data/5_year_by_state/$[STATE]_Tracts_Block_Groups_Only.zip\""
echo "data/acs/acs"$y"_5yr.zip, data/acs/acs"$y"_5yr/g"$y"5$[STATE_ABBREV].txt <- [-timecheck method:wget_unzip]"

# run our postprocess.sh script to generate import scripts
echo "generated/acs"$y"_5yr/import.sql <- postprocess.sh, data/census-postgres/"
echo " \$INPUT0 \$INPUT1/acs"$y"_5yr/ data/acs/acs"$y"_5yr/ generated/acs"$y"_5yr/ $[STATE_ABBREV]"

# run import scripts
echo "$[SQL_DIR]/acs"$y"_5yr <- generated/acs"$y"_5yr/import.sql, data/acs/acs"$y"_5yr/g"$y"5$[STATE_ABBREV].txt [method:psql]"

done)

$[SQL_DIR]/tl_2010_tract_schema <- tiger_create.sql [method:psql]

$(
FIPS=`grep -i "|$[STATE_ABBREV]|" state.txt | cut -c 1-2`
for y in 00 10; do
echo "y="$y""

echo "URL=\"http://www2.census.gov/geo/pvs/tiger2010st/"$FIPS"_"$[STATE]"/"$FIPS"/tl_2010_"$FIPS"_tract"$y".zip\""
echo "data/tl_2010_"$FIPS"_tract"$y".zip, data/tl_2010_"$FIPS"_tract"$y"/tl_2010_"$FIPS"_tract"$y".shp <- [-timecheck method:wget_unzip]"

echo "$[SQL_DIR]/acs_tiger_"$FIPS"_tract"$y" <- data/tl_2010_"$FIPS"_tract"$y"/tl_2010_"$FIPS"_tract"$y".shp [method:shp2psql]"

done)
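
; For reference, one pass of the first generated loop above (assuming
; STATE=Illinois and STATE_ABBREV=il in default_profile, and y=2009)
; expands to roughly these concrete steps:
;
;   URL="http://www2.census.gov/programs-surveys/acs/summary_file/2009/data/5_year_by_state/Illinois_Tracts_Block_Groups_Only.zip"
;   data/acs/acs2009_5yr.zip, data/acs/acs2009_5yr/g20095il.txt <- [-timecheck method:wget_unzip]
;
;   generated/acs2009_5yr/import.sql <- postprocess.sh, data/census-postgres/
;       $INPUT0 $INPUT1/acs2009_5yr/ data/acs/acs2009_5yr/ generated/acs2009_5yr/ il
;
;   psql/acs2009_5yr <- generated/acs2009_5yr/import.sql, data/acs/acs2009_5yr/g20095il.txt [method:psql]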
"URL=\"http://www2.census.gov/geo/pvs/tiger2010st/"$FIPS"_"$[STATE]"/"$FIPS"/tl_2010_"$FIPS"_tract"$y".zip\"" 50 | echo "data/tl_2010_"$FIPS"_tract"$y".zip, data/tl_2010_"$FIPS"_tract"$y"/tl_2010_"$FIPS"_tract"$y".shp <- [-timecheck method:wget_unzip]" 51 | 52 | echo "$[SQL_DIR]/acs_tiger_"$FIPS"_tract"$y" <- data/tl_2010_"$FIPS"_tract"$y"/tl_2010_"$FIPS"_tract"$y".shp [method:shp2psql]" 53 | 54 | done) 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BY DOWNLOADING acs2pgsql PROGRAM YOU AGREE TO THE FOLLOWING TERMS OF USE: 2 | 3 | Copyright ©2018. The University of Chicago (“Chicago”). All Rights Reserved. 4 | 5 | Permission to use, copy, modify, and distribute this software, including all object code and source code, and any accompanying documentation (together the “Program”) for educational and not-for-profit research purposes, without fee and without a signed licensing agreement, is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies, modifications, and distributions. For the avoidance of doubt, educational and not-for-profit research purposes excludes any service or part of selling a service that uses the Program. To obtain a commercial license for the Program, contact the Technology Commercialization and Licensing, Polsky Center for Entrepreneurship and Innovation, University of Chicago, 1452 East 53rd Street, 2nd floor, Chicago, IL 60615. 6 | 7 | Created by Data Science and Public Policy, University of Chicago 8 | 9 | The Program is copyrighted by Chicago. The Program is supplied "as is", without any accompanying services from Chicago. Chicago does not warrant that the operation of the Program will be uninterrupted or error-free. The end-user understands that the Program was developed for research purposes and is advised not to rely exclusively on the Program for any reason. 10 | 11 | IN NO EVENT SHALL CHICAGO BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THE PROGRAM, EVEN IF CHICAGO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. CHICAGO SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE PROGRAM PROVIDED HEREUNDER IS PROVIDED "AS IS". CHICAGO HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # acs2pgsql 2 | This repository imports American Community Survey (ACS) data into PostgreSQL. The hard work has already been done by [Census Reporter](https://github.com/censusreporter/census-postgres). We add the following features: 3 | 4 | - Make the state (e.g. Illinois, California) a variable 5 | - Fix [issues](https://github.com/censusreporter/census-postgres/issues?q=is%3Aissue%20is%3Aopen%202009) with the 2009 ACS 5-year data 6 | - Run the workflow using [docker](https://docs.docker.com/) or [drake](https://github.com/factual/drake) 7 | 8 | ## Usage 9 | 1. Clone the repository and create the file `acs2pgsql/default_profile` by specifying PostgreSQL credentials and the desired U.S. 

## TODO
- Expand to Decennial and ACS 1yr and 3yr
- Rename folders per state

--------------------------------------------------------------------------------
/postprocess.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# generate a state-specific import.sql from the census-postgres scripts
src="$1"      # census-postgres scripts for one dataset, e.g. data/census-postgres/acs2009_5yr/
datadir="$2"  # directory containing the downloaded ACS data files
out="$3"      # output directory for the generated scripts
state="$4"    # two-letter state abbreviation, e.g. il

schema=$(basename $out)

mkdir -p $out
cp -r $src/* $out

# escape the slashes in the data directory path for use inside sed
edir=$(echo $datadir/ | sed 's/\//\\\//g')

# point the sequence COPY commands at our data directory and use client-side \copy
grep 'group1/[em][0-9]*5'$state "$out"/import_sequences.sql | sed "s/'.*\/group1\//'"$edir"/g;s/COPY/\\\COPY/g" > "$out"/import_sequences_"$state".sql

# same for the geoheader COPY command
grep 'g[0-9]*5'$state'.txt' $out/import_geoheader.sql | head -n1 | sed "s/'.*\/\(g[0-9]*5"$state".txt\)/'"$edir\\1"/g;s/COPY/\\\COPY/g" > "$out"/import_geoheader_"$state".sql

echo "
drop schema if exists $schema cascade;
create schema $schema;
SET search_path = $schema, public;

\i $out/create_tmp_geoheader.sql
\i $out/create_import_tables.sql

\i $out/import_sequences_$state.sql
\i $out/import_geoheader_$state.sql

\i $out/create_geoheader.sql
\i $out/parse_tmp_geoheader.sql
\i $out/store_by_tables.sql
\i $out/insert_into_tables.sql
\i $out/view_stored_by_tables.sql

\i $out/drop_import_tables.sql
" > $out/import.sql

if [[ "$schema" == 'acs2009_5yr' ]]; then

    # insert geoids for all tables after first one
    sed -i.bak '9,$ s/logrecno int,/logrecno int, geoid varchar(40),/g' $out/store_by_tables.sql

    # use alias s for tmp_seq tables
    sed -i.bak "s/\(FROM acs2009_5yr.tmp_seq[0-9]*\(_moe\)\?\)/\\1 s/g" $out/insert_into_tables.sql

    # use those aliases
    sed -i.bak "s/\(fileid\|filetype\|stusab\|chariter\|seq,\|logrecno\)/s.\\1/g" $out/insert_into_tables.sql
    sed -i.bak "s/\(NULLIF(NULLIF(\)/\\1s./g" $out/insert_into_tables.sql
    sed -i.bak "s/JOIN .*/JOIN acs2009_5yr.geoheader g ON (lower(s.stusab)=lower(g.stusab) AND s.logrecno=g.logrecno);/g" $out/insert_into_tables.sql

    # use schema for tables
    sed -i.bak 's/tmp_seq/acs2009_5yr.tmp_seq/g' $out/import_sequences.sql

    # missing character encoding
    sed -i.bak 's/;/ WITH ENCODING '\'latin1\'';/g' $out/import_geoheader_$state.sql

    # join using geoid
    sed -i.bak 's/USING (stusab, logrecno)/USING (geoid)/g' $out/view_stored_by_tables.sql

    # add missing joins for tables that span multiple sequences
    gen()
    {
        old='FROM acs2009_5yr.'${seqs[0]}';'
        new="FROM "$(echo ${seqs[@]} | sed "s/\([a-z0-9_\.]*\)/ JOIN \1 USING (geoid) /2g")";"
        sed -i.bak "s/$old/$new/" $out/view_stored_by_tables.sql

        old='FROM acs2009_5yr.'${seqs[0]}' JOIN '${seqs[0]}'_moe USING (geoid);'
        new="FROM "$(echo ${seqs[@]} | sed "s/\([a-z0-9_\.]*\)/ JOIN \1 USING (geoid) /2g")
        new_moe=" "$(echo ${seqs[@]} | sed "s/\([a-z0-9_\.]*\)/ JOIN \1_moe USING (geoid) /g")";"
        sed -i.bak "s/$old/$new$new_moe/" $out/view_stored_by_tables.sql
    }

    seqs=('seq0080' 'seq0079' 'seq0078')
    gen

    seqs=('seq0083' 'seq0082' 'seq0081')
    gen

    seqs=('seq0086' 'seq0085' 'seq0084')
    gen

    seqs=('seq0089' 'seq0088' 'seq0087')
    gen

    seqs=('seq0092' 'seq0091' 'seq0090')
    gen

    seqs=('seq0095' 'seq0094' 'seq0093')
    gen

    # delete duplicate table views
    sed -i.bak '46523,$d' $out/view_stored_by_tables.sql

fi
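
# As wired up in the Drakefile, this script runs once per dataset; for
# example (assuming the Illinois profile from the README), the 2009 call is:
#
#   ./postprocess.sh data/census-postgres/acs2009_5yr/ data/acs/acs2009_5yr/ generated/acs2009_5yr/ il
#
# which copies the census-postgres scripts into generated/acs2009_5yr/,
# rewrites their COPY paths, and leaves a self-contained import.sql for the
# Drakefile's psql method to run.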
--------------------------------------------------------------------------------
/state.txt:
--------------------------------------------------------------------------------
STATE|STUSAB|STATE_NAME|STATENS
01|AL|Alabama|01779775
02|AK|Alaska|01785533
04|AZ|Arizona|01779777
05|AR|Arkansas|00068085
06|CA|California|01779778
08|CO|Colorado|01779779
09|CT|Connecticut|01779780
10|DE|Delaware|01779781
11|DC|District of Columbia|01702382
12|FL|Florida|00294478
13|GA|Georgia|01705317
15|HI|Hawaii|01779782
16|ID|Idaho|01779783
17|IL|Illinois|01779784
18|IN|Indiana|00448508
19|IA|Iowa|01779785
20|KS|Kansas|00481813
21|KY|Kentucky|01779786
22|LA|Louisiana|01629543
23|ME|Maine|01779787
24|MD|Maryland|01714934
25|MA|Massachusetts|00606926
26|MI|Michigan|01779789
27|MN|Minnesota|00662849
28|MS|Mississippi|01779790
29|MO|Missouri|01779791
30|MT|Montana|00767982
31|NE|Nebraska|01779792
32|NV|Nevada|01779793
33|NH|New Hampshire|01779794
34|NJ|New Jersey|01779795
35|NM|New Mexico|00897535
36|NY|New York|01779796
37|NC|North Carolina|01027616
38|ND|North Dakota|01779797
39|OH|Ohio|01085497
40|OK|Oklahoma|01102857
41|OR|Oregon|01155107
42|PA|Pennsylvania|01779798
44|RI|Rhode Island|01219835
45|SC|South Carolina|01779799
46|SD|South Dakota|01785534
47|TN|Tennessee|01325873
48|TX|Texas|01779801
49|UT|Utah|01455989
50|VT|Vermont|01779802
51|VA|Virginia|01779803
53|WA|Washington|01779804
54|WV|West Virginia|01779805
55|WI|Wisconsin|01779806
56|WY|Wyoming|01779807
60|AS|American Samoa|01802701
66|GU|Guam|01802705
69|MP|Northern Mariana Islands|01779809
72|PR|Puerto Rico|01779808
74|UM|U.S. Minor Outlying Islands|01878752
78|VI|U.S. Virgin Islands|01802710

--------------------------------------------------------------------------------
/tiger_create.sql:
--------------------------------------------------------------------------------
CREATE SCHEMA IF NOT EXISTS acs_tiger;
DROP TABLE IF EXISTS acs_tiger.census_tract_2010;
DROP TABLE IF EXISTS acs_tiger.census_tract_2000;
--------------------------------------------------------------------------------
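
-- After a successful run, each dataset lands in its own schema (acs2009_5yr
-- through acs2016_5yr, plus acs_tiger for the tract shapefiles loaded by
-- ogr2ogr). A quick sanity check, assuming the Illinois profile above and the
-- geoheader table that the census-postgres scripts create:
SELECT count(*) FROM acs_tiger.census_tract_2010;
SET search_path = acs2015_5yr, public;
SELECT * FROM geoheader LIMIT 5;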