├── .gitignore
├── Dockerfile
├── Drakefile
├── LICENSE
├── README.md
├── postprocess.sh
├── state.txt
└── tiger_create.sql


/.gitignore:
--------------------------------------------------------------------------------
1 | .*
2 | !.gitignore
3 | *.log
4 | data
5 | generated
6 | psql
7 | default_profile
8 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:14.04
 2 | 
 3 | ENV SHELL="/bin/bash"
 4 | 
 5 | RUN apt-get update && \
 6 |     apt-get install -y default-jdk \
 7 |                        git \
 8 |                        postgresql \
 9 |                        unzip \
10 |                        wget
11 | 
12 | # for gdal
13 | RUN apt-get install -y software-properties-common && \
14 |     add-apt-repository ppa:ubuntugis/ppa && \
15 |     apt-get update && \
16 |     apt-get install -y gdal-bin
17 | 
18 | RUN wget -O /bin/drake https://raw.githubusercontent.com/Factual/drake/3659c116790f1796261d6d23373de8bba1b663be/bin/drake && \
19 |     chmod 755 /bin/drake
20 | 
21 | WORKDIR /
22 | RUN wget https://github.com/dssg/acs2pgsql/archive/master.zip && \
23 |     unzip master.zip
24 | COPY default_profile /acs2pgsql-master/
25 | 
26 | WORKDIR /acs2pgsql-master
27 | ENTRYPOINT ["/bin/drake", "--auto"]
28 | 


--------------------------------------------------------------------------------
/Drakefile:
--------------------------------------------------------------------------------
 1 | ; run input SQL file and touch output
 2 | psql()
 3 |     psql -v ON_ERROR_STOP=1 -f $INPUT && mkdir -p $(dirname $OUTPUT) && touch $OUTPUT
 4 | 
 5 | ; drake method to download and unzip a zip file
 6 | ; the URL of the zip the file should be in the URL variable
 7 | ; the first output is the name of the downloaded zip file
 8 | ; the second output is the name of a file in the unzipped archive
 9 | wget_unzip()
10 |     mkdir -p $(dirname $OUTPUT0)
11 |     wget --output-document="$OUTPUT0" "$URL"
12 |     unzip -o "$OUTPUT0" -d $(dirname $OUTPUT1)
13 | 
14 | shp2psql()
15 | 	ogr2ogr -f PostgreSQL PG:"host="$PGHOST" dbname="$PGDATABASE"" "$INPUT" -nlt MULTIPOLYGON25D -nln acs_tiger.census_tract_20"$y" && mkdir -p $(dirname $OUTPUT) && touch $OUTPUT
16 | 
17 | ; American Community Survey data gets imported into its own db
18 | %include default_profile
19 | 
20 | 
21 | ; the default location of sql success files
22 | SQL_DIR:=psql/
23 | 
24 | data/census-postgres/ <- [-timecheck]
25 |      git clone https://github.com/censusreporter/census-postgres.git $OUTPUT
26 | 
27 | $(for y in {2009..2016}; do
28 | 
29 | # download and unzip the acs tract and block group data
30 | echo "URL=\"http://www2.census.gov/programs-surveys/acs/summary_file/"$y"/data/5_year_by_state/$[STATE]_Tracts_Block_Groups_Only.zip\""
31 | echo "data/acs/acs"$y"_5yr.zip, data/acs/acs"$y"_5yr/g"$y"5$[STATE_ABBREV].txt <- [-timecheck method:wget_unzip]"
32 | 
33 | # run our postprocess.sh script to generate import scripts
34 | echo "generated/acs"$y"_5yr/import.sql <- postprocess.sh, data/census-postgres/"
35 | echo "    \$INPUT0 \$INPUT1/acs"$y"_5yr/ data/acs/acs"$y"_5yr/ generated/acs"$y"_5yr/ $[STATE_ABBREV]"
36 | 
37 | # run import scripts
38 | echo "$[SQL_DIR]/acs"$y"_5yr <- generated/acs"$y"_5yr/import.sql, data/acs/acs"$y"_5yr/g"$y"5$[STATE_ABBREV].txt [method:psql]"
39 | 
40 | done)
41 | 
42 | $[SQL_DIR]/tl_2010_tract_schema <- tiger_create.sql [method:psql]
43 | 
44 | $(
45 | FIPS=`grep -i "|$[STATE_ABBREV]|" state.txt | cut -c 1-2`
46 | for y in 00 10; do
47 | echo "y="$y""
48 | 
49 | echo "URL=\"http://www2.census.gov/geo/pvs/tiger2010st/"$FIPS"_"$[STATE]"/"$FIPS"/tl_2010_"$FIPS"_tract"$y".zip\""
50 | echo "data/tl_2010_"$FIPS"_tract"$y".zip, data/tl_2010_"$FIPS"_tract"$y"/tl_2010_"$FIPS"_tract"$y".shp <- [-timecheck method:wget_unzip]"
51 | 
52 | echo "$[SQL_DIR]/acs_tiger_"$FIPS"_tract"$y" <- data/tl_2010_"$FIPS"_tract"$y"/tl_2010_"$FIPS"_tract"$y".shp [method:shp2psql]"
53 | 
54 | done)
55 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BY DOWNLOADING acs2pgsql PROGRAM YOU AGREE TO THE FOLLOWING TERMS OF USE:
 2 | 
 3 | Copyright ©2018.  The University of Chicago (“Chicago”). All Rights Reserved.  
 4 | 
 5 | Permission to use, copy, modify, and distribute this software, including all object code and source code, and any accompanying documentation (together the “Program”) for educational and not-for-profit research purposes, without fee and without a signed licensing agreement, is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies, modifications, and distributions. For the avoidance of doubt, educational and not-for-profit research purposes excludes any service or part of selling a service that uses the Program. To obtain a commercial license for the Program, contact the Technology Commercialization and Licensing, Polsky Center for Entrepreneurship and Innovation, University of Chicago, 1452 East 53rd Street, 2nd floor, Chicago, IL 60615.
 6 | 
 7 | Created by Data Science and Public Policy, University of Chicago
 8 | 
 9 | The Program is copyrighted by Chicago. The Program is supplied "as is", without any accompanying services from Chicago. Chicago does not warrant that the operation of the Program will be uninterrupted or error-free. The end-user understands that the Program was developed for research purposes and is advised not to rely exclusively on the Program for any reason.
10 | 
11 | IN NO EVENT SHALL CHICAGO BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THE PROGRAM, EVEN IF CHICAGO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. CHICAGO SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE PROGRAM PROVIDED HEREUNDER IS PROVIDED "AS IS". CHICAGO HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # acs2pgsql
 2 | This repository imports American Community Survey (ACS) data into PostgreSQL. The hard work has already been done by [Census Reporter](https://github.com/censusreporter/census-postgres). We add the following features:
 3 | 
 4 | - Make the state (e.g. Illinois, California) a variable
 5 | - Fix [issues](https://github.com/censusreporter/census-postgres/issues?q=is%3Aissue%20is%3Aopen%202009) with the 2009 ACS 5-year data
 6 | - Run the workflow using [docker](https://docs.docker.com/) or [drake](https://github.com/factual/drake)
 7 | 
 8 | ## Usage
 9 | 1. Clone the repository and create the file `acs2pgsql/default_profile` by specifying PostgreSQL credentials and the desired U.S. State in the following template:
10 |  ```
11 |  PGHOST=
12 |  PGDATABASE=
13 |  PGUSER=
14 |  PGPASSWORD=
15 | 
16 |  STATE=Illinois
17 |  STATE_ABBREV=il
18 |  ```
19 | 
20 |  If the state name has multiple words, remove the space: e.g. for North Carolina use `STATE=NorthCarolina`.
21 | 
22 | 2. Change to the `acs2pgsql` directory. To use docker, run `docker build --rm -t "image_tag" .`. Docker will create an image and then extract, transform, and load the ACS data into your PostgreSQL database. To use drake, run `drake`.
23 | 
24 | 3. Consult the [census-postgres](https://github.com/censusreporter/census-postgres) documentation for full technical details. See also the census documentation for each dataset, especially the:
25 |  - Appendix (e.g. [2015](http://www2.census.gov/programs-surveys/acs/summary_file/2015/documentation/tech_docs/ACS_2015_SF_5YR_Appendices.xls)) for information about census tables, and the
26 |  - Sequence Number and Table Number Lookup (e.g. [2015](http://www2.census.gov/programs-surveys/acs/summary_file/2015/documentation/user_tools/ACS_5yr_Seq_Table_Number_Lookup.txt)) for information about variables.
27 | 
28 | ## TODO
29 |  - Expand to Decennial and ACS 1yr and 3yr
30 |  - Rename folders per state
31 | 


--------------------------------------------------------------------------------
/postprocess.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | src="$1"
 3 | datadir="$2"
 4 | out="$3"
 5 | state="$4"
 6 | 
 7 | schema=$(basename $out)
 8 | 
 9 | mkdir -p $out
10 | cp -r $src/* $out
11 | 
12 | edir=$(echo $datadir/ | sed 's/\//\\\//g')
13 | grep 'group1/[em][0-9]*5'$state "$out"/import_sequences.sql | sed "s/'.*\/group1\//'"$edir"/g;s/COPY/\\\COPY/g" > "$out"/import_sequences_"$state".sql
14 | 
15 | grep 'g[0-9]*5'$state'.txt' $out/import_geoheader.sql | head -n1 | sed "s/'.*\/\(g[0-9]*5"$state".txt\)/'"$edir\\1"/g;s/COPY/\\\COPY/g" > "$out"/import_geoheader_"$state".sql
16 | 
17 | echo "
18 | drop schema if exists $schema cascade;
19 | create schema $schema;
20 | SET search_path = $schema, public;
21 | 
22 | \i $out/create_tmp_geoheader.sql
23 | \i $out/create_import_tables.sql
24 | \i $out/create_tmp_geoheader.sql
25 | 
26 | \i $out/import_sequences_$state.sql
27 | \i $out/import_geoheader_$state.sql
28 | 
29 | \i $out/create_geoheader.sql
30 | \i $out/parse_tmp_geoheader.sql
31 | \i $out/store_by_tables.sql
32 | \i $out/insert_into_tables.sql
33 | \i $out/view_stored_by_tables.sql
34 | 
35 | \i $out/drop_import_tables.sql
36 | " > $out/import.sql
37 | 
38 | if [[ "$schema" == 'acs2009_5yr' ]]; then
39 | 
40 | # insert geoids for all tables after first one
41 | sed -i.bak '9,$ s/logrecno int,/logrecno int, geoid varchar(40),/g' $out/store_by_tables.sql 
42 | 
43 | # use alias s for tmp_seq tables
44 | sed -i.bak "s/\(FROM acs2009_5yr.tmp_seq[0-9]*\(_moe\)\?\)/\\1 s/g" $out/insert_into_tables.sql 
45 | 
46 | # use those aliases
47 | sed -i.bak "s/\(fileid\|filetype\|stusab\|chariter\|seq,\|logrecno\)/s.\\1/g" $out/insert_into_tables.sql
48 | sed -i.bak "s/\(NULLIF(NULLIF(\)/\\1s./g" $out/insert_into_tables.sql
49 | sed -i.bak "s/JOIN .*/JOIN acs2009_5yr.geoheader g ON (lower(s.stusab)=lower(g.stusab) AND s.logrecno=g.logrecno);/g" $out/insert_into_tables.sql
50 | 
51 | # use schema for tables
52 | sed -i.bak 's/tmp_seq/acs2009_5yr.tmp_seq/g' $out/import_sequences.sql 
53 | 
54 | # missing character encoding
55 | sed -i.bak 's/;/ WITH ENCODING '\'latin1\'';/g' $out/import_geoheader_$state.sql
56 | 
57 | # join using geoid
58 | sed -i.bak 's/USING (stusab, logrecno)/USING (geoid)/g' $out/view_stored_by_tables.sql
59 | 
60 | # add missing joins for tables that span multiple sequences
61 | gen()
62 | {
63 |     old='FROM acs2009_5yr.'${seqs[0]}';'
64 |     new="FROM "$(echo ${seqs[@]} | sed "s/\([a-z0-9_\.]*\)/ JOIN \1 USING (geoid) /2g")";"
65 |     sed -i.bak "s/$old/$new/" $out/view_stored_by_tables.sql
66 | 
67 |     old='FROM acs2009_5yr.'${seqs[0]}' JOIN '${seqs[0]}'_moe USING (geoid);'
68 |     new="FROM "$(echo ${seqs[@]} | sed "s/\([a-z0-9_\.]*\)/ JOIN \1 USING (geoid) /2g")
69 |     new_moe=" "$(echo ${seqs[@]} | sed "s/\([a-z0-9_\.]*\)/ JOIN \1_moe USING (geoid) /g")";"
70 |     sed -i.bak "s/$old/$new$new_moe/" $out/view_stored_by_tables.sql
71 | }
72 | 
73 | seqs=('seq0080' 'seq0079' 'seq0078')
74 | gen
75 | 
76 | seqs=('seq0083' 'seq0082' 'seq0081')
77 | gen
78 | 
79 | seqs=('seq0086' 'seq0085' 'seq0084')
80 | gen
81 | 
82 | seqs=('seq0089' 'seq0088' 'seq0087')
83 | gen
84 | 
85 | seqs=('seq0092' 'seq0091' 'seq0090')
86 | gen
87 | 
88 | seqs=('seq0095' 'seq0094' 'seq0093')
89 | gen
90 | 
91 | # delete duplicate table views
92 | sed -i.bak '46523,$d' $out/view_stored_by_tables.sql
93 | 
94 | fi
95 | 


--------------------------------------------------------------------------------
/state.txt:
--------------------------------------------------------------------------------
 1 | STATE|STUSAB|STATE_NAME|STATENS
 2 | 01|AL|Alabama|01779775
 3 | 02|AK|Alaska|01785533
 4 | 04|AZ|Arizona|01779777
 5 | 05|AR|Arkansas|00068085
 6 | 06|CA|California|01779778
 7 | 08|CO|Colorado|01779779
 8 | 09|CT|Connecticut|01779780
 9 | 10|DE|Delaware|01779781
10 | 11|DC|District of Columbia|01702382
11 | 12|FL|Florida|00294478
12 | 13|GA|Georgia|01705317
13 | 15|HI|Hawaii|01779782
14 | 16|ID|Idaho|01779783
15 | 17|IL|Illinois|01779784
16 | 18|IN|Indiana|00448508
17 | 19|IA|Iowa|01779785
18 | 20|KS|Kansas|00481813
19 | 21|KY|Kentucky|01779786
20 | 22|LA|Louisiana|01629543
21 | 23|ME|Maine|01779787
22 | 24|MD|Maryland|01714934
23 | 25|MA|Massachusetts|00606926
24 | 26|MI|Michigan|01779789
25 | 27|MN|Minnesota|00662849
26 | 28|MS|Mississippi|01779790
27 | 29|MO|Missouri|01779791
28 | 30|MT|Montana|00767982
29 | 31|NE|Nebraska|01779792
30 | 32|NV|Nevada|01779793
31 | 33|NH|New Hampshire|01779794
32 | 34|NJ|New Jersey|01779795
33 | 35|NM|New Mexico|00897535
34 | 36|NY|New York|01779796
35 | 37|NC|North Carolina|01027616
36 | 38|ND|North Dakota|01779797
37 | 39|OH|Ohio|01085497
38 | 40|OK|Oklahoma|01102857
39 | 41|OR|Oregon|01155107
40 | 42|PA|Pennsylvania|01779798
41 | 44|RI|Rhode Island|01219835
42 | 45|SC|South Carolina|01779799
43 | 46|SD|South Dakota|01785534
44 | 47|TN|Tennessee|01325873
45 | 48|TX|Texas|01779801
46 | 49|UT|Utah|01455989
47 | 50|VT|Vermont|01779802
48 | 51|VA|Virginia|01779803
49 | 53|WA|Washington|01779804
50 | 54|WV|West Virginia|01779805
51 | 55|WI|Wisconsin|01779806
52 | 56|WY|Wyoming|01779807
53 | 60|AS|American Samoa|01802701
54 | 66|GU|Guam|01802705
55 | 69|MP|Northern Mariana Islands|01779809
56 | 72|PR|Puerto Rico|01779808
57 | 74|UM|U.S. Minor Outlying Islands|01878752
58 | 78|VI|U.S. Virgin Islands|01802710
59 | 


--------------------------------------------------------------------------------
/tiger_create.sql:
--------------------------------------------------------------------------------
1 | CREATE SCHEMA IF NOT EXISTS acs_tiger;
2 | DROP TABLE IF EXISTS acs_tiger.census_tract_2010;
3 | DROP TABLE IF EXISTS acs_tiger.census_tract_2000;
4 | 


--------------------------------------------------------------------------------