├── .gitignore
├── README.md
├── sql
    └── copy.sql
└── Makefile


/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | get-oa
 2 | ------
 3 | 
 4 | Load [OpenAddresses](openaddresses.io) data into a Postgresql + PostGIS database.
 5 | 
 6 | Assuming that you've downloaded one or more OpenAddresses zipped datasets:
 7 | ```
 8 | export PGUSER=myuser PGHOST=myhost PGDATABASE=mydb
 9 | make init load FILES="path/to/a.zip path/to/another.zip"
10 | ```
11 | 
12 | Data will be loaded into a table (`oa.address`) partitioned by `region`. The table partition will have names like `oa.ca_address`.


--------------------------------------------------------------------------------
/sql/copy.sql:
--------------------------------------------------------------------------------
 1 | SET client_min_messages TO WARNING;
 2 | 
 3 | /**
 4 |  * This should run with the variables
 5 |  * :region (e.g. ny, ca)
 6 |  * :schema (e.g. oa)
 7 |  * :file (e.g. openaddr-collected-us_west.zip)
 8 |  * :zip (e.g. us/ca/city_of_lancaster.csv)
 9 |  */
10 | SELECT 'addresses_' || md5(now()::text) as table
11 | \gset
12 | 
13 | \set partition :region _address
14 | 
15 | CREATE TABLE :schema.:table (LIKE :schema.:partition EXCLUDING CONSTRAINTS EXCLUDING INDEXES);
16 | ALTER TABLE :schema.:table ALTER COLUMN hash DROP NOT NULL;
17 | 
18 | SELECT format(
19 |   '\copy %s.%s (lon, lat, number, street, unit, city, district, region, postcode, id, hash) FROM PROGRAM ''unzip -p %s %s'' CSV HEADER',
20 |   :'schema', :'table', :'zip', :'file'
21 |   ) as command
22 | \gset
23 | 
24 | :command
25 | 
26 | INSERT INTO :schema.:partition
27 |   (lon, lat, number, street, unit, city, district, postcode, id, hash, geom, region)
28 |   SELECT lon
29 |     , lat
30 |     , number
31 |     -- remove multiple spaces in streets
32 |     , regexp_replace(street, ' +', ' ') street
33 |     , unit
34 |     -- city file name if no city is given
35 |     , coalesce(city, upper(replace(replace(:'city', '_', ' '), 'city of ', ' '))) as city
36 |     , district
37 |     , postcode
38 |     , id
39 |     -- generate a hash if none is given
40 |     , coalesce(hash, substring(md5(concat(number, street, unit, city, district, postcode)), 1, 16)) as hash
41 |     -- generate a geometry
42 |     , ST_setsrid(ST_makepoint(lon, lat), 4326) as geom
43 |     -- use region from file path. The region in each line can differ, which breaks the partitioning.
44 |     , :'region' as region
45 |   FROM :schema.:table
46 |   ON CONFLICT (hash) DO NOTHING;
47 | 
48 | DROP TABLE :schema.:table;
49 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL = bash
 2 | psql = psql
 3 | schema = oa
 4 | FILES =
 5 | space = $() $()
 6 | 
 7 | .PHONY: default load load-% init clean
 8 | 
 9 | default: load
10 | 
11 | region = $(word 2,$(subst /,$(space),$(1)))
12 | city   = $(basename $(notdir $(1)))
13 | 
14 | .SECONDEXPANSION:
15 | 
16 | # Load a file inside the zip without unzipping the whole thing
17 | # command has the format load-COUNTRY/REGION/file.csv
18 | define load_recipe
19 | $(addprefix load-,$(1)): load-%.csv: $(2) | region-$$$$(call region,$$$$*)
20 | 	$(psql) -f sql/copy.sql \
21 | 	-v schema=$(schema) \
22 | 	-v file=$$*.csv \
23 | 	-v region=$$(subst region-,,$$|) \
24 | 	-v city=$$(call city,$$*) \
25 | 	-v zip=$$<
26 | endef
27 | 
28 | # get the CSV files in a single zip
29 | get_csv = $(shell unzip -qq -l $(1) *.csv | grep -ve'-summary.csv' | awk '{ print $$4 }')
30 | 
31 | # define a csv_FILE variable for each of the input files
32 | $(foreach f,$(FILES),$(eval csv_$f := $(call get_csv,$f)))
33 | 
34 | # define the load task for each file, dependent on the input zip
35 | $(foreach f,$(FILES),$(eval $(call load_recipe,$(csv_$f),$f)))
36 | 
37 | region-%:
38 | 	$(psql) -c "create table if not exists $(schema).$*_address partition of $(schema).address for values in ('$*')"
39 | 	$(psql) -c "create index if not exists $*_address_geom_idx ON $(schema).$*_address using gist (geom)"
40 | 	-$(psql) -c "alter table $(schema).$*_address add primary key (hash);"
41 | 
42 | # define the top-level load task
43 | load: $(foreach f,$(FILES),$(addprefix load-,$(csv_$f)))
44 | 
45 | init:
46 | 	$(psql) -c "create extension if not exists citext"
47 | 	$(psql) -c "create schema if not exists $(schema)"
48 | 	$(psql) -c "create table if not exists $(schema).address ( \
49 |       lon numeric, \
50 |       lat numeric, \
51 |       number text, \
52 |       street citext, \
53 |       unit text, \
54 |       city citext, \
55 |       district text, \
56 |       region citext, \
57 |       postcode text, \
58 |       id text, \
59 |       hash text, \
60 |       geom geometry(point,4326) \
61 |     ) partition by list (region)"
62 | 
63 | clean: ; $(psql) -c "drop table if exists $(schema).address;"
64 | 


--------------------------------------------------------------------------------