├── .gitignore ├── README.md ├── sql └── copy.sql └── Makefile /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | get-oa 2 | ------ 3 | 4 | Load [OpenAddresses](openaddresses.io) data into a Postgresql + PostGIS database. 5 | 6 | Assuming that you've downloaded one or more OpenAddresses zipped datasets: 7 | ``` 8 | export PGUSER=myuser PGHOST=myhost PGDATABASE=mydb 9 | make init load FILES="path/to/a.zip path/to/another.zip" 10 | ``` 11 | 12 | Data will be loaded into a table (`oa.address`) partitioned by `region`. The table partition will have names like `oa.ca_address`. -------------------------------------------------------------------------------- /sql/copy.sql: -------------------------------------------------------------------------------- 1 | SET client_min_messages TO WARNING; 2 | 3 | /** 4 | * This should run with the variables 5 | * :region (e.g. ny, ca) 6 | * :schema (e.g. oa) 7 | * :file (e.g. openaddr-collected-us_west.zip) 8 | * :zip (e.g. us/ca/city_of_lancaster.csv) 9 | */ 10 | SELECT 'addresses_' || md5(now()::text) as table 11 | \gset 12 | 13 | \set partition :region _address 14 | 15 | CREATE TABLE :schema.:table (LIKE :schema.:partition EXCLUDING CONSTRAINTS EXCLUDING INDEXES); 16 | ALTER TABLE :schema.:table ALTER COLUMN hash DROP NOT NULL; 17 | 18 | SELECT format( 19 | '\copy %s.%s (lon, lat, number, street, unit, city, district, region, postcode, id, hash) FROM PROGRAM ''unzip -p %s %s'' CSV HEADER', 20 | :'schema', :'table', :'zip', :'file' 21 | ) as command 22 | \gset 23 | 24 | :command 25 | 26 | INSERT INTO :schema.:partition 27 | (lon, lat, number, street, unit, city, district, postcode, id, hash, geom, region) 28 | SELECT lon 29 | , lat 30 | , number 31 | -- remove multiple spaces in streets 32 | , regexp_replace(street, ' +', ' ') street 33 | , unit 34 | -- city file name if no city is given 35 | , coalesce(city, upper(replace(replace(:'city', '_', ' '), 'city of ', ' '))) as city 36 | , district 37 | , postcode 38 | , id 39 | -- generate a hash if none is given 40 | , coalesce(hash, substring(md5(concat(number, street, unit, city, district, postcode)), 1, 16)) as hash 41 | -- generate a geometry 42 | , ST_setsrid(ST_makepoint(lon, lat), 4326) as geom 43 | -- use region from file path. The region in each line can differ, which breaks the partitioning. 44 | , :'region' as region 45 | FROM :schema.:table 46 | ON CONFLICT (hash) DO NOTHING; 47 | 48 | DROP TABLE :schema.:table; 49 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL = bash 2 | psql = psql 3 | schema = oa 4 | FILES = 5 | space = $() $() 6 | 7 | .PHONY: default load load-% init clean 8 | 9 | default: load 10 | 11 | region = $(word 2,$(subst /,$(space),$(1))) 12 | city = $(basename $(notdir $(1))) 13 | 14 | .SECONDEXPANSION: 15 | 16 | # Load a file inside the zip without unzipping the whole thing 17 | # command has the format load-COUNTRY/REGION/file.csv 18 | define load_recipe 19 | $(addprefix load-,$(1)): load-%.csv: $(2) | region-$$$$(call region,$$$$*) 20 | $(psql) -f sql/copy.sql \ 21 | -v schema=$(schema) \ 22 | -v file=$$*.csv \ 23 | -v region=$$(subst region-,,$$|) \ 24 | -v city=$$(call city,$$*) \ 25 | -v zip=$$< 26 | endef 27 | 28 | # get the CSV files in a single zip 29 | get_csv = $(shell unzip -qq -l $(1) *.csv | grep -ve'-summary.csv' | awk '{ print $$4 }') 30 | 31 | # define a csv_FILE variable for each of the input files 32 | $(foreach f,$(FILES),$(eval csv_$f := $(call get_csv,$f))) 33 | 34 | # define the load task for each file, dependent on the input zip 35 | $(foreach f,$(FILES),$(eval $(call load_recipe,$(csv_$f),$f))) 36 | 37 | region-%: 38 | $(psql) -c "create table if not exists $(schema).$*_address partition of $(schema).address for values in ('$*')" 39 | $(psql) -c "create index if not exists $*_address_geom_idx ON $(schema).$*_address using gist (geom)" 40 | -$(psql) -c "alter table $(schema).$*_address add primary key (hash);" 41 | 42 | # define the top-level load task 43 | load: $(foreach f,$(FILES),$(addprefix load-,$(csv_$f))) 44 | 45 | init: 46 | $(psql) -c "create extension if not exists citext" 47 | $(psql) -c "create schema if not exists $(schema)" 48 | $(psql) -c "create table if not exists $(schema).address ( \ 49 | lon numeric, \ 50 | lat numeric, \ 51 | number text, \ 52 | street citext, \ 53 | unit text, \ 54 | city citext, \ 55 | district text, \ 56 | region citext, \ 57 | postcode text, \ 58 | id text, \ 59 | hash text, \ 60 | geom geometry(point,4326) \ 61 | ) partition by list (region)" 62 | 63 | clean: ; $(psql) -c "drop table if exists $(schema).address;" 64 | --------------------------------------------------------------------------------