├── .gitignore
├── src
├── test
│ ├── resources
│ │ └── com
│ │ │ └── conveyal
│ │ │ └── data
│ │ │ └── census
│ │ │ └── integrationTest.zip
│ └── java
│ │ └── com
│ │ └── conveyal
│ │ └── data
│ │ └── census
│ │ └── IntegrationTest.java
└── main
│ └── java
│ └── com
│ └── conveyal
│ └── data
│ └── census
│ ├── FileSeamlessSource.java
│ ├── TigerLineSource.java
│ ├── S3SeamlessSource.java
│ ├── CensusLoader.java
│ ├── CensusExtractor.java
│ ├── SeamlessSource.java
│ ├── LodesSource.java
│ └── ShapeDataStore.java
├── settings.xml
├── randomizeCsv.py
├── LICENSE
├── .travis.yml
├── README.md
├── downloadData.py
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | seamless-census.iml
3 | target/*
4 | temporary_dir/*
5 |
--------------------------------------------------------------------------------
/src/test/resources/com/conveyal/data/census/integrationTest.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/conveyal/seamless-census/HEAD/src/test/resources/com/conveyal/data/census/integrationTest.zip
--------------------------------------------------------------------------------
/settings.xml:
--------------------------------------------------------------------------------
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0">
  <servers>
    <server>
      <id>conveyal</id>
      <username>${env.AWS_ACCESS_KEY_ID}</username>
      <password>${env.AWS_SECRET_ACCESS_KEY}</password>
    </server>
  </servers>
</settings>
--------------------------------------------------------------------------------
/randomizeCsv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # Take a CSV from LODES and make all values unique so we can be sure that our tests
3 | # are working. Some columns are all zeros or are (almost) collinear, so accidentally
4 | # switching them might not make the tests fail.
5 |
6 | from csv import DictReader, DictWriter
7 | from sys import argv
8 |
# Module-wide counter backing nextVal(); only touched through that function.
_counter = 0

def nextVal():
    """Return a fresh unique integer (1, 2, 3, ...) on each call."""
    global _counter
    _counter = _counter + 1
    return _counter
14 |
with open(argv[1]) as infile:
    reader = DictReader(infile)

    # newline='' so the csv module controls line endings itself (Python 3 csv docs)
    with open(argv[2], 'w', newline='') as outfile:
        writer = DictWriter(outfile, reader.fieldnames)
        writer.writeheader()

        # LODES data columns all start with 'C'; replace each with a fresh unique value.
        # NOTE: .items() replaces .iteritems(), which is Python 2-only and crashes under
        # Python 3 (the rest of this repo, e.g. downloadData.py, is Python 3).
        for row in reader:
            writer.writerow({k: nextVal() if k.startswith('C') else v for k, v in row.items()})
24 |
--------------------------------------------------------------------------------
/src/main/java/com/conveyal/data/census/FileSeamlessSource.java:
--------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 |
8 | /**
9 | * Seamless source for the file system.
10 | */
11 | public class FileSeamlessSource extends SeamlessSource {
12 | private File directory;
13 |
14 | public FileSeamlessSource(String path) {
15 | this.directory = new File(path);
16 | }
17 |
18 | @Override protected InputStream getInputStream(int x, int y) throws IOException {
19 | File dir = new File(directory, x + "");
20 | File file = new File(dir, y + ".pbf.gz");
21 |
22 | if (!file.exists())
23 | return null;
24 |
25 | return new FileInputStream(file);
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Conveyal
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/main/java/com/conveyal/data/census/TigerLineSource.java:
--------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 |
3 | import com.conveyal.data.geobuf.GeobufFeature;
4 | import org.geotools.data.FileDataStore;
5 | import org.geotools.data.FileDataStoreFinder;
6 | import org.geotools.data.Query;
7 | import org.geotools.data.simple.SimpleFeatureCollection;
8 | import org.geotools.data.simple.SimpleFeatureIterator;
9 | import org.geotools.data.simple.SimpleFeatureSource;
10 | import org.geotools.referencing.CRS;
11 |
12 | import java.io.File;
13 | import java.util.HashMap;
14 |
15 | /**
16 | * Reads TIGER/Line data into a MapDB.
17 | */
18 | public class TigerLineSource {
19 | private File shapefile;
20 |
21 | public TigerLineSource (File shapefile) {
22 | this.shapefile = shapefile;
23 | }
24 |
25 | public void load (ShapeDataStore store) throws Exception {
26 | FileDataStore fds = FileDataStoreFinder.getDataStore(shapefile);
27 | SimpleFeatureSource src = fds.getFeatureSource();
28 |
29 | Query q = new Query();
30 | q.setCoordinateSystem(src.getInfo().getCRS());
31 | q.setCoordinateSystemReproject(CRS.decode("EPSG:4326", true));
32 | SimpleFeatureCollection sfc = src.getFeatures(q);
33 |
34 | for (SimpleFeatureIterator it = sfc.features(); it.hasNext();) {
35 | GeobufFeature feat = new GeobufFeature(it.next());
36 | feat.id = null;
37 | feat.numericId = Long.parseLong((String) feat.properties.get("GEOID10"));
38 | feat.properties = new HashMap<>();
39 | store.add(feat);
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/java/com/conveyal/data/census/S3SeamlessSource.java:
--------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 |
3 | import com.amazonaws.services.s3.AmazonS3;
4 | import com.amazonaws.services.s3.AmazonS3Client;
5 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
6 | import com.amazonaws.services.s3.model.AmazonS3Exception;
7 | import com.amazonaws.services.s3.model.GetObjectRequest;
8 |
9 | import java.io.IOException;
10 | import java.io.InputStream;
11 |
12 | /**
13 | * A seamless data source based on storage in Amazon S3.
14 | */
15 | public class S3SeamlessSource extends SeamlessSource {
16 | private static AmazonS3 s3;
17 |
18 | public final String region;
19 | public final String bucketName;
20 |
21 | public S3SeamlessSource(String bucketName) {
22 | this.region = null;
23 | this.bucketName = bucketName;
24 | this.s3 = AmazonS3ClientBuilder.defaultClient();
25 | }
26 |
27 | public S3SeamlessSource(String region, String bucketName) {
28 | this.region = region;
29 | this.bucketName = bucketName;
30 | this.s3 = AmazonS3ClientBuilder.standard()
31 | .withRegion(region)
32 | .build();
33 | }
34 |
35 | @Override
36 | protected InputStream getInputStream(int x, int y) throws IOException {
37 | try {
38 | GetObjectRequest req = new GetObjectRequest(bucketName, String.format("%d/%d.pbf.gz", x, y));
39 | // the LODES bucket is requester-pays.
40 | req.setRequesterPays(true);
41 | return s3.getObject(req).getObjectContent();
42 | } catch (AmazonS3Exception e) {
43 | // there is no data in this tile
44 | if ("NoSuchKey".equals(e.getErrorCode()))
45 | return null;
46 | else
47 | // re-throw, something else is amiss
48 | throw e;
49 | }
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 |
3 | # Geobuf requires Java 8 (OpenJDK 8 is not available on travis)
4 | jdk:
5 | - oraclejdk8
6 |
7 | # encrypted AWS access/secret keys to allow automated deployment to conveyal maven repo
8 | env:
9 | global:
10 | # encrypted AWS access/secret keys to allow automated deployment to conveyal maven repo
11 | - secure: "o5mIHTKiJgHjYUfoHcDwElNj8eShRTqVJRNS5DTfxy+7Tnv6JcKXv4BQzJi7Lzj8kfvQz9xBmYRij2oY+t/dDDcwBfkI4WrrGL1b5pgl1HoyncXgQj0uej8C+bD3FtKJ5aD/UJx5YtbKeMTE8Yz7+IzfyW7JtgANEoMbme1yID47wlSNgfYbvEeHvQPA1RpjR7NdJvesgGt/05L2W46tacxptp5MDrqCyvytpeoWQjS8RpApACL9NoOENH4hxsxJbgjaBK6QComiymOQVSfTwi0MEWqIdbl8YBaA4c1RglRYF86BDnuOqNqsb+yO9pizWeyDJTo1T3szG4o23Wo6MqBu52QC6oPMkoED59bBATB6uDwG7uxDkpDk7BWJZkg7VFQSNDM62+gOII2mssDp49Qo2jP0wVozCGkk9mExOTUrGfwx/AVGj88ockJFIJw6y9/8GtpqwxMdsHEPKJl3/iKqt3IB4gJ4jErlowMPG4uRPO01vkjXx1wi1+lu0TCLivz1Kjgby4RPnD6THG5SP2wpYypym7jyWp7aB4AWoeULuJNN/2MYYFGxT3yo3fJ4A+Md7sQf548GN4nJC9CX4+JASTxF/ZnKOv3VS++SA8EBKyAOd57vZJZ/mRF/2RhkQtj6Y8dt1ztOR/mhXD0TmX/sLPcGvs4Y1I6TnqSCaSo="
12 | - secure: "i5IlkapdSkptmr1unNgYvn0Ps707OOcL9qOxlPUdz/LcHdBQWv0ajuvdvS84JgkY0OFVPVk6+CeiByUU7CbHUNLTuwubXqKoxyAx095jLZ8eG4y12SlZQOay70djgfRHCUj2YMhHqIwoXjObc3FCoUjs8xlofdvHItDQT5kwVt/MVAssXfYxTHaLwhjI4JHhWk/YKbY6pfD2RF+3DiL6RT75By7lbVuTmLZqzDgNFGxmYN+yWNiIowpTIZto651ttV20ICu5atVMryWL8uurG+/xRhari4NTnHn7Clq8MbwxXwOUDekIlUy2+GAsFGCluYZESNJ76d2lVzAjJF3jzE2Rg989KEW02v3iMYi/xu1NY75ZUpQeMS0zTCQCC1BbZq1LmKqdh8Bp69kn1CmPaBWIfG/ZncCLplTsqK6aNnCZsskcRNR/es3DW2n0GBeG/CtPkvKg79olJyCdsvowfXezb4isw1MkjSh1tIaUD5lfJ8KBmXHz0IsYBOtraQZRjwOa8Cef0YQP7v5K4F9C5YNUzEVW7RZML8OUc30qjT8xUpfmnErrVNaSMJFyKA6bppBhGAfJQLtrsjOoCTlh4JnXzkq0M4owckXOr8f+iED8q5DxWujTdIR5fA/WwsiINw82kf1D42UNEaSENor3YvzlTG2GJF90akOmpby9WO0="
13 |
14 | # Run on container based infrastructure (allows caching &c.)
15 | sudo: false
16 |
17 | # deploy maven artifacts from master iff it's not a pull request
18 | after_success: |
19 | if [ "$TRAVIS_BRANCH" = "master" ] && [ "$TRAVIS_PULL_REQUEST" = "false" ]; then
20 | # no need to run tests again
21 | mvn deploy --settings settings.xml -DskipTests
22 | fi
23 |
24 | # Save the maven cache to speed up builds
25 | cache:
26 | directories:
27 | - "$HOME/.m2/repository"
28 |
--------------------------------------------------------------------------------
/src/main/java/com/conveyal/data/census/CensusLoader.java:
--------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 |
3 | import org.slf4j.Logger;
4 | import org.slf4j.LoggerFactory;
5 |
6 | import java.io.File;
7 | import java.util.stream.Stream;
8 |
9 | /**
10 | * Import data from the US Census into a seamless store in S3 or on disk.
11 | */
12 | public class CensusLoader {
13 | protected static final Logger LOG = LoggerFactory.getLogger(CensusLoader.class);
14 |
15 | public static void main (String... args) throws Exception {
16 | File indir = new File(args[0]);
17 | File tiger = new File(indir, "tiger");
18 |
19 | ShapeDataStore store = new ShapeDataStore();
20 |
21 | // load up the tiger files in parallel
22 | LOG.info("Loading TIGER (geometry)");
23 | Stream.of(tiger.listFiles())
24 | .filter(f -> f.getName().endsWith(".shp"))
25 | .forEach(f -> {
26 | LOG.info("Loading file {}", f);
27 | TigerLineSource src = new TigerLineSource(f);
28 | try {
29 | src.load(store);
30 | } catch (Exception e) {
31 | throw new RuntimeException(e);
32 | }
33 | });
34 |
35 | LOG.info("TIGER done");
36 |
37 | LOG.info("Loading LODES workforce data");
38 | File workforce = new File(indir, "workforce");
39 | Stream.of(workforce.listFiles())
40 | .filter(f -> f.getName().endsWith(".csv.gz"))
41 | .forEach(f -> {
42 | LOG.info("Loading file {}", f);
43 | try {
44 | new LodesSource(f, LodesSource.LodesType.RESIDENCE).load(store);
45 | } catch (Exception e) {
46 | throw new RuntimeException(e);
47 | }
48 | });
49 | LOG.info("Workforce done");
50 |
51 | LOG.info("Loading LODES jobs data");
52 | File jobs = new File(indir, "jobs");
53 | Stream.of(jobs.listFiles())
54 | .filter(f -> f.getName().endsWith(".csv.gz"))
55 | .forEach(f -> {
56 | LOG.info("Loading file {}", f);
57 | try {
58 | new LodesSource(f, LodesSource.LodesType.WORKPLACE).load(store);
59 | } catch (Exception e) {
60 | throw new RuntimeException(e);
61 | }
62 | });
63 | LOG.info("Jobs done");
64 |
65 | if (args.length == 1)
66 | store.writeTiles(new File(indir, "tiles"));
67 | else
68 | // write to s3
69 | store.writeTilesToS3(args[1]);
70 |
71 | store.close();
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # seamless-census
2 |
3 | Import US Census data into a seamless storage environment.
4 |
5 | ## Usage
6 |
7 | Running the download and load steps for the entire US requires ~45 GB of disk space.
8 |
9 | ### Download data
10 |
11 | You can use the following command to download
12 | data from the Census bureau. Create a temporary directory to receive the files before you combine them and load them to
13 | S3, in a location that has plenty of disk space. The arguments are the temporary directory and the two-letter postal abbreviations
14 | of the states for which you want to retrieve data (you can also use the special code ALL to retrieve data for every state, territory and district).
15 | The command below, for instance, would download data for the greater Washington, DC megalopolis.
16 |
17 | python downloadData.py temporary_dir DC MD VA WV DE
18 |
19 | ### Load data
20 |
21 | Use the same temporary directory
22 | you used above. If you omit the s3 bucket name, it will place the tiles in the `tiles` directory in the temporary directory.
23 |
24 | JAVA_OPTS=-Xmx[several]G mvn exec:java -Dexec.mainClass="com.conveyal.data.census.CensusLoader" -Dexec.args="temporary_dir s3_bucket_name"
25 |
26 | ### Extract data
27 |
28 | Now for the fun part. The following command will extract the data stored in the s3 bucket specified, using the bounding box specified,
29 | to the geobuf file out.pbf.
30 |
31 | JAVA_OPTS=-Xmx[several]G mvn exec:java -Dexec.mainClass="com.conveyal.data.census.CensusExtractor" -Dexec.args="s3://bucket_name n e s w out.pbf"
32 |
33 | ## Data storage
34 |
35 | Data is stored in a directory structure, which is kept in Amazon S3. Census data is split
36 | up into zoom-level-11 tiles and stored in [GeoBuf](https://github.com/mapbox/geobuf) files, each
in a directory for its source and its x coordinate, and named by its y coordinate. For example, `us-census-2012/342/815.pbf`
38 | might contain US LODES data and decennial census data for southeastern Goleta, CA.
39 |
40 | Enumeration units that fall into two tiles should be included in both tiles. It is the responsibility
41 | of the data consumer to deduplicate them; this can be done based on IDs. An enumeration unit that is
42 | duplicated across tiles must have the same integer ID in both tiles.
43 |
44 | We have already loaded LODES data from 2013, 2014, 2015, and 2017 in the S3 buckets `lodes-data`, `lodes-data-2014`, `lodes-data-2015`, etc.
45 | These buckets and their contents are publicly readable and requester-pays (i.e. accessing them will incur fees on your AWS account).
The 2013 data lack Massachusetts and use 2011 data for Kansas, due to data availability.
47 | The 2014 and 2015 data do not have these problems.
48 | The 2017 data exclude federal employees and use 2016 data for Alaska and South Dakota. See LODES Technical Documentation for details.
49 |
50 | ## Use in Conveyal Analysis
51 |
52 | Any dataset that can be placed in this format can be used in [Conveyal Analysis](https://github.com/conveyal/analysis-ui)
53 |
--------------------------------------------------------------------------------
/src/main/java/com/conveyal/data/census/CensusExtractor.java:
--------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 |
3 | import com.conveyal.data.geobuf.GeobufEncoder;
4 | import com.conveyal.data.geobuf.GeobufFeature;
5 | import com.conveyal.geojson.GeoJsonModule;
6 | import com.fasterxml.jackson.databind.ObjectMapper;
7 | import org.locationtech.jts.geom.Geometry;
8 | import org.locationtech.jts.geom.GeometryCollection;
9 | import org.locationtech.jts.geom.GeometryFactory;
10 | import org.locationtech.jts.geom.Polygon;
11 |
12 | import java.io.*;
13 | import java.util.List;
14 | import java.util.Map;
15 |
16 | /**
17 | * Extract Census data from a seamless datastore.
18 | */
19 | public class CensusExtractor {
20 | /**
21 | * The precision to use for output files.
22 | * Set above 6 at your own risk; higher precision files work fine with the reference implementation and with geobuf-java,
23 | * but break with pygeobuf (see https://github.com/mapbox/pygeobuf/issues/21)
24 | */
25 | private static final int PRECISION = 6;
26 |
27 | public static void main (String... args) throws IOException {
28 | if (args.length < 3 || args.length > 6) {
29 | System.err.println("usage: CensusExtractor (s3://bucket|data_dir) n e s w [outfile.json]");
30 | System.err.println(" or: CensusExtractor (s3://bucket|data_dir) boundary.geojson [outfile.json]");
31 | return;
32 | }
33 |
34 | SeamlessSource source;
35 | if (!args[0].startsWith("s3://"))
36 | source = new FileSeamlessSource(args[0]);
37 | else
38 | source = new S3SeamlessSource(args[0].substring(5));
39 |
40 | long start = System.currentTimeMillis();
41 |
42 | Map features;
43 |
44 | if (args.length >= 4) {
45 | features = source.extract(Double.parseDouble(args[1]),
46 | Double.parseDouble(args[2]),
47 | Double.parseDouble(args[3]),
48 | Double.parseDouble(args[4]),
49 | false
50 | );
51 | }
52 | else {
53 | // read geojson boundary
54 | ObjectMapper om = new ObjectMapper();
55 | om.registerModule(new GeoJsonModule());
56 | FileInputStream fis = new FileInputStream(new File(args[1]));
57 | FeatureCollection fc = om.readValue(fis, FeatureCollection.class);
58 | fis.close();
59 |
60 | features = source.extract(fc.features.get(0).geometry, false);
61 | }
62 |
63 | OutputStream out;
64 |
65 | long completeTime = System.currentTimeMillis() - start;
66 | System.err.println("Read " + features.size() + " features in " + completeTime + "msec");
67 |
68 | if (args.length == 6)
69 | out = new FileOutputStream(new File(args[5]));
70 | else if (args.length == 3)
71 | out = new FileOutputStream(new File(args[2]));
72 | else
73 | out = System.out;
74 |
75 | GeobufEncoder encoder = new GeobufEncoder(out, PRECISION);
76 | encoder.writeFeatureCollection(features.values());
77 | encoder.close();
78 |
79 | if (out instanceof FileOutputStream)
80 | out.close();
81 | }
82 |
83 | // rudimentary geojson classes to deserialize feature collection
84 |
85 | public static class FeatureCollection {
86 | public String type;
87 | public Map crs;
88 | public List features;
89 | }
90 |
91 | public static class Feature {
92 | public String type;
93 | public Map properties;
94 | public Geometry geometry;
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/src/main/java/com/conveyal/data/census/SeamlessSource.java:
--------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 |
3 | import com.conveyal.data.geobuf.GeobufDecoder;
4 | import com.conveyal.data.geobuf.GeobufFeature;
5 | import org.locationtech.jts.geom.*;
6 | import org.locationtech.jts.geom.prep.PreparedPolygon;
7 | import org.locationtech.jts.util.GeometricShapeFactory;
8 | import org.mapdb.DBMaker;
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 |
12 | import java.io.BufferedInputStream;
13 | import java.io.IOException;
14 | import java.io.InputStream;
15 | import java.util.HashMap;
16 | import java.util.Map;
17 | import java.util.zip.GZIPInputStream;
18 |
19 | import static com.conveyal.data.census.ShapeDataStore.lat2tile;
20 | import static com.conveyal.data.census.ShapeDataStore.lon2tile;
21 |
22 | /**
23 | * A tile source for seamless Census extracts
24 | */
25 | public abstract class SeamlessSource {
26 | // convenience
27 | private static final int ZOOM_LEVEL = ShapeDataStore.ZOOM_LEVEL;
28 |
29 | protected static final Logger LOG = LoggerFactory.getLogger(SeamlessSource.class);
30 |
31 | private static final GeometryFactory geometryFactory = new GeometryFactory();
32 |
33 | /** Extract features by bounding box */
34 | public Map extract(double north, double east, double south, double west, boolean onDisk) throws
35 | IOException {
36 | GeometricShapeFactory factory = new GeometricShapeFactory(geometryFactory);
37 | factory.setCentre(new Coordinate((east + west) / 2, (north + south) / 2));
38 | factory.setWidth(east - west);
39 | factory.setHeight(north - south);
40 | Polygon rect = factory.createRectangle();
41 | return extract(rect, onDisk);
42 | }
43 |
44 | /** Extract features by arbitrary polygons */
45 | public Map extract(Geometry bounds, boolean onDisk) throws IOException {
46 | Map ret;
47 |
48 | if (onDisk)
49 | ret = DBMaker.tempTreeMap();
50 | else
51 | ret = new HashMap<>();
52 |
53 | Envelope env = bounds.getEnvelopeInternal();
54 | double west = env.getMinX(), east = env.getMaxX(), north = env.getMaxY(), south = env.getMinY();
55 |
56 | // TODO: use prepared polygons
57 |
58 | // figure out how many tiles we're requesting
59 | int minX = lon2tile(west, ZOOM_LEVEL), maxX = lon2tile(east, ZOOM_LEVEL),
60 | minY = lat2tile(north, ZOOM_LEVEL), maxY = lat2tile(south, ZOOM_LEVEL);
61 |
62 | int tcount = (maxX - minX + 1) * (maxY - minY + 1);
63 |
64 | LOG.info("Requesting {} tiles", tcount);
65 |
66 | int fcount = 0;
67 |
68 | // read all the relevant tiles
69 | for (int x = minX; x <= maxX; x++) {
70 | for (int y = minY; y <= maxY; y++) {
71 | InputStream is = getInputStream(x, y);
72 |
73 | if (is == null)
74 | // no data in this tile
75 | continue;
76 |
77 | // decoder closes input stream as soon as it has read the tile
78 | GeobufDecoder decoder = new GeobufDecoder(new GZIPInputStream(new BufferedInputStream(is)));
79 |
80 | while (decoder.hasNext()) {
81 | GeobufFeature f = decoder.next();
82 | // blocks are duplicated at the edges of tiles, no need to import twice
83 | if (ret.containsKey(f.numericId))
84 | continue;
85 |
86 | if (!bounds.disjoint(f.geometry)) {
87 | ret.put(f.numericId, f);
88 | fcount++;
89 |
90 | if (fcount % 1000 == 0)
91 | LOG.info("Read {} features", fcount);
92 | }
93 | }
94 | }
95 | }
96 |
97 | return ret;
98 | }
99 |
100 | /** get an input stream for the given tile */
101 | protected abstract InputStream getInputStream(int x, int y) throws IOException;
102 | }
103 |
--------------------------------------------------------------------------------
/downloadData.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # Download the data we need for a particular [set of] states, or the entire country
3 | # usage: downloadData.py outDir state_abbr [state_abbr . . .]
4 | # Or use special code 'ALL'
5 |
6 | from sys import argv
7 | from urllib.request import urlretrieve
8 | import zipfile
9 | import os
10 | import os.path
11 | from shutil import copyfileobj
12 | from time import sleep
13 |
14 | # map from state abbreviations to FIPS codes
15 | # per http://www.epa.gov/enviro/html/codes/state.html
# Map from two-letter state/territory postal abbreviation to FIPS code,
# per http://www.epa.gov/enviro/html/codes/state.html
fipsCodes = {
    'AK': "02",  # ALASKA
    'AL': "01",  # ALABAMA
    'AR': "05",  # ARKANSAS
    'AS': "60",  # AMERICAN SAMOA
    'AZ': "04",  # ARIZONA
    'CA': "06",  # CALIFORNIA
    'CO': "08",  # COLORADO
    'CT': "09",  # CONNECTICUT
    'DC': "11",  # DISTRICT OF COLUMBIA
    'DE': "10",  # DELAWARE
    'FL': "12",  # FLORIDA
    'GA': "13",  # GEORGIA
    'GU': "66",  # GUAM
    'HI': "15",  # HAWAII
    'IA': "19",  # IOWA
    'ID': "16",  # IDAHO
    'IL': "17",  # ILLINOIS
    'IN': "18",  # INDIANA
    'KS': "20",  # KANSAS
    'KY': "21",  # KENTUCKY
    'LA': "22",  # LOUISIANA
    'MA': "25",  # MASSACHUSETTS
    'MD': "24",  # MARYLAND
    'ME': "23",  # MAINE
    'MI': "26",  # MICHIGAN
    'MN': "27",  # MINNESOTA
    'MO': "29",  # MISSOURI
    'MS': "28",  # MISSISSIPPI
    'MT': "30",  # MONTANA
    'NC': "37",  # NORTH CAROLINA
    'ND': "38",  # NORTH DAKOTA
    'NE': "31",  # NEBRASKA
    'NH': "33",  # NEW HAMPSHIRE
    'NJ': "34",  # NEW JERSEY
    'NM': "35",  # NEW MEXICO
    'NV': "32",  # NEVADA
    'NY': "36",  # NEW YORK
    'OH': "39",  # OHIO
    'OK': "40",  # OKLAHOMA
    'OR': "41",  # OREGON
    'PA': "42",  # PENNSYLVANIA
    'PR': "72",  # PUERTO RICO
    'RI': "44",  # RHODE ISLAND
    'SC': "45",  # SOUTH CAROLINA
    'SD': "46",  # SOUTH DAKOTA
    'TN': "47",  # TENNESSEE
    'TX': "48",  # TEXAS
    'UT': "49",  # UTAH
    'VA': "51",  # VIRGINIA
    'VI': "78",  # VIRGIN ISLANDS
    'VT': "50",  # VERMONT
    'WA': "53",  # WASHINGTON
    'WI': "55",  # WISCONSIN
    'WV': "54",  # WEST VIRGINIA
    'WY': "56",  # WYOMING
}
73 |
# parse arguments: output directory, then state abbreviations (or the special code ALL)
outDir = argv[1]
states = [state.upper() for state in argv[2:]]

if len(states) == 1 and states[0] == 'ALL':
    # download all states
    print("Downloading all states")
    states = fipsCodes.keys()

# check inputs, and fail fast: previously we only printed a warning here and then
# crashed mid-download with a KeyError on the first unrecognized state
invalidStates = [state for state in states if not state in fipsCodes]

if len(invalidStates) > 0:
    raise SystemExit("Did not recognize states %s" % ' '.join(invalidStates))

# make the directory structure (tolerating directories left over from a previous run)
os.makedirs(os.path.join(outDir, 'tiger'), exist_ok=True)
os.makedirs(os.path.join(outDir, 'jobs'), exist_ok=True)
os.makedirs(os.path.join(outDir, 'workforce'), exist_ok=True)
93 |
# download resiliently: the census servers occasionally drop connections
def retrieve(url, path):
    """Download url to path, retrying up to 50 times, then raising RuntimeError.

    A bare ``except:`` here previously swallowed KeyboardInterrupt/SystemExit as
    well, and exhausting all attempts returned silently, deferring the failure
    to a confusing error later in the pipeline.
    """
    for i in range(50):
        try:
            print("download attempt {0}".format(i))
            urlretrieve(url, path)
        except Exception:
            print("error retrieving {0}, retrying".format(url))
            sleep(5)
        else:
            break
    else:
        raise RuntimeError("could not retrieve {0}".format(url))
106 |
# download TIGER geometry and LODES data for each requested state
for state in states:
    print('processing %s' % state)
    print('Downloading TIGER')
    # get tiger (2010 census block geometry for this state)
    fips = fipsCodes[state]
    zipout = os.path.join(outDir, "tiger", "{0}.zip".format(state))
    retrieve("ftp://ftp2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/tl_2010_{0}_tabblock10.zip".format(fips), zipout)

    # unzip it, flattening any internal directory structure into outDir/tiger
    # adapted from http://stackoverflow.com/questions/12886768/
    with zipfile.ZipFile(zipout) as zf:
        for member in zf.infolist():
            name = os.path.split(member.filename)[-1]
            dest = os.path.join(outDir, 'tiger', name)
            with zf.open(member) as stream:
                with open(dest, 'wb') as out:
                    copyfileobj(stream, out)

    # we no longer need the zipfile
    os.remove(zipout)

    print('Done with TIGER')

    print('Downloading LODES data')

    # figure out the year of the latest available data
    # Most states have 2017 data available
    # see http://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.4.pdf, page 2f
    year = 2017

    # Alaska and South Dakota do not have LODES2017 data available, so use 2016
    if state == 'AK' or state == 'SD':
        year = 2016
    elif state == 'PR' or state == 'VI':
        print('{0} does not have LODES data available'.format(state))
        year = 0  # falsy: skips the LODES downloads below

    if year:
        print("Downloading {0} LODES data for {1}".format(year, state))

        # get the rac file (residence area characteristics, i.e. workforce)
        out = os.path.join(outDir, 'workforce', '{0}_{1}_rac.csv.gz'.format(state, year))
        retrieve("http://lehd.ces.census.gov/data/lodes/LODES7/{0}/rac/{0}_rac_S000_JT00_{1}.csv.gz".format(state.lower(), year), out)

        # get the wac file (workplace area characteristics, i.e. jobs)
        out = os.path.join(outDir, 'jobs', '{0}_{1}_wac.csv.gz'.format(state, year))
        retrieve("http://lehd.ces.census.gov/data/lodes/LODES7/{0}/wac/{0}_wac_S000_JT00_{1}.csv.gz".format(state.lower(), year), out)

    print('Done with {0}'.format(state))
--------------------------------------------------------------------------------
/src/main/java/com/conveyal/data/census/LodesSource.java:
--------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 |
3 | import com.conveyal.data.geobuf.GeobufFeature;
4 | import com.csvreader.CsvReader;
5 |
6 | import java.io.*;
7 | import java.util.HashMap;
8 | import java.util.Map;
9 | import java.util.zip.GZIPInputStream;
10 |
11 | /**
12 | * Data source for LODES data.
13 | */
14 | public class LodesSource {
15 | private File input;
16 | private LodesType type;
17 |
    /**
     * @param input gzipped LODES CSV (a rac or wac file as downloaded by downloadData.py)
     * @param type whether the file holds workplace (wac) or residence (rac) characteristics
     */
    public LodesSource(File input, LodesType type) {
        this.input = input;
        this.type = type;
    }
22 |
23 | public void load(ShapeDataStore store) throws Exception {
24 | InputStream csv = new GZIPInputStream(new BufferedInputStream(new FileInputStream(input)));
25 | CsvReader reader = new CsvReader(new InputStreamReader(csv));
26 |
27 | // rename the columns to something useful
28 | //http://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.1.pdf#page=7&zoom=auto,-266,580
29 | Map colNames = new HashMap<>();
30 | colNames.put("C000", "total");
31 |
32 | colNames.put("CA01", "age 29 or younger");
33 | colNames.put("CA02", "age 30 to 54");
34 | colNames.put("CA03", "age 55 or older");
35 |
36 | colNames.put("CE01", "with earnings $1250 per month or less");
37 | colNames.put("CE02", "with earnings $1251 - $3333 per month");
38 | colNames.put("CE03", "with earnings greater than $3333 per month");
39 |
40 | colNames.put("CNS01", "in agriculture, forestry, fishing and hunting");
41 | colNames.put("CNS02", "in mining, quarrying, and oil and gas extraction");
42 | colNames.put("CNS03", "in utilities");
43 | colNames.put("CNS04", "in construction");
44 | colNames.put("CNS05", "in manufacturing");
45 | colNames.put("CNS06", "in wholesale trade");
46 | colNames.put("CNS07", "in retail trade");
47 | colNames.put("CNS08", "in transportation and warehousing");
48 | colNames.put("CNS09", "in information");
49 | colNames.put("CNS10", "in finance and insurance");
50 | colNames.put("CNS11", "in real estate");
51 | colNames.put("CNS12", "in professional, scientific and technical services");
52 | colNames.put("CNS13", "in management");
53 | colNames.put("CNS14", "in administration, support, and waste management");
54 | colNames.put("CNS15", "in educational services");
55 | colNames.put("CNS16", "in healthcare and social assistance");
56 | colNames.put("CNS17", "in arts, entertainment and recreation");
57 | colNames.put("CNS18", "in accommodation and food services");
58 | colNames.put("CNS19", "in other services, except public administration");
59 | colNames.put("CNS20", "in public administration");
60 |
61 | colNames.put("CR01", "with race White alone");
62 | colNames.put("CR02", "with race Black or African American alone");
63 | colNames.put("CR03", "with race American Indian or Alaska Native alone");
64 | colNames.put("CR04", "with race Asian alone");
65 | colNames.put("CR05", "with race Native Hawaiian or Other Pacific Islander alone");
66 | colNames.put("CR07", "with two or more racial groups");
67 |
68 | colNames.put("CT01", "not Hispanic or Latino");
69 | colNames.put("CT02", "Hispanic or Latino");
70 |
71 | colNames.put("CD01", "with less than high school education");
72 | colNames.put("CD02", "with high school education, no college");
73 | colNames.put("CD03", "with some college education or Associate degree");
74 | colNames.put("CD04", "with Bachelor's degree or higher");
75 | colNames.put("CS01", "male");
76 | colNames.put("CS02", "female");
77 |
78 | // only in workplace characteristics
79 | colNames.put("CFA01", "at firms aged 0-1 years");
80 | colNames.put("CFA02", "at firms aged 2-3 years");
81 | colNames.put("CFA03", "at firms aged 4-5 years");
82 | colNames.put("CFA04", "at firms aged 6-10 years");
83 | colNames.put("CFA05", "at firms aged 11 or more years");
84 |
85 | colNames.put("CFS01", "at firms with 0-19 employees");
86 | colNames.put("CFS02", "at firms with 20-49 employees");
87 | colNames.put("CFS03", "at firms with 50-249 employees");
88 | colNames.put("CFS04", "at firms with 250-499 employees");
89 | colNames.put("CFS05", "at firms with 500 or more employees");
90 | colNames.put("createdate", "Data creation date");
91 |
92 | reader.readHeaders();
93 | String[] headers = reader.getHeaders();
94 |
95 | // read the file
96 | while (reader.readRecord()) {
97 | long id = Long.parseLong(reader.get(type == LodesType.WORKPLACE ? "w_geocode" : "h_geocode"));
98 | GeobufFeature feat = store.get(id);
99 |
100 | String[] line = reader.getValues();
101 | for (int i = 0; i < line.length; i++) {
102 | String col = headers[i];
103 |
104 | if (!colNames.containsKey(col))
105 | continue;
106 |
107 | String colName;
108 |
109 | if (type == LodesType.WORKPLACE) {
110 | if (col.startsWith("CR") || col.startsWith("CD") || col.startsWith("CA"))
111 | colName = "Jobs employing workers " + colNames.get(col);
112 | else if (col.startsWith("CS"))
113 | colName = "Jobs employing " + colNames.get(col) + "s";
114 | else if (col.startsWith("CT"))
115 | colName = "Jobs employing " + colNames.get(col) + " workers";
116 | else
117 | colName = "Jobs " + colNames.get(col);
118 | }
119 | else if (type == LodesType.RESIDENCE) {
120 | if (col.startsWith("CT") || col.startsWith("CS"))
121 | colName = "Workers, " + colNames.get(col);
122 | else
123 | colName = "Workers " + colNames.get(col);
124 | }
125 | else {
126 | throw new IllegalArgumentException("Invalid LODES type");
127 | }
128 |
129 | feat.properties.put(colName, Integer.parseInt(line[i]));
130 | }
131 |
132 | store.put(feat);
133 | }
134 |
135 | reader.close();
136 | }
137 |
138 | /** supported lodes types are workplace area characteristics and residence area characteristics */
139 | public static enum LodesType {
140 | WORKPLACE, RESIDENCE
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.conveyal.data.census</groupId>
8 |     <artifactId>seamless-census</artifactId>
9 |     <version>1.2-SNAPSHOT</version>
10 |
11 |
12 | UTF-8
13 |
14 |
15 |
16 |
17 | org.mapdb
18 | mapdb
19 | 2.0-beta6
20 |
21 |
22 | com.conveyal
23 | jackson2-geojson
24 | 0.9
25 |
26 |
27 | com.google.protobuf
28 | protobuf-java
29 | 2.6.1
30 |
31 |
32 | com.conveyal
33 | geobuf-java
34 | 1.1
35 |
36 |
37 | net.sf.trove4j
38 | trove4j
39 | 3.0.3
40 |
41 |
42 | net.sourceforge.javacsv
43 | javacsv
44 | 2.0
45 |
46 |
47 | com.fasterxml.jackson.core
48 | jackson-databind
49 | 2.8.11.1
50 |
51 |
52 | com.google.guava
53 | guava
54 | 18.0
55 |
56 |
57 | com.amazonaws
58 | aws-java-sdk-s3
59 | 1.11.341
60 |
61 |
62 | junit
63 | junit
64 | 4.12
65 | test
66 |
67 |
68 |
69 |
70 |
71 | conveyal
72 | Conveyal Maven Repository
73 | https://s3.amazonaws.com/maven.conveyal.com/
74 |
75 |
76 |
77 |
78 |
79 |
80 | org.apache.maven.plugins
81 | maven-compiler-plugin
82 | 3.8.1
83 |
84 | 11
85 |
86 |
87 |
88 | org.apache.maven.plugins
89 | maven-jar-plugin
90 | 3.1.2
91 |
92 |
93 |
94 |
96 | com.conveyal.data.census
97 |
98 |
99 |
100 |
101 |
102 | org.apache.maven.plugins
103 | maven-shade-plugin
104 | 3.2.1
105 |
106 |
107 | package
108 | shade
109 |
110 | seamless-census
111 |
112 |
113 |
114 |
115 | com.conveyal.data.census.CensusExtractor
116 |
117 |
118 |
119 |
121 |
122 |
123 |
124 |
125 |
126 |
127 | *:*
128 |
129 | META-INF/*.SF
130 | META-INF/*.DSA
131 | META-INF/*.RSA
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 | org.springframework.build
146 | aws-maven
147 | 5.0.0.RELEASE
148 |
149 |
150 | org.kuali.maven.wagons
151 | maven-s3-wagon
152 | 1.2.1
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 | conveyal-maven-repo
162 | Conveyal Maven Repository
163 | s3://maven.conveyal.com/
164 |
165 |
166 |
167 |
--------------------------------------------------------------------------------
/src/main/java/com/conveyal/data/census/ShapeDataStore.java:
--------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 |
3 | import com.amazonaws.services.s3.AmazonS3;
4 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
5 | import com.amazonaws.services.s3.model.ObjectMetadata;
6 | import com.conveyal.data.geobuf.GeobufEncoder;
7 | import com.conveyal.data.geobuf.GeobufFeature;
8 | import org.locationtech.jts.geom.Envelope;
9 | import org.mapdb.*;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import java.io.*;
14 | import java.util.ArrayList;
15 | import java.util.List;
16 | import java.util.NavigableSet;
17 | import java.util.concurrent.*;
18 | import java.util.function.BiFunction;
19 | import java.util.zip.GZIPOutputStream;
20 |
21 | /**
22 | * Store geographic data by ID, with index by zoom-11 tile.
23 | */
24 | public class ShapeDataStore {
    /** Web Mercator zoom level of the spatial tile index (class stores features indexed by zoom-11 tile). */
    public static final int ZOOM_LEVEL = 11;

    private static final Logger LOG = LoggerFactory.getLogger(ShapeDataStore.class);

    /** number of decimal places of precision to store */
    public static final int PRECISION = 12;

    // MapDB database handle backing this store — presumably holds the feature map and
    // tile-index set declared below; NOTE(review): confirm lifecycle/close handling in rest of class
    private DB db;

34 | /**
35 | * set of Object[] { int[] { x, y }, Feature } for features at zoom 11
36 | */
37 | private NavigableSet