├── .gitignore
├── src
│   ├── test
│   │   ├── resources
│   │   │   └── com/conveyal/data/census/integrationTest.zip
│   │   └── java
│   │       └── com/conveyal/data/census/IntegrationTest.java
│   └── main
│       └── java
│           └── com/conveyal/data/census
│               ├── FileSeamlessSource.java
│               ├── TigerLineSource.java
│               ├── S3SeamlessSource.java
│               ├── CensusLoader.java
│               ├── CensusExtractor.java
│               ├── SeamlessSource.java
│               ├── LodesSource.java
│               └── ShapeDataStore.java
├── settings.xml
├── randomizeCsv.py
├── LICENSE
├── .travis.yml
├── README.md
├── downloadData.py
└── pom.xml
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | .idea/*
2 | seamless-census.iml
3 | target/*
4 | temporary_dir/*
5 | 
-------------------------------------------------------------------------------- /src/test/resources/com/conveyal/data/census/integrationTest.zip: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/conveyal/seamless-census/HEAD/src/test/resources/com/conveyal/data/census/integrationTest.zip
-------------------------------------------------------------------------------- /settings.xml: --------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <settings>
3 |     <servers>
4 |         <server>
5 |             <id>conveyal</id>
6 |             <!-- deploy credentials are supplied by the CI environment -->
7 |             <username>${env.AWS_ACCESS_KEY_ID}</username>
8 |             <password>${env.AWS_SECRET_ACCESS_KEY}</password>
9 |         </server>
10 |     </servers>
11 | </settings>
12 | 
-------------------------------------------------------------------------------- /randomizeCsv.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Take a CSV from LODES and make all values unique so we can be sure that our tests
3 | # are working. Some columns are all zeros or are (almost) collinear, so accidentally
4 | # switching them might not make the tests fail.
5 | 
6 | from csv import DictReader, DictWriter
7 | from sys import argv
8 | 
9 | ct = 0
10 | def nextVal():
11 |     global ct
12 |     ct += 1
13 |     return ct
14 | 
15 | with open(argv[1]) as infile:
16 |     reader = DictReader(infile)
17 | 
18 |     with open(argv[2], 'w', newline='') as outfile:
19 |         writer = DictWriter(outfile, reader.fieldnames)
20 |         writer.writeheader()
21 | 
22 |         for row in reader:
23 |             writer.writerow({k: nextVal() if k.startswith('C') else v for k, v in row.items()})
24 | 
-------------------------------------------------------------------------------- /src/main/java/com/conveyal/data/census/FileSeamlessSource.java: --------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 | 
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | 
8 | /**
9 |  * Seamless source for the file system.
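 *  Expects the layout written by ShapeDataStore.writeTiles: a directory per x tile index, containing a gzipped geobuf file per y tile index ({x}/{y}.pbf.gz).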
10 | */ 11 | public class FileSeamlessSource extends SeamlessSource { 12 | private File directory; 13 | 14 | public FileSeamlessSource(String path) { 15 | this.directory = new File(path); 16 | } 17 | 18 | @Override protected InputStream getInputStream(int x, int y) throws IOException { 19 | File dir = new File(directory, x + ""); 20 | File file = new File(dir, y + ".pbf.gz"); 21 | 22 | if (!file.exists()) 23 | return null; 24 | 25 | return new FileInputStream(file); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Conveyal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/main/java/com/conveyal/data/census/TigerLineSource.java: -------------------------------------------------------------------------------- 1 | package com.conveyal.data.census; 2 | 3 | import com.conveyal.data.geobuf.GeobufFeature; 4 | import org.geotools.data.FileDataStore; 5 | import org.geotools.data.FileDataStoreFinder; 6 | import org.geotools.data.Query; 7 | import org.geotools.data.simple.SimpleFeatureCollection; 8 | import org.geotools.data.simple.SimpleFeatureIterator; 9 | import org.geotools.data.simple.SimpleFeatureSource; 10 | import org.geotools.referencing.CRS; 11 | 12 | import java.io.File; 13 | import java.util.HashMap; 14 | 15 | /** 16 | * Reads TIGER/Line data into a MapDB. 
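 *  Block IDs come from the GEOID10 attribute, parsed as a long; all other TIGER attributes are dropped.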
17 |  */
18 | public class TigerLineSource {
19 |     private File shapefile;
20 | 
21 |     public TigerLineSource (File shapefile) {
22 |         this.shapefile = shapefile;
23 |     }
24 | 
25 |     public void load (ShapeDataStore store) throws Exception {
26 |         FileDataStore fds = FileDataStoreFinder.getDataStore(shapefile);
27 |         SimpleFeatureSource src = fds.getFeatureSource();
28 | 
29 |         Query q = new Query();
30 |         q.setCoordinateSystem(src.getInfo().getCRS());
31 |         q.setCoordinateSystemReproject(CRS.decode("EPSG:4326", true));
32 |         SimpleFeatureCollection sfc = src.getFeatures(q);
33 | 
34 |         // close the iterator when done to release the underlying shapefile resources
35 |         try (SimpleFeatureIterator it = sfc.features()) {
36 |             while (it.hasNext()) {
37 |                 GeobufFeature feat = new GeobufFeature(it.next());
38 |                 feat.id = null;
39 |                 feat.numericId = Long.parseLong((String) feat.properties.get("GEOID10"));
40 |                 feat.properties = new HashMap<>();
41 |                 store.add(feat);
42 |             }
43 |         }
44 |     }
45 | }
46 | 
-------------------------------------------------------------------------------- /src/main/java/com/conveyal/data/census/S3SeamlessSource.java: --------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 | 
3 | import com.amazonaws.services.s3.AmazonS3;
4 | import com.amazonaws.services.s3.AmazonS3Client;
5 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
6 | import com.amazonaws.services.s3.model.AmazonS3Exception;
7 | import com.amazonaws.services.s3.model.GetObjectRequest;
8 | 
9 | import java.io.IOException;
10 | import java.io.InputStream;
11 | 
12 | /**
13 |  * A seamless data source based on storage in Amazon S3.
14 |  */
15 | public class S3SeamlessSource extends SeamlessSource {
16 |     private final AmazonS3 s3;
17 | 
18 |     public final String region;
19 |     public final String bucketName;
20 | 
21 |     public S3SeamlessSource(String bucketName) {
22 |         this.region = null;
23 |         this.bucketName = bucketName;
24 |         this.s3 = AmazonS3ClientBuilder.defaultClient();
25 |     }
26 | 
27 |     public S3SeamlessSource(String region, String bucketName) {
28 |         this.region = region;
29 |         this.bucketName = bucketName;
30 |         this.s3 = AmazonS3ClientBuilder.standard()
31 |                 .withRegion(region)
32 |                 .build();
33 |     }
34 | 
35 |     @Override
36 |     protected InputStream getInputStream(int x, int y) throws IOException {
37 |         try {
38 |             GetObjectRequest req = new GetObjectRequest(bucketName, String.format("%d/%d.pbf.gz", x, y));
39 |             // the LODES bucket is requester-pays.
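            // (the requesting AWS account, i.e. the one running this extract, is billed for the transfer)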
40 | req.setRequesterPays(true); 41 | return s3.getObject(req).getObjectContent(); 42 | } catch (AmazonS3Exception e) { 43 | // there is no data in this tile 44 | if ("NoSuchKey".equals(e.getErrorCode())) 45 | return null; 46 | else 47 | // re-throw, something else is amiss 48 | throw e; 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | # Geobuf requires Java 8 (OpenJDK 8 is not available on travis) 4 | jdk: 5 | - oraclejdk8 6 | 7 | # encrypted AWS access/secret keys to allow automated deployment to conveyal maven repo 8 | env: 9 | global: 10 | # encrypted AWS access/secret keys to allow automated deployment to conveyal maven repo 11 | - secure: "o5mIHTKiJgHjYUfoHcDwElNj8eShRTqVJRNS5DTfxy+7Tnv6JcKXv4BQzJi7Lzj8kfvQz9xBmYRij2oY+t/dDDcwBfkI4WrrGL1b5pgl1HoyncXgQj0uej8C+bD3FtKJ5aD/UJx5YtbKeMTE8Yz7+IzfyW7JtgANEoMbme1yID47wlSNgfYbvEeHvQPA1RpjR7NdJvesgGt/05L2W46tacxptp5MDrqCyvytpeoWQjS8RpApACL9NoOENH4hxsxJbgjaBK6QComiymOQVSfTwi0MEWqIdbl8YBaA4c1RglRYF86BDnuOqNqsb+yO9pizWeyDJTo1T3szG4o23Wo6MqBu52QC6oPMkoED59bBATB6uDwG7uxDkpDk7BWJZkg7VFQSNDM62+gOII2mssDp49Qo2jP0wVozCGkk9mExOTUrGfwx/AVGj88ockJFIJw6y9/8GtpqwxMdsHEPKJl3/iKqt3IB4gJ4jErlowMPG4uRPO01vkjXx1wi1+lu0TCLivz1Kjgby4RPnD6THG5SP2wpYypym7jyWp7aB4AWoeULuJNN/2MYYFGxT3yo3fJ4A+Md7sQf548GN4nJC9CX4+JASTxF/ZnKOv3VS++SA8EBKyAOd57vZJZ/mRF/2RhkQtj6Y8dt1ztOR/mhXD0TmX/sLPcGvs4Y1I6TnqSCaSo=" 12 | - secure: "i5IlkapdSkptmr1unNgYvn0Ps707OOcL9qOxlPUdz/LcHdBQWv0ajuvdvS84JgkY0OFVPVk6+CeiByUU7CbHUNLTuwubXqKoxyAx095jLZ8eG4y12SlZQOay70djgfRHCUj2YMhHqIwoXjObc3FCoUjs8xlofdvHItDQT5kwVt/MVAssXfYxTHaLwhjI4JHhWk/YKbY6pfD2RF+3DiL6RT75By7lbVuTmLZqzDgNFGxmYN+yWNiIowpTIZto651ttV20ICu5atVMryWL8uurG+/xRhari4NTnHn7Clq8MbwxXwOUDekIlUy2+GAsFGCluYZESNJ76d2lVzAjJF3jzE2Rg989KEW02v3iMYi/xu1NY75ZUpQeMS0zTCQCC1BbZq1LmKqdh8Bp69kn1CmPaBWIfG/ZncCLplTsqK6aNnCZsskcRNR/es3DW2n0GBeG/CtPkvKg79olJyCdsvowfXezb4isw1MkjSh1tIaUD5lfJ8KBmXHz0IsYBOtraQZRjwOa8Cef0YQP7v5K4F9C5YNUzEVW7RZML8OUc30qjT8xUpfmnErrVNaSMJFyKA6bppBhGAfJQLtrsjOoCTlh4JnXzkq0M4owckXOr8f+iED8q5DxWujTdIR5fA/WwsiINw82kf1D42UNEaSENor3YvzlTG2GJF90akOmpby9WO0=" 13 | 14 | # Run on container based infrastructure (allows caching &c.) 15 | sudo: false 16 | 17 | # deploy maven artifacts from master iff it's not a pull request 18 | after_success: | 19 | if [ "$TRAVIS_BRANCH" = "master" ] && [ "$TRAVIS_PULL_REQUEST" = "false" ]; then 20 | # no need to run tests again 21 | mvn deploy --settings settings.xml -DskipTests 22 | fi 23 | 24 | # Save the maven cache to speed up builds 25 | cache: 26 | directories: 27 | - "$HOME/.m2/repository" 28 | -------------------------------------------------------------------------------- /src/main/java/com/conveyal/data/census/CensusLoader.java: -------------------------------------------------------------------------------- 1 | package com.conveyal.data.census; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.io.File; 7 | import java.util.stream.Stream; 8 | 9 | /** 10 | * Import data from the US Census into a seamless store in S3 or on disk. 11 | */ 12 | public class CensusLoader { 13 | protected static final Logger LOG = LoggerFactory.getLogger(CensusLoader.class); 14 | 15 | public static void main (String... 
args) throws Exception {
16 |         File indir = new File(args[0]);
17 |         File tiger = new File(indir, "tiger");
18 | 
19 |         ShapeDataStore store = new ShapeDataStore();
20 | 
21 |         // load up the tiger files
22 |         LOG.info("Loading TIGER (geometry)");
23 |         Stream.of(tiger.listFiles())
24 |                 .filter(f -> f.getName().endsWith(".shp"))
25 |                 .forEach(f -> {
26 |                     LOG.info("Loading file {}", f);
27 |                     TigerLineSource src = new TigerLineSource(f);
28 |                     try {
29 |                         src.load(store);
30 |                     } catch (Exception e) {
31 |                         throw new RuntimeException(e);
32 |                     }
33 |                 });
34 | 
35 |         LOG.info("TIGER done");
36 | 
37 |         LOG.info("Loading LODES workforce data");
38 |         File workforce = new File(indir, "workforce");
39 |         Stream.of(workforce.listFiles())
40 |                 .filter(f -> f.getName().endsWith(".csv.gz"))
41 |                 .forEach(f -> {
42 |                     LOG.info("Loading file {}", f);
43 |                     try {
44 |                         new LodesSource(f, LodesSource.LodesType.RESIDENCE).load(store);
45 |                     } catch (Exception e) {
46 |                         throw new RuntimeException(e);
47 |                     }
48 |                 });
49 |         LOG.info("Workforce done");
50 | 
51 |         LOG.info("Loading LODES jobs data");
52 |         File jobs = new File(indir, "jobs");
53 |         Stream.of(jobs.listFiles())
54 |                 .filter(f -> f.getName().endsWith(".csv.gz"))
55 |                 .forEach(f -> {
56 |                     LOG.info("Loading file {}", f);
57 |                     try {
58 |                         new LodesSource(f, LodesSource.LodesType.WORKPLACE).load(store);
59 |                     } catch (Exception e) {
60 |                         throw new RuntimeException(e);
61 |                     }
62 |                 });
63 |         LOG.info("Jobs done");
64 | 
65 |         if (args.length == 1)
66 |             store.writeTiles(new File(indir, "tiles"));
67 |         else
68 |             // write to s3
69 |             store.writeTilesToS3(args[1]);
70 | 
71 |         store.close();
72 |     }
73 | }
74 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # seamless-census
2 | 
3 | Import US Census data into a seamless storage environment.
4 | 
5 | ## Usage
6 | 
7 | Running the download and load steps for the entire US requires ~45 GB of disk space.
8 | 
9 | ### Download data
10 | 
11 | You can use the following command to download data from the Census Bureau. Create a temporary directory, in a
12 | location with plenty of disk space, to receive the files before they are combined and loaded to S3. The arguments
13 | are the temporary directory and the two-letter postal abbreviations of the states for which you want to retrieve
14 | data (you can also use the special code ALL to retrieve data for every state, territory and district).
15 | The command below, for instance, would download data for the greater Washington, DC megalopolis.
16 | 
17 |     python downloadData.py temporary_dir DC MD VA WV DE
18 | 
19 | ### Load data
20 | 
21 | Use the same temporary directory you used above. If you omit the s3 bucket name, the loader will place the tiles
22 | in the `tiles` directory within the temporary directory.
23 | 
24 |     JAVA_OPTS=-Xmx[several]G mvn exec:java -Dexec.mainClass="com.conveyal.data.census.CensusLoader" -Dexec.args="temporary_dir s3_bucket_name"
25 | 
26 | ### Extract data
27 | 
28 | Now for the fun part. The following command extracts the data stored in the specified s3 bucket, within the
29 | specified bounding box, to the geobuf file out.pbf:
30 | 
31 |     JAVA_OPTS=-Xmx[several]G mvn exec:java -Dexec.mainClass="com.conveyal.data.census.CensusExtractor" -Dexec.args="s3://bucket_name n e s w out.pbf"
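32 | 
33 | To extract with a polygon boundary instead of a bounding box, pass a GeoJSON file in place of the four
34 | coordinates; the extractor reads the first feature in the file and keeps every enumeration unit whose
35 | geometry overlaps it:
36 | 
37 |     JAVA_OPTS=-Xmx[several]G mvn exec:java -Dexec.mainClass="com.conveyal.data.census.CensusExtractor" -Dexec.args="s3://bucket_name boundary.geojson out.pbf"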
38 | 
39 | ## Data storage
40 | 
41 | Data is stored in a directory structure, which is kept in Amazon S3. Census data is split
42 | up into zoom-level-11 tiles and stored in [GeoBuf](https://github.com/mapbox/geobuf) files, each
43 | in a directory for its source, a subdirectory for its x coordinate, and a file named for its y coordinate. For example, `us-census-2012/342/815.pbf.gz`
44 | might contain US LODES data and decennial census data for southeastern Goleta, CA.
45 | 
46 | Enumeration units that fall into two tiles are included in both tiles. It is the responsibility
47 | of the data consumer to deduplicate them; this can be done based on IDs (see the sketch at the end of this README). An enumeration unit that is
48 | duplicated across tiles must have the same integer ID in both tiles.
49 | 
50 | We have already loaded LODES data from 2013, 2014, 2015, and 2017 in the S3 buckets `lodes-data`, `lodes-data-2014`, `lodes-data-2015`, etc.
51 | These buckets and their contents are publicly readable and requester-pays (i.e. accessing them will incur fees on your AWS account).
52 | The 2013 data lack Massachusetts and use 2011 data for Kansas, due to data availability.
53 | The 2014 and 2015 data do not have these problems.
54 | The 2017 data exclude federal employees and use 2016 data for Alaska and South Dakota. See the LODES Technical Documentation for details.
55 | 
56 | ## Use in Conveyal Analysis
57 | 
58 | Any dataset that can be placed in this format can be used in [Conveyal Analysis](https://github.com/conveyal/analysis-ui).
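59 | 
60 | ## Reading tiles and extracts
61 | 
62 | As noted above, a consumer must deduplicate enumeration units by numeric ID. The sketch below reads an extract
63 | produced by CensusExtractor using the geobuf-java classes this project already depends on; the file name and the
64 | property looked up at the end are placeholders. (Extracts are plain geobuf; the stored tiles are additionally
65 | gzipped, so reading them directly would require a GZIPInputStream.)
66 | 
67 |     import com.conveyal.data.geobuf.GeobufDecoder;
68 |     import com.conveyal.data.geobuf.GeobufFeature;
69 | 
70 |     import java.io.FileInputStream;
71 |     import java.util.HashMap;
72 |     import java.util.Map;
73 | 
74 |     public class ReadExtract {
75 |         public static void main (String... args) throws Exception {
76 |             GeobufDecoder decoder = new GeobufDecoder(new FileInputStream("out.pbf"));
77 |             // blocks that sat on a tile edge appear once per tile; keep the first copy of each ID
78 |             Map<Long, GeobufFeature> features = new HashMap<>();
79 |             while (decoder.hasNext()) {
80 |                 GeobufFeature feat = decoder.next();
81 |                 features.putIfAbsent(feat.numericId, feat);
82 |             }
83 |             // properties carry the human-readable LODES column names assigned by LodesSource
84 |             for (GeobufFeature feat : features.values())
85 |                 System.out.println(feat.numericId + ": " + feat.properties.get("Jobs total"));
86 |         }
87 |     }
88 | 
89 | Tile indices are standard slippy-map tile numbers at zoom 11 (see ShapeDataStore.lon2tile and lat2tile). As a
90 | worked check against the Goleta example above: x = floor((-119.80 + 180) / 360 * 2^11) = 342 and
91 | y = floor((1 - asinh(tan(34.42°)) / π) / 2 * 2^11) = 815, which is exactly tile `342/815.pbf.gz`.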
-------------------------------------------------------------------------------- /src/main/java/com/conveyal/data/census/CensusExtractor.java: --------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 | 
3 | import com.conveyal.data.geobuf.GeobufEncoder;
4 | import com.conveyal.data.geobuf.GeobufFeature;
5 | import com.conveyal.geojson.GeoJsonModule;
6 | import com.fasterxml.jackson.databind.ObjectMapper;
7 | import org.locationtech.jts.geom.Geometry;
8 | import org.locationtech.jts.geom.GeometryCollection;
9 | import org.locationtech.jts.geom.GeometryFactory;
10 | import org.locationtech.jts.geom.Polygon;
11 | 
12 | import java.io.*;
13 | import java.util.List;
14 | import java.util.Map;
15 | 
16 | /**
17 |  * Extract Census data from a seamless datastore.
18 |  */
19 | public class CensusExtractor {
20 |     /**
21 |      * The precision to use for output files.
22 |      * Set above 6 at your own risk; higher-precision files work fine with the reference implementation and with geobuf-java,
23 |      * but break with pygeobuf (see https://github.com/mapbox/pygeobuf/issues/21)
24 |      */
25 |     private static final int PRECISION = 6;
26 | 
27 |     public static void main (String... args) throws IOException {
28 |         if (args.length < 3 || args.length > 6) {
29 |             System.err.println("usage: CensusExtractor (s3://bucket|data_dir) n e s w [outfile.pbf]");
30 |             System.err.println("  or:  CensusExtractor (s3://bucket|data_dir) boundary.geojson [outfile.pbf]");
31 |             return;
32 |         }
33 | 
34 |         SeamlessSource source;
35 |         if (!args[0].startsWith("s3://"))
36 |             source = new FileSeamlessSource(args[0]);
37 |         else
38 |             source = new S3SeamlessSource(args[0].substring(5));
39 | 
40 |         long start = System.currentTimeMillis();
41 | 
42 |         Map<Long, GeobufFeature> features;
43 | 
44 |         if (args.length >= 4) {
45 |             features = source.extract(Double.parseDouble(args[1]),
46 |                     Double.parseDouble(args[2]),
47 |                     Double.parseDouble(args[3]),
48 |                     Double.parseDouble(args[4]),
49 |                     false
50 |             );
51 |         }
52 |         else {
53 |             // read geojson boundary
54 |             ObjectMapper om = new ObjectMapper();
55 |             om.registerModule(new GeoJsonModule());
56 |             FileInputStream fis = new FileInputStream(new File(args[1]));
57 |             FeatureCollection fc = om.readValue(fis, FeatureCollection.class);
58 |             fis.close();
59 | 
60 |             features = source.extract(fc.features.get(0).geometry, false);
61 |         }
62 | 
63 |         OutputStream out;
64 | 
65 |         long completeTime = System.currentTimeMillis() - start;
66 |         System.err.println("Read " + features.size() + " features in " + completeTime + "msec");
67 | 
68 |         if (args.length == 6)
69 |             out = new FileOutputStream(new File(args[5]));
70 |         else if (args.length == 3)
71 |             out = new FileOutputStream(new File(args[2]));
72 |         else
73 |             out = System.out;
74 | 
75 |         GeobufEncoder encoder = new GeobufEncoder(out, PRECISION);
76 |         encoder.writeFeatureCollection(features.values());
77 |         encoder.close();
78 | 
79 |         if (out instanceof FileOutputStream)
80 |             out.close();
81 |     }
82 | 
83 |     // rudimentary geojson classes to deserialize a feature collection
84 | 
85 |     public static class FeatureCollection {
86 |         public String type;
87 |         public Map<String, Object> crs;
88 |         public List<Feature> features;
89 |     }
90 | 
91 |     public static class Feature {
92 |         public String type;
93 |         public Map<String, Object> properties;
94 |         public Geometry geometry;
95 |     }
96 | }
97 | 
-------------------------------------------------------------------------------- /src/main/java/com/conveyal/data/census/SeamlessSource.java: --------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 | 
3 | import com.conveyal.data.geobuf.GeobufDecoder;
4 | import com.conveyal.data.geobuf.GeobufFeature;
5 | import org.locationtech.jts.geom.*;
6 | import org.locationtech.jts.geom.prep.PreparedPolygon;
7 | import org.locationtech.jts.util.GeometricShapeFactory;
8 | import org.mapdb.DBMaker;
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 | 
12 | import java.io.BufferedInputStream;
13 | import java.io.IOException;
14 | import java.io.InputStream;
15 | import java.util.HashMap;
16 | import java.util.Map;
17 | import java.util.zip.GZIPInputStream;
18 | 
19 | import static com.conveyal.data.census.ShapeDataStore.lat2tile;
20 | import static com.conveyal.data.census.ShapeDataStore.lon2tile;
21 | 
22 | /**
23 |  * A tile source for seamless Census extracts
24 |  */
25 | public abstract class SeamlessSource {
26 |     // convenience
27 |     private static final int ZOOM_LEVEL = ShapeDataStore.ZOOM_LEVEL;
28 | 
29 |     protected static final Logger LOG = LoggerFactory.getLogger(SeamlessSource.class);
30 | 
31 |     private static final GeometryFactory geometryFactory = new GeometryFactory();
32 | 
33 |     /** Extract features by bounding box */
34 |     public Map<Long, GeobufFeature> extract(double north, double east, double south, double west, boolean onDisk) throws IOException {
35 |         GeometricShapeFactory factory = new GeometricShapeFactory(geometryFactory);
36 |         factory.setCentre(new Coordinate((east + west) / 2, (north + south) / 2));
37 |         factory.setWidth(east - west);
38 |         factory.setHeight(north - south);
39 |         Polygon rect = factory.createRectangle();
40 |         return extract(rect, onDisk);
41 |     }
42 | 
43 |     /** Extract features within an arbitrary polygon */
44 |     public Map<Long, GeobufFeature> extract(Geometry bounds, boolean onDisk) throws IOException {
45 |         Map<Long, GeobufFeature> ret;
46 | 
47 |         if (onDisk)
48 |             ret = DBMaker.tempTreeMap();
49 |         else
50 |             ret = new HashMap<>();
51 | 
52 |         Envelope env = bounds.getEnvelopeInternal();
53 |         double west = env.getMinX(), east = env.getMaxX(), north = env.getMaxY(), south = env.getMinY();
54 | 
55 |         // TODO: use prepared polygons
56 | 
57 |         // figure out how many tiles we're requesting
58 |         int minX = lon2tile(west, ZOOM_LEVEL), maxX = lon2tile(east, ZOOM_LEVEL),
59 |                 minY = lat2tile(north, ZOOM_LEVEL), maxY = lat2tile(south, ZOOM_LEVEL);
60 | 
61 |         int tcount = (maxX - minX + 1) * (maxY - minY + 1);
62 | 
63 |         LOG.info("Requesting {} tiles", tcount);
64 | 
65 |         int fcount = 0;
66 | 
67 |         // read all the relevant tiles
68 |         for (int x = minX; x <= maxX; x++) {
69 |             for (int y = minY; y <= maxY; y++) {
70 |                 InputStream is = getInputStream(x, y);
71 | 
72 |                 if (is == null)
73 |                     // no data in this tile
74 |                     continue;
75 | 
76 |                 // the decoder closes the input stream as soon as it has read the tile
77 |                 GeobufDecoder decoder = new GeobufDecoder(new GZIPInputStream(new BufferedInputStream(is)));
78 | 
79 |                 while (decoder.hasNext()) {
80 |                     GeobufFeature f = decoder.next();
81 |                     // blocks are duplicated at the edges of tiles, no need to import twice
82 |                     if (ret.containsKey(f.numericId))
83 |                         continue;
84 | 
85 |                     if (!bounds.disjoint(f.geometry)) {
86 |                         ret.put(f.numericId, f);
87 |                         fcount++;
88 | 
89 |                         if (fcount % 1000 == 0)
90 |                             LOG.info("Read {} features", fcount);
91 |                     }
92 |                 }
93 |             }
94 |         }
95 | 
96 |         return ret;
97 |     }
98 | 
99 |     /** get an input stream for the given tile */
100 |     protected abstract InputStream getInputStream(int x, int y) throws IOException;
101 | }
102 | 
-------------------------------------------------------------------------------- /downloadData.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Download the data we need for a particular [set of] states, or the entire country
3 | # usage: downloadData.py outDir state_abbr [state_abbr . . .]
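# e.g.: python3 downloadData.py temporary_dir DC MD VA WV DE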
4 | # Or use special code 'ALL' 5 | 6 | from sys import argv 7 | from urllib.request import urlretrieve 8 | import zipfile 9 | import os 10 | import os.path 11 | from shutil import copyfileobj 12 | from time import sleep 13 | 14 | # map from state abbreviations to FIPS codes 15 | # per http://www.epa.gov/enviro/html/codes/state.html 16 | fipsCodes = dict ( 17 | AK = "02", # ALASKA 18 | AL = "01", # ALABAMA 19 | AR = "05", # ARKANSAS 20 | AS = "60", # AMERICAN SAMOA 21 | AZ = "04", # ARIZONA 22 | CA = "06", # CALIFORNIA 23 | CO = "08", # COLORADO 24 | CT = "09", # CONNECTICUT 25 | DC = "11", # DISTRICT OF COLUMBIA 26 | DE = "10", # DELAWARE 27 | FL = "12", # FLORIDA 28 | GA = "13", # GEORGIA 29 | GU = "66", # GUAM 30 | HI = "15", # HAWAII 31 | IA = "19", # IOWA 32 | ID = "16", # IDAHO 33 | IL = "17", # ILLINOIS 34 | IN = "18", # INDIANA 35 | KS = "20", # KANSAS 36 | KY = "21", # KENTUCKY 37 | LA = "22", # LOUISIANA 38 | MA = "25", # MASSACHUSETTS 39 | MD = "24", # MARYLAND 40 | ME = "23", # MAINE 41 | MI = "26", # MICHIGAN 42 | MN = "27", # MINNESOTA 43 | MO = "29", # MISSOURI 44 | MS = "28", # MISSISSIPPI 45 | MT = "30", # MONTANA 46 | NC = "37", # NORTH CAROLINA 47 | ND = "38", # NORTH DAKOTA 48 | NE = "31", # NEBRASKA 49 | NH = "33", # NEW HAMPSHIRE 50 | NJ = "34", # NEW JERSEY 51 | NM = "35", # NEW MEXICO 52 | NV = "32", # NEVADA 53 | NY = "36", # NEW YORK 54 | OH = "39", # OHIO 55 | OK = "40", # OKLAHOMA 56 | OR = "41", # OREGON 57 | PA = "42", # PENNSYLVANIA 58 | PR = "72", # PUERTO RICO 59 | RI = "44", # RHODE ISLAND 60 | SC = "45", # SOUTH CAROLINA 61 | SD = "46", # SOUTH DAKOTA 62 | TN = "47", # TENNESSEE 63 | TX = "48", # TEXAS 64 | UT = "49", # UTAH 65 | VA = "51", # VIRGINIA 66 | VI = "78", # VIRGIN ISLANDS 67 | VT = "50", # VERMONT 68 | WA = "53", # WASHINGTON 69 | WI = "55", # WISCONSIN 70 | WV = "54", # WEST VIRGINIA 71 | WY = "56", # WYOMING 72 | ) 73 | 74 | # parse arguments 75 | outDir = argv[1] 76 | states = [state.upper() for state in argv[2:]] 77 | 78 | if len(states) == 1 and states[0] == 'ALL': 79 | # download all states 80 | print("Downloading all states") 81 | states = fipsCodes.keys() 82 | 83 | # check inputs 84 | invalidStates = [state for state in states if not state in fipsCodes] 85 | 86 | if len(invalidStates) > 0: 87 | print ("Did not recognize states %s" % ' '.join(invalidStates)) 88 | 89 | # make the directory structure 90 | os.makedirs(os.path.join(outDir, 'tiger')) 91 | os.makedirs(os.path.join(outDir, 'jobs')) 92 | os.makedirs(os.path.join(outDir, 'workforce')) 93 | 94 | # download resiliently 95 | def retrieve(url, path): 96 | for i in range(50): 97 | try: 98 | print("download attempt {0}".format(i)) 99 | urlretrieve(url, path) 100 | except: 101 | print("error retrieving {0}, retrying".format(url)) 102 | sleep(5) 103 | else: 104 | break 105 | 106 | 107 | # download the states 108 | for state in states: 109 | print('processing %s' % state) 110 | print('Downloading TIGER') 111 | # get tiger 112 | fips = fipsCodes[state] 113 | zipout = os.path.join(outDir, "tiger", "{0}.zip".format(state)) 114 | retrieve("ftp://ftp2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/tl_2010_{0}_tabblock10.zip".format(fips), zipout) 115 | 116 | # unzip it 117 | # adapted from http://stackoverflow.com/questions/12886768/ 118 | with zipfile.ZipFile(zipout) as zf: 119 | for member in zf.infolist(): 120 | name = os.path.split(member.filename)[-1] 121 | dest = os.path.join(outDir, 'tiger', name) 122 | with zf.open(member) as stream: 123 | with open(dest, 'wb') as out: 124 | 
copyfileobj(stream, out)
125 | 
126 |     # we no longer need the zipfile
127 |     os.remove(zipout)
128 | 
129 |     print('Done with TIGER')
130 | 
131 |     print('Downloading LODES data')
132 | 
133 |     # figure out the year of the latest available data
134 |     # Most states have 2017 data available
135 |     # see http://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.4.pdf, page 2f
136 |     year = 2017
137 | 
138 |     # Alaska and South Dakota do not have LODES 2017 data available, so use 2016
139 |     if state == 'AK' or state == 'SD':
140 |         year = 2016
141 |     elif state == 'PR' or state == 'VI':
142 |         print('{0} does not have LODES data available'.format(state))
143 |         year = 0
144 | 
145 |     if year:
146 |         print("Downloading {0} LODES data for {1}".format(year, state))
147 | 
148 |         # get the rac (residence area characteristics) file
149 |         out = os.path.join(outDir, 'workforce', '{0}_{1}_rac.csv.gz'.format(state, year))
150 |         retrieve("http://lehd.ces.census.gov/data/lodes/LODES7/{0}/rac/{0}_rac_S000_JT00_{1}.csv.gz".format(state.lower(), year), out)
151 | 
152 |         # get the wac (workplace area characteristics) file
153 |         out = os.path.join(outDir, 'jobs', '{0}_{1}_wac.csv.gz'.format(state, year))
154 |         retrieve("http://lehd.ces.census.gov/data/lodes/LODES7/{0}/wac/{0}_wac_S000_JT00_{1}.csv.gz".format(state.lower(), year), out)
155 | 
156 |     print('Done with {0}'.format(state))
157 | 
-------------------------------------------------------------------------------- /src/main/java/com/conveyal/data/census/LodesSource.java: --------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 | 
3 | import com.conveyal.data.geobuf.GeobufFeature;
4 | import com.csvreader.CsvReader;
5 | 
6 | import java.io.*;
7 | import java.util.HashMap;
8 | import java.util.Map;
9 | import java.util.zip.GZIPInputStream;
10 | 
11 | /**
12 |  * Data source for LODES data.
13 | */ 14 | public class LodesSource { 15 | private File input; 16 | private LodesType type; 17 | 18 | public LodesSource(File input, LodesType type) { 19 | this.input = input; 20 | this.type = type; 21 | } 22 | 23 | public void load(ShapeDataStore store) throws Exception { 24 | InputStream csv = new GZIPInputStream(new BufferedInputStream(new FileInputStream(input))); 25 | CsvReader reader = new CsvReader(new InputStreamReader(csv)); 26 | 27 | // rename the columns to something useful 28 | //http://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.1.pdf#page=7&zoom=auto,-266,580 29 | Map colNames = new HashMap<>(); 30 | colNames.put("C000", "total"); 31 | 32 | colNames.put("CA01", "age 29 or younger"); 33 | colNames.put("CA02", "age 30 to 54"); 34 | colNames.put("CA03", "age 55 or older"); 35 | 36 | colNames.put("CE01", "with earnings $1250 per month or less"); 37 | colNames.put("CE02", "with earnings $1251 - $3333 per month"); 38 | colNames.put("CE03", "with earnings greater than $3333 per month"); 39 | 40 | colNames.put("CNS01", "in agriculture, forestry, fishing and hunting"); 41 | colNames.put("CNS02", "in mining, quarrying, and oil and gas extraction"); 42 | colNames.put("CNS03", "in utilities"); 43 | colNames.put("CNS04", "in construction"); 44 | colNames.put("CNS05", "in manufacturing"); 45 | colNames.put("CNS06", "in wholesale trade"); 46 | colNames.put("CNS07", "in retail trade"); 47 | colNames.put("CNS08", "in transportation and warehousing"); 48 | colNames.put("CNS09", "in information"); 49 | colNames.put("CNS10", "in finance and insurance"); 50 | colNames.put("CNS11", "in real estate"); 51 | colNames.put("CNS12", "in professional, scientific and technical services"); 52 | colNames.put("CNS13", "in management"); 53 | colNames.put("CNS14", "in administration, support, and waste management"); 54 | colNames.put("CNS15", "in educational services"); 55 | colNames.put("CNS16", "in healthcare and social assistance"); 56 | colNames.put("CNS17", "in arts, entertainment and recreation"); 57 | colNames.put("CNS18", "in accommodation and food services"); 58 | colNames.put("CNS19", "in other services, except public administration"); 59 | colNames.put("CNS20", "in public administration"); 60 | 61 | colNames.put("CR01", "with race White alone"); 62 | colNames.put("CR02", "with race Black or African American alone"); 63 | colNames.put("CR03", "with race American Indian or Alaska Native alone"); 64 | colNames.put("CR04", "with race Asian alone"); 65 | colNames.put("CR05", "with race Native Hawaiian or Other Pacific Islander alone"); 66 | colNames.put("CR07", "with two or more racial groups"); 67 | 68 | colNames.put("CT01", "not Hispanic or Latino"); 69 | colNames.put("CT02", "Hispanic or Latino"); 70 | 71 | colNames.put("CD01", "with less than high school education"); 72 | colNames.put("CD02", "with high school education, no college"); 73 | colNames.put("CD03", "with some college education or Associate degree"); 74 | colNames.put("CD04", "with Bachelor's degree or higher"); 75 | colNames.put("CS01", "male"); 76 | colNames.put("CS02", "female"); 77 | 78 | // only in workplace characteristics 79 | colNames.put("CFA01", "at firms aged 0-1 years"); 80 | colNames.put("CFA02", "at firms aged 2-3 years"); 81 | colNames.put("CFA03", "at firms aged 4-5 years"); 82 | colNames.put("CFA04", "at firms aged 6-10 years"); 83 | colNames.put("CFA05", "at firms aged 11 or more years"); 84 | 85 | colNames.put("CFS01", "at firms with 0-19 employees"); 86 | colNames.put("CFS02", "at firms with 20-49 employees"); 
87 |         colNames.put("CFS03", "at firms with 50-249 employees");
88 |         colNames.put("CFS04", "at firms with 250-499 employees");
89 |         colNames.put("CFS05", "at firms with 500 or more employees");
90 |         colNames.put("createdate", "Data creation date");
91 | 
92 |         reader.readHeaders();
93 |         String[] headers = reader.getHeaders();
94 | 
95 |         // read the file
96 |         while (reader.readRecord()) {
97 |             long id = Long.parseLong(reader.get(type == LodesType.WORKPLACE ? "w_geocode" : "h_geocode"));
98 |             GeobufFeature feat = store.get(id);
99 | 
100 |             String[] line = reader.getValues();
101 |             for (int i = 0; i < line.length; i++) {
102 |                 String col = headers[i];
103 | 
104 |                 if (!colNames.containsKey(col))
105 |                     continue;
106 | 
107 |                 String colName;
108 | 
109 |                 if (type == LodesType.WORKPLACE) {
110 |                     if (col.startsWith("CR") || col.startsWith("CD") || col.startsWith("CA"))
111 |                         colName = "Jobs employing workers " + colNames.get(col);
112 |                     else if (col.startsWith("CS"))
113 |                         colName = "Jobs employing " + colNames.get(col) + "s";
114 |                     else if (col.startsWith("CT"))
115 |                         colName = "Jobs employing " + colNames.get(col) + " workers";
116 |                     else
117 |                         colName = "Jobs " + colNames.get(col);
118 |                 }
119 |                 else if (type == LodesType.RESIDENCE) {
120 |                     if (col.startsWith("CT") || col.startsWith("CS"))
121 |                         colName = "Workers, " + colNames.get(col);
122 |                     else
123 |                         colName = "Workers " + colNames.get(col);
124 |                 }
125 |                 else {
126 |                     throw new IllegalArgumentException("Invalid LODES type");
127 |                 }
128 | 
129 |                 feat.properties.put(colName, Integer.parseInt(line[i]));
130 |             }
131 | 
132 |             store.put(feat);
133 |         }
134 | 
135 |         reader.close();
136 |     }
137 | 
138 |     /** supported lodes types are workplace area characteristics and residence area characteristics */
139 |     public enum LodesType {
140 |         WORKPLACE, RESIDENCE
141 |     }
142 | }
143 | 
-------------------------------------------------------------------------------- /pom.xml: --------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 | 
6 |     <groupId>com.conveyal.data.census</groupId>
7 |     <artifactId>seamless-census</artifactId>
8 |     <version>1.2-SNAPSHOT</version>
9 | 
10 |     <properties>
11 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
12 |     </properties>
13 | 
14 |     <dependencies>
15 |         <dependency> <groupId>org.mapdb</groupId> <artifactId>mapdb</artifactId> <version>2.0-beta6</version> </dependency>
16 |         <dependency> <groupId>com.conveyal</groupId> <artifactId>jackson2-geojson</artifactId> <version>0.9</version> </dependency>
17 |         <dependency> <groupId>com.google.protobuf</groupId> <artifactId>protobuf-java</artifactId> <version>2.6.1</version> </dependency>
18 |         <dependency> <groupId>com.conveyal</groupId> <artifactId>geobuf-java</artifactId> <version>1.1</version> </dependency>
19 |         <dependency> <groupId>net.sf.trove4j</groupId> <artifactId>trove4j</artifactId> <version>3.0.3</version> </dependency>
20 |         <dependency> <groupId>net.sourceforge.javacsv</groupId> <artifactId>javacsv</artifactId> <version>2.0</version> </dependency>
21 |         <dependency> <groupId>com.fasterxml.jackson.core</groupId> <artifactId>jackson-databind</artifactId> <version>2.8.11.1</version> </dependency>
22 |         <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> <version>18.0</version> </dependency>
23 |         <dependency> <groupId>com.amazonaws</groupId> <artifactId>aws-java-sdk-s3</artifactId> <version>1.11.341</version> </dependency>
24 |         <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.12</version> <scope>test</scope> </dependency>
25 |     </dependencies>
26 | 
27 |     <repositories>
28 |         <repository>
29 |             <id>conveyal</id>
30 |             <name>Conveyal Maven Repository</name>
31 |             <url>https://s3.amazonaws.com/maven.conveyal.com/</url>
32 |         </repository>
33 |     </repositories>
34 | 
35 |     <build>
36 |         <plugins>
37 |             <plugin>
38 |                 <groupId>org.apache.maven.plugins</groupId>
39 |                 <artifactId>maven-compiler-plugin</artifactId>
40 |                 <version>3.8.1</version>
41 |                 <configuration> <release>11</release> </configuration>
42 |             </plugin>
43 |             <plugin>
44 |                 <groupId>org.apache.maven.plugins</groupId>
45 |                 <artifactId>maven-jar-plugin</artifactId>
46 |                 <version>3.1.2</version>
47 |                 <configuration>
48 |                     <archive>
49 |                         <manifestEntries> <Automatic-Module-Name>com.conveyal.data.census</Automatic-Module-Name> </manifestEntries>
50 |                     </archive>
51 |                 </configuration>
52 |             </plugin>
53 |             <plugin>
54 |                 <groupId>org.apache.maven.plugins</groupId>
55 |                 <artifactId>maven-shade-plugin</artifactId>
56 |                 <version>3.2.1</version>
57 |                 <executions>
58 |                     <execution>
59 |                         <phase>package</phase>
60 |                         <goals> <goal>shade</goal> </goals>
61 |                         <configuration>
62 |                             <finalName>seamless-census</finalName>
63 |                             <transformers>
64 |                                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
65 |                                     <mainClass>com.conveyal.data.census.CensusExtractor</mainClass>
66 |                                 </transformer>
67 |                                 <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
68 |                             </transformers>
69 |                             <filters>
70 |                                 <filter>
71 |                                     <artifact>*:*</artifact>
72 |                                     <excludes> <exclude>META-INF/*.SF</exclude> <exclude>META-INF/*.DSA</exclude> <exclude>META-INF/*.RSA</exclude> </excludes>
73 |                                 </filter>
74 |                             </filters>
75 |                         </configuration>
76 |                     </execution>
77 |                 </executions>
78 |             </plugin>
79 |         </plugins>
80 | 
81 |         <extensions>
82 |             <extension>
83 |                 <groupId>org.springframework.build</groupId>
84 |                 <artifactId>aws-maven</artifactId>
85 |                 <version>5.0.0.RELEASE</version>
86 |             </extension>
87 |             <extension>
88 |                 <groupId>org.kuali.maven.wagons</groupId>
89 |                 <artifactId>maven-s3-wagon</artifactId>
90 |                 <version>1.2.1</version>
91 |             </extension>
92 |         </extensions>
93 |     </build>
94 | 
95 |     <distributionManagement>
96 |         <repository>
97 |             <id>conveyal-maven-repo</id>
98 |             <name>Conveyal Maven Repository</name>
99 |             <url>s3://maven.conveyal.com/</url>
100 |         </repository>
101 |     </distributionManagement>
102 | </project>
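103 | <!-- A sketch of building and running the shaded jar produced above (CensusExtractor is its main class):
104 |        mvn package
105 |        java -Xmx[several]G -jar target/seamless-census.jar s3://bucket_name n e s w out.pbf -->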
-------------------------------------------------------------------------------- /src/main/java/com/conveyal/data/census/ShapeDataStore.java: --------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 | 
3 | import com.amazonaws.services.s3.AmazonS3;
4 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
5 | import com.amazonaws.services.s3.model.ObjectMetadata;
6 | import com.conveyal.data.geobuf.GeobufEncoder;
7 | import com.conveyal.data.geobuf.GeobufFeature;
8 | import org.locationtech.jts.geom.Envelope;
9 | import org.mapdb.*;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 | 
13 | import java.io.*;
14 | import java.util.ArrayList;
15 | import java.util.List;
16 | import java.util.NavigableSet;
17 | import java.util.concurrent.*;
18 | import java.util.function.BiFunction;
19 | import java.util.zip.GZIPOutputStream;
20 | 
21 | /**
22 |  * Store geographic data by ID, with index by zoom-11 tile.
23 |  */
24 | public class ShapeDataStore {
25 |     public static final int ZOOM_LEVEL = 11;
26 | 
27 |     private static final Logger LOG = LoggerFactory.getLogger(ShapeDataStore.class);
28 | 
29 |     /** number of decimal places of precision to store */
30 |     public static final int PRECISION = 12;
31 | 
32 |     private DB db;
33 | 
34 |     /**
35 |      * set of Object[] { x, y, numericId } tuples indexing features by zoom-11 tile
36 |      */
37 |     private NavigableSet<Object[]> tiles;
38 | 
39 |     /**
40 |      * Map from geoid to feature
41 |      */
42 |     private BTreeMap<Long, GeobufFeature> features;
43 | 
44 |     public ShapeDataStore() {
45 |         db = DBMaker.tempFileDB().deleteFilesAfterClose()
46 |                 .transactionDisable()
47 |                 .fileMmapEnable()
48 |                 .asyncWriteEnable()
49 |                 .asyncWriteFlushDelay(1000)
50 |                 .executorEnable()
51 |                 .asyncWriteQueueSize(10000)
52 |                 // start with 1GB
53 |                 .allocateStartSize(1024 * 1024 * 1024)
54 |                 // and bump by 512MB
55 |                 .allocateIncrement(512 * 1024 * 1024)
56 |                 .make();
57 | 
58 |         features = db.treeMapCreate("features")
59 |                 .keySerializer(BTreeKeySerializer.LONG)
60 |                 .valueSerializer(new GeobufEncoder.GeobufFeatureSerializer(PRECISION))
61 |                 .counterEnable()
62 |                 .make();
63 | 
64 |         tiles = db.treeSetCreate("tiles")
65 |                 .serializer(BTreeKeySerializer.ARRAY3)
66 |                 .make();
67 | 
68 |         // bind the map by tile
69 |         features.modificationListenerAdd((id, feat0, feat1) -> {
70 |             if (feat0 != null)
71 |                 // updates never change geometry, and there are no deletes
72 |                 return;
73 | 
74 |             // figure out which z11 tiles this is part of
75 |             Envelope e = feat1.geometry.getEnvelopeInternal();
76 |             for (int x = lon2tile(e.getMinX(), ZOOM_LEVEL); x <= lon2tile(e.getMaxX(), ZOOM_LEVEL); x++) {
77 |                 for (int y = lat2tile(e.getMaxY(), ZOOM_LEVEL); y <= lat2tile(e.getMinY(), ZOOM_LEVEL); y++) {
78 |                     tiles.add(new Object[]{x, y, feat1.numericId});
79 |                 }
80 |             }
81 |         });
82 |     }
83 | 
84 |     public void add(GeobufFeature feature) {
85 |         if (this.features.containsKey(feature.numericId))
86 |             throw new IllegalArgumentException("ID " + feature.numericId + " already present in store");
87 |         this.features.put(feature.numericId, feature);
88 | 
89 |         if (this.features.size() % 10000 == 0)
90 |             LOG.info("Loaded {} features", this.features.size());
91 |     }
92 | 
93 |     /** Get the x index of the tile containing the given longitude */
94 |     public static int lon2tile (double lon, int zoom) {
95 |         // recenter
96 |         lon += 180;
97 | 
98 |         // scale
99 |         return (int) (lon * Math.pow(2, zoom) / 360);
100 |     }
101 | 
102 |     public void close () {
103 |         db.close();
104 |     }
105 | 
106 |     /** Get the y index of the tile containing the given latitude */
107 |     public static int lat2tile (double lat, int zoom) {
108 |         // http://wiki.openstreetmap.org/wiki/Slippy_map_tilenames
109 |         lat = Math.toRadians(lat);
110 |         lat = Math.log(Math.tan(lat) + 1 / Math.cos(lat));
111 | 
112 |         return (int) ((1 - lat / Math.PI) / 2 * Math.pow(2, zoom));
113 |     }
114 | 
115 |     /** Write GeoBuf tiles to a directory */
116 |     public void writeTiles (File file) throws IOException {
117 |         writeTilesInternal((x, y) -> {
118 |             // write out the features
119 |             File dir = new File(file, "" + x);
120 |             File out = new File(dir, y + ".pbf.gz");
121 |             dir.mkdirs();
122 |             return new FileOutputStream(out);
123 |         });
124 |     }
125 | 
126 |     /** Write GeoBuf tiles to S3 */
127 |     public void writeTilesToS3 (String bucketName) throws IOException {
128 |         // set up an upload thread
129 |         ExecutorService executor = Executors.newSingleThreadExecutor();
130 | 
131 |         // initialize an S3 client
132 |         AmazonS3 s3 =
133 |                 AmazonS3ClientBuilder.standard().build();
134 |         try {
135 |             writeTilesInternal((x, y) -> {
136 |                 PipedInputStream is = new PipedInputStream();
137 |                 PipedOutputStream os = new PipedOutputStream(is);
138 |                 ObjectMetadata metadata = new ObjectMetadata();
139 |                 metadata.setContentType("application/gzip");
140 | 
141 |                 // perform the upload in its own thread so it doesn't deadlock
142 |                 executor.execute(() -> s3.putObject(bucketName, String.format("%d/%d.pbf.gz", x, y), is, metadata));
143 |                 return os;
144 |             });
145 |         } finally {
146 |             // allow the JVM to exit
147 |             executor.shutdown();
148 |             try {
149 |                 executor.awaitTermination(1, TimeUnit.HOURS);
150 |             } catch (InterruptedException e) {
151 |                 LOG.error("Interrupted while waiting for S3 uploads to finish");
152 |             }
153 |         }
154 |     }
155 | 
156 |     /**
157 |      * generic write tiles function; calls the given function with x and y indices to get an output stream, which it closes itself.
158 |      * The Internal suffix is because lambdas in Java get confused with overloaded functions.
159 |      */
160 |     private void writeTilesInternal(TileOutputStreamProducer outputStreamForTile) throws IOException {
161 |         int lastx = -1, lasty = -1, tileCount = 0;
162 | 
163 |         List<GeobufFeature> featuresThisTile = new ArrayList<>();
164 | 
165 |         for (Object[] val : tiles) {
166 |             int x = (Integer) val[0];
167 |             int y = (Integer) val[1];
168 |             long id = (Long) val[2];
169 | 
170 |             if (x != lastx || y != lasty) {
171 |                 if (!featuresThisTile.isEmpty()) {
172 |                     LOG.debug("x: {}, y: {}, {} features", lastx, lasty, featuresThisTile.size());
173 |                     GeobufEncoder enc = new GeobufEncoder(new GZIPOutputStream(new BufferedOutputStream(outputStreamForTile.apply(lastx, lasty))), PRECISION);
174 |                     enc.writeFeatureCollection(featuresThisTile);
175 |                     enc.close();
176 |                     featuresThisTile.clear();
177 | 
178 |                     tileCount++;
179 |                 }
180 |             }
181 | 
182 |             featuresThisTile.add(features.get(id));
183 | 
184 |             lastx = x;
185 |             lasty = y;
186 |         }
187 | 
188 |         // flush the final tile, which the loop above never revisits
189 |         if (!featuresThisTile.isEmpty()) {
190 |             GeobufEncoder enc = new GeobufEncoder(new GZIPOutputStream(new BufferedOutputStream(outputStreamForTile.apply(lastx, lasty))), PRECISION);
191 |             enc.writeFeatureCollection(featuresThisTile);
192 |             enc.close();
193 |             tileCount++;
194 |         }
195 | 
196 |         LOG.info("Wrote {} tiles", tileCount);
197 |     }
198 | 
199 |     /** get a feature */
200 |     public GeobufFeature get(long id) {
201 |         // protective copy, don't get entangled in mapdb async serialization
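        // (LodesSource.load mutates the feature returned here and put()s it back, so it must not alias the stored copy)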
202 |         return features.get(id).clone();
203 |     }
204 | 
205 |     /** put a feature that already exists */
206 |     public void put (GeobufFeature feat) {
207 |         if (!features.containsKey(feat.numericId))
208 |             throw new IllegalArgumentException("Feature does not exist in database!");
209 | 
210 |         features.put(feat.numericId, feat);
211 |     }
212 | 
213 |     @FunctionalInterface
214 |     private interface TileOutputStreamProducer {
215 |         OutputStream apply (int x, int y) throws IOException;
216 |     }
217 | }
218 | 
-------------------------------------------------------------------------------- /src/test/java/com/conveyal/data/census/IntegrationTest.java: --------------------------------------------------------------------------------
1 | package com.conveyal.data.census;
2 | 
3 | import com.conveyal.data.geobuf.GeobufDecoder;
4 | import com.conveyal.data.geobuf.GeobufFeature;
5 | import com.csvreader.CsvReader;
6 | import com.google.common.io.ByteStreams;
7 | import com.google.common.io.Files;
8 | import org.locationtech.jts.geom.Envelope;
9 | import gnu.trove.map.TLongObjectMap;
10 | import gnu.trove.map.hash.TLongObjectHashMap;
11 | import junit.framework.TestCase;
12 | import org.junit.Test;
13 | 
14 | import java.io.*;
15 | import java.util.Arrays;
16 | import java.util.HashSet;
17 | import java.util.Set;
18 | import java.util.zip.GZIPInputStream;
19 | import java.util.zip.ZipEntry;
20 | import java.util.zip.ZipInputStream;
21 | 
22 | /**
23 |  * Test loading, extracting, etc.
24 |  */
25 | public class IntegrationTest extends TestCase {
26 |     private CsvReader reader;
27 |     private TLongObjectMap<GeobufFeature> features;
28 | 
29 |     /** test loading and extracting from a small state (DC) */
30 |     @Test
31 |     public void testAll () throws Exception {
32 |         // extract the data
33 |         // note that the LODES data is fake; we replaced every value with a unique number to ensure that we detect
34 |         // swapped/incorrect column names; some columns are collinear in the original DC dataset (i.e. all zeros)
35 |         // so they wouldn't show up in tests if we swapped them.
36 |         // The python script in the resources directory alongside the data file takes a LODES CSV and replaces all the
37 |         // values in it with unique numbers.
38 | File dir = Files.createTempDir(); 39 | ZipInputStream zis = new ZipInputStream(getClass().getResourceAsStream("integrationTest.zip")); 40 | 41 | ZipEntry entry; 42 | while ((entry = zis.getNextEntry()) != null) { 43 | if (entry.isDirectory()) 44 | continue; 45 | 46 | File out = new File(dir, entry.getName()); 47 | out.getParentFile().mkdirs(); 48 | FileOutputStream fos = new FileOutputStream(out); 49 | ByteStreams.copy(zis, fos); 50 | fos.close(); 51 | } 52 | 53 | // load up the data 54 | CensusLoader.main(dir.getAbsolutePath()); 55 | 56 | // do an extract (this crosses a tile boundary) 57 | CensusExtractor.main(new File(dir, "tiles").getAbsolutePath(), "38.9872", "-77.0378", "38.9218", "-77.1086", new File(dir, "extract.pbf").getAbsolutePath()); 58 | 59 | // load the extract 60 | FileInputStream fis = new FileInputStream(new File(dir, "extract.pbf")); 61 | GeobufDecoder decoder = new GeobufDecoder(fis); 62 | 63 | assertTrue(decoder.hasNext()); 64 | 65 | Envelope envelope = new Envelope(-77.1086, -77.0378, 38.9218, 38.9872); 66 | 67 | features = new TLongObjectHashMap<>(); 68 | 69 | while (decoder.hasNext()) { 70 | GeobufFeature feat = decoder.next(); 71 | // TODO the extractor has a true geometric overlap check, not just an envelope check 72 | // so this check could be made more specific 73 | assertTrue(feat.geometry.getEnvelopeInternal().intersects(envelope)); 74 | 75 | features.put(feat.numericId, feat); 76 | assertNotSame(0, feat.numericId); 77 | } 78 | 79 | // > 1 to ensure all numeric IDs are not the same 80 | assertTrue(features.size() > 1); 81 | // random census block in NW DC 82 | assertTrue(features.containsKey(110010014023009L)); 83 | 84 | // read the workplace area characteristics csv 85 | InputStream csv = new GZIPInputStream(new FileInputStream(new File(new File(dir, "jobs"), "dc_wac_S000_JT00_2013.csv.gz"))); 86 | reader = new CsvReader(new InputStreamReader(csv)); 87 | reader.readHeaders(); 88 | 89 | // make sure we found a jobs entry 90 | boolean foundJobsEntry = false; 91 | 92 | String[] line; 93 | while (reader.readRecord()) { 94 | // make sure everything matches, and that we got the column name mappings correct 95 | foundJobsEntry = foundJobsEntry || check("Jobs at firms aged 0-1 years", "CFA01"); 96 | check("Jobs at firms aged 2-3 years", "CFA02"); 97 | check("Jobs at firms aged 4-5 years", "CFA03"); 98 | check("Jobs at firms aged 6-10 years", "CFA04"); 99 | check("Jobs at firms aged 11 or more years", "CFA05"); 100 | 101 | check("Jobs at firms with 0-19 employees", "CFS01"); 102 | check("Jobs at firms with 20-49 employees", "CFS02"); 103 | check("Jobs at firms with 50-249 employees", "CFS03"); 104 | check("Jobs at firms with 250-499 employees", "CFS04"); 105 | check("Jobs at firms with 500 or more employees", "CFS05"); 106 | 107 | check("Jobs employing Hispanic or Latino workers", "CT02"); 108 | check("Jobs employing not Hispanic or Latino workers", "CT01"); 109 | 110 | check("Jobs employing females", "CS02"); 111 | check("Jobs employing males", "CS01"); 112 | 113 | check("Jobs employing workers age 29 or younger", "CA01"); 114 | check("Jobs employing workers age 30 to 54", "CA02"); 115 | check("Jobs employing workers age 55 or older", "CA03"); 116 | 117 | check("Jobs employing workers with less than high school education", "CD01"); 118 | check("Jobs employing workers with high school education, no college", "CD02"); 119 | check("Jobs employing workers with some college education or Associate degree", "CD03"); 120 | check("Jobs employing workers with Bachelor's 
degree or higher", "CD04"); 121 | 122 | check("Jobs employing workers with race American Indian or Alaska Native alone", "CR03"); 123 | check("Jobs employing workers with race Asian alone", "CR04"); 124 | check("Jobs employing workers with race Black or African American alone", "CR02"); 125 | check("Jobs employing workers with race Native Hawaiian or Other Pacific Islander alone", "CR05"); 126 | check("Jobs employing workers with race White alone", "CR01"); 127 | check("Jobs employing workers with two or more racial groups", "CR07"); 128 | 129 | check("Jobs in accommodation and food services", "CNS18"); 130 | check("Jobs in administration, support, and waste management", "CNS14"); 131 | check("Jobs in agriculture, forestry, fishing and hunting", "CNS01"); 132 | check("Jobs in arts, entertainment and recreation", "CNS17"); 133 | check("Jobs in construction", "CNS04"); 134 | check("Jobs in educational services", "CNS15"); 135 | check("Jobs in finance and insurance", "CNS10"); 136 | check("Jobs in healthcare and social assistance", "CNS16"); 137 | check("Jobs in information", "CNS09"); 138 | check("Jobs in management", "CNS13"); 139 | check("Jobs in manufacturing", "CNS05"); 140 | check("Jobs in mining, quarrying, and oil and gas extraction", "CNS02"); 141 | check("Jobs in other services, except public administration", "CNS19"); 142 | check("Jobs in professional, scientific and technical services", "CNS12"); 143 | check("Jobs in public administration", "CNS20"); 144 | check("Jobs in real estate", "CNS11"); 145 | check("Jobs in retail trade", "CNS07"); 146 | check("Jobs in transportation and warehousing", "CNS08"); 147 | check("Jobs in utilities", "CNS03"); 148 | check("Jobs in wholesale trade", "CNS06"); 149 | 150 | check("Jobs total", "C000"); 151 | 152 | check("Jobs with earnings $1250 per month or less", "CE01"); 153 | check("Jobs with earnings $1251 - $3333 per month", "CE02"); 154 | check("Jobs with earnings greater than $3333 per month", "CE03"); 155 | } 156 | csv.close(); 157 | 158 | assertTrue(foundJobsEntry); 159 | 160 | // read the rac csv 161 | csv = new GZIPInputStream(new FileInputStream(new File(new File(dir, "workforce"), "dc_rac_S000_JT00_2013.csv.gz"))); 162 | reader = new CsvReader(new InputStreamReader(csv)); 163 | 164 | reader.readHeaders(); 165 | 166 | boolean foundWorkforceEntry = false; 167 | 168 | while (reader.readRecord()) { 169 | foundWorkforceEntry = foundWorkforceEntry || check("Workers age 29 or younger", "CA01"); 170 | check("Workers age 30 to 54", "CA02"); 171 | check("Workers age 55 or older", "CA03"); 172 | 173 | check("Workers in accommodation and food services", "CNS18"); 174 | check("Workers in administration, support, and waste management", "CNS14"); 175 | check("Workers in agriculture, forestry, fishing and hunting", "CNS01"); 176 | check("Workers in arts, entertainment and recreation", "CNS17"); 177 | check("Workers in construction", "CNS04"); 178 | check("Workers in educational services", "CNS15"); 179 | check("Workers in finance and insurance", "CNS10"); 180 | check("Workers in healthcare and social assistance", "CNS16"); 181 | check("Workers in information", "CNS09"); 182 | check("Workers in management", "CNS13"); 183 | check("Workers in manufacturing", "CNS05"); 184 | check("Workers in mining, quarrying, and oil and gas extraction", "CNS02"); 185 | check("Workers in other services, except public administration", "CNS19"); 186 | check("Workers in professional, scientific and technical services", "CNS12"); 187 | check("Workers in public 
administration", "CNS20"); 188 | check("Workers in real estate", "CNS11"); 189 | check("Workers in retail trade", "CNS07"); 190 | check("Workers in transportation and warehousing", "CNS08"); 191 | check("Workers in utilities", "CNS03"); 192 | check("Workers in wholesale trade", "CNS06"); 193 | 194 | check("Workers total", "C000"); 195 | 196 | check("Workers with earnings $1250 per month or less", "CE01"); 197 | check("Workers with earnings $1251 - $3333 per month", "CE02"); 198 | check("Workers with earnings greater than $3333 per month", "CE03"); 199 | 200 | check("Workers with less than high school education", "CD01"); 201 | check("Workers with high school education, no college", "CD02"); 202 | check("Workers with some college education or Associate degree", "CD03"); 203 | check("Workers with Bachelor's degree or higher", "CD04"); 204 | 205 | check("Workers with race American Indian or Alaska Native alone", "CR03"); 206 | check("Workers with race Asian alone", "CR04"); 207 | check("Workers with race Black or African American alone", "CR02"); 208 | check("Workers with race Native Hawaiian or Other Pacific Islander alone", "CR05"); 209 | check("Workers with race White alone", "CR01"); 210 | check("Workers with two or more racial groups", "CR07"); 211 | 212 | check("Workers, Hispanic or Latino", "CT02"); 213 | check("Workers, not Hispanic or Latino", "CT01"); 214 | 215 | check("Workers, female", "CS02"); 216 | check("Workers, male", "CS01"); 217 | } 218 | csv.close(); 219 | 220 | assertTrue(foundWorkforceEntry); 221 | dir.delete(); 222 | } 223 | 224 | private boolean check (String colName, String colCode) throws Exception { 225 | long fid; 226 | 227 | // TODO cache? 228 | Set headers = new HashSet<>(Arrays.asList(reader.getHeaders())); 229 | 230 | if (headers.contains("w_geocode")) 231 | fid = Long.parseLong(reader.get("w_geocode")); 232 | else 233 | fid = Long.parseLong(reader.get("h_geocode")); 234 | 235 | // cast to primitive long so as not to confuse Java's type inference 236 | if (features.containsKey(fid)) { 237 | assertEquals((long) Long.parseLong(reader.get(colCode)), (long) features.get(fid).properties.get(colName)); 238 | return true; 239 | } 240 | else return false; 241 | } 242 | } 243 | --------------------------------------------------------------------------------