├── .circleci └── config.yml ├── .github ├── docker-compose.yml ├── pull_request_template.md └── workflows │ └── ci.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.org ├── LICENSE ├── README.md ├── RELEASING.md ├── bench └── src │ └── main │ └── scala │ └── vectorpipe │ └── Bench.scala ├── build.sbt ├── data ├── 8shapedmultipolygon.osm ├── diomede.osm ├── india-pakistan.osm ├── linestring.mvt ├── onepoint.mvt ├── polygon.mvt ├── quarry-rock.osm └── roads.mvt ├── project ├── Dependencies.scala ├── Version.scala ├── assembly.sbt ├── build.properties └── plugins.sbt ├── sbt ├── scripts ├── cibuild ├── cipublish └── test └── src ├── main ├── resources │ ├── META-INF │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── microsite │ │ └── data │ │ └── menu.yml ├── scala │ └── vectorpipe │ │ ├── OSM.scala │ │ ├── VectorPipe.scala │ │ ├── examples │ │ ├── AugmentedDiffProcessor.scala │ │ ├── AugmentedDiffStreamProcessor.scala │ │ ├── ChangeProcessor.scala │ │ ├── ChangeStreamProcessor.scala │ │ ├── ChangesetProcessor.scala │ │ └── ChangesetStreamProcessor.scala │ │ ├── functions │ │ ├── osm │ │ │ └── package.scala │ │ └── package.scala │ │ ├── internal │ │ └── package.scala │ │ ├── model │ │ ├── Actions.scala │ │ ├── AugmentedDiff.scala │ │ ├── Change.scala │ │ ├── Changeset.scala │ │ ├── ChangesetComment.scala │ │ ├── ElementWithSequence.scala │ │ ├── Member.scala │ │ └── Nd.scala │ │ ├── relations │ │ ├── MultiPolygons.scala │ │ ├── Routes.scala │ │ ├── package.scala │ │ └── utils │ │ │ ├── PartialCoordinateSequence.scala │ │ │ ├── ReversedCoordinateSequence.scala │ │ │ ├── VirtualCoordinateSequence.scala │ │ │ └── package.scala │ │ ├── sources │ │ ├── AugmentedDiffMicroBatchReader.scala │ │ ├── AugmentedDiffProvider.scala │ │ ├── AugmentedDiffReader.scala │ │ ├── AugmentedDiffSource.scala │ │ ├── ChangeMicroBatchReader.scala │ │ ├── ChangeProvider.scala │ │ ├── ChangeReader.scala │ │ ├── ChangeSource.scala │ │ ├── ChangesetMicroBatchReader.scala │ │ ├── ChangesetProvider.scala │ │ ├── ChangesetReader.scala │ │ ├── ChangesetSource.scala │ │ ├── ReplicationReader.scala │ │ ├── ReplicationStreamBatchReader.scala │ │ ├── ReplicationStreamMicroBatchReader.scala │ │ ├── SequenceOffset.scala │ │ └── Source.scala │ │ ├── util │ │ ├── Auth.scala │ │ ├── DBUtils.scala │ │ ├── Geocode.scala │ │ ├── Implicits.scala │ │ ├── JsonRobustFeatureCollection.scala │ │ ├── JsonRobustFeatureCollectionMap.scala │ │ ├── Resource.scala │ │ ├── RobustFeature.scala │ │ └── package.scala │ │ └── vectortile │ │ ├── Clipping.scala │ │ ├── Pipeline.scala │ │ ├── Simplify.scala │ │ ├── export │ │ └── package.scala │ │ └── package.scala └── tut │ ├── index.md │ ├── outputs.md │ ├── sources.md │ ├── usage.md │ └── usage │ ├── concepts.md │ ├── osm.md │ └── usage.md └── test ├── resources ├── .gitignore ├── isle-of-man-latest.osm.orc ├── log4j.properties ├── relation-110564.orc ├── relation-110564.wkt ├── relation-191199.orc ├── relation-191199.wkt ├── relation-191204.orc ├── relation-191204.wkt ├── relation-1949938.orc ├── relation-1949938.wkt ├── relation-2554903.orc ├── relation-2554903.wkt ├── relation-2580685.orc ├── relation-2580685.wkt ├── relation-3080946.orc ├── relation-3080946.wkt ├── relation-3105056.orc ├── relation-3105056.wkt ├── relation-333501.orc ├── relation-333501.wkt ├── relation-393502.orc ├── relation-393502.wkt ├── relation-5448156.orc ├── relation-5448156.wkt ├── relation-5448691.orc ├── relation-5448691.wkt ├── relation-5612959.orc ├── relation-5612959.wkt ├── 
relation-61315.orc ├── relation-61315.wkt ├── relation-6710544.orc ├── relation-6710544.wkt └── view │ ├── cluster-view.html │ └── layer-test.html └── scala └── vectorpipe ├── MultiPolygonRelationReconstructionSpec.scala ├── ProcessOSMTest.scala ├── TestEnvironment.scala ├── functions └── osm │ └── FunctionSpec.scala ├── sources └── AugmentedDiffSourceTest.scala └── vectortile ├── LayerTestPipeline.scala ├── PipelineSpec.scala ├── TestPipeline.scala └── WeightedCentroid.scala /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | aliases: 2 | - &restore_sbt_cache 3 | key: sbt-cache-{{ checksum "/tmp/scala_version" }} 4 | 5 | - &save_sbt_cache 6 | key: sbt-cache-{{ checksum "/tmp/scala_version" }}-{{ epoch }} 7 | paths: 8 | - "~/.ivy2/cache" 9 | - "~/.sbt" 10 | - "~/.cache/coursier" 11 | 12 | - &run_cibuild 13 | - checkout 14 | - run: echo "${SCALA_VERSION}" > /tmp/scala_version 15 | - restore_cache: *restore_sbt_cache 16 | - run: 17 | name: Executing cibuild 18 | command: ./scripts/cibuild 19 | - save_cache: *save_sbt_cache 20 | 21 | - &run_cipublish 22 | - checkout 23 | - run: echo "${SCALA_VERSION}" > /tmp/scala_version 24 | - restore_cache: *restore_sbt_cache 25 | - run: 26 | name: "Import signing key" 27 | command: | 28 | gpg --keyserver keyserver.ubuntu.com \ 29 | --recv-keys 0x13E9AA1D8153E95E && \ 30 | echo "${GPG_KEY}" | base64 -d > signing_key.asc && \ 31 | gpg --import signing_key.asc 32 | - run: 33 | name: Executing cipublish 34 | command: ./scripts/cipublish 35 | 36 | # Build environments 37 | - &machine-openjdk8-scala2_11_12-environment 38 | machine: 39 | image: ubuntu-1604:201903-01 40 | environment: 41 | SCALA_VERSION: 2.11.12 42 | 43 | - &openjdk8-scala2_11_12-environment 44 | docker: 45 | - image: circleci/openjdk:8-jdk 46 | environment: 47 | SCALA_VERSION: 2.11.12 48 | 49 | version: 2 50 | workflows: 51 | version: 2 52 | build: 53 | jobs: 54 | - "openjdk8-scala2.11.12": 55 | filters: # required since `openjdk8-scala2.11.12_deploy` has tag filters AND requires `openjdk8-scala2.11.12` 56 | tags: 57 | only: 58 | - /^(.*)$/ 59 | - "openjdk8-scala2.11.12_deploy": 60 | requires: 61 | - "openjdk8-scala2.11.12" 62 | filters: 63 | tags: 64 | only: 65 | - /^(.*)$/ 66 | 67 | jobs: 68 | # Execute cibuild in machine executor so we can use our existing 69 | # docker-compose test setup 70 | "openjdk8-scala2.11.12": 71 | <<: *machine-openjdk8-scala2_11_12-environment 72 | steps: *run_cibuild 73 | 74 | "openjdk8-scala2.11.12_deploy": 75 | <<: *openjdk8-scala2_11_12-environment 76 | steps: *run_cipublish 77 | -------------------------------------------------------------------------------- /.github/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | test: 5 | image: openjdk:8-jdk 6 | working_dir: /usr/local/src 7 | command: ./sbt ++$SCALA_VERSION test 8 | environment: 9 | - CI 10 | - SCALA_VERSION 11 | volumes: 12 | - ./../:/usr/local/src 13 | network_mode: host 14 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Brief description of what this PR does and why it's important 4 | 5 | ## Demo 6 | 7 | Optional. Screenshots, etc. 8 | 9 | ## Notes 10 | 11 | Optional. Extra context, ancillary topics, alternative strategies that didn't work out, etc. 
12 | 13 | ## Testing Instructions 14 | 15 | Optional. Include if there's more specifics than "CI tests should pass". 16 | 17 | ## Checklist 18 | 19 | - [ ] Add entry to CHANGELOG.md 20 | 21 | Closes #XXX 22 | 23 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | branches: ['**'] 6 | push: 7 | branches: ['master'] 8 | tags: [v*] 9 | # release: 10 | # types: [published] 11 | 12 | jobs: 13 | build: 14 | name: Build and Test 15 | strategy: 16 | matrix: 17 | scala: ["2.12.7"] 18 | runs-on: ubuntu-latest 19 | 20 | env: 21 | SCALA_VERSION: ${{ matrix.scala }} 22 | BUILD_NUMBER: ${{ github.run_id }} 23 | 24 | steps: 25 | - uses: actions/checkout@v2 26 | with: 27 | fetch-depth: 0 28 | 29 | - uses: coursier/cache-action@v6 30 | # - uses: olafurpg/setup-scala@v13 31 | # with: 32 | # java-version: adopt@1.8 33 | 34 | - name: run tests 35 | run: docker compose -f .github/docker-compose.yml up test --abort-on-container-exit --exit-code-from test 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /RUNNING_PID 2 | /logs/ 3 | /project/*-shim.sbt 4 | /project/project/ 5 | /project/target/ 6 | /target/ 7 | /data/*.osm 8 | /data/*.geojson 9 | /data/*.osm.json 10 | /data/*.osm.pbf 11 | /images/* 12 | .ensime 13 | .ensime_cache/* 14 | clipping/* 15 | osmosis/* 16 | .idea 17 | target 18 | .metals 19 | \#* 20 | .\#* 21 | 22 | derby.log 23 | metastore_db/* 24 | bench/target/ 25 | idea.sbt 26 | mainRunner/ 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | # Changelog 3 | 4 | All notable changes to this project will be documented in this file. 5 | 6 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 7 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 8 | 9 | ## [Unreleased] 10 | 11 | ### Added 12 | 13 | - GitHub actions config 14 | 15 | ### Changed 16 | 17 | ### Fixed 18 | 19 | ## [2.2.0] 20 | 21 | ### Added 22 | 23 | - Feature parsing in `AugmentedDiffSource` uses `vectorpipe.util.RobustFeature` to tolerate bad geometries in the stream [#148](https://github.com/geotrellis/vectorpipe/pull/148). 
24 | - Receive GPG key while publishing artifacts [#138](https://github.com/geotrellis/vectorpipe/pull/138) 25 | - `Pipeline#finalize(vectorTiles, zoom)` method to receive the final RDD of generated vector tiles for a zoom level 26 | - `Pipeline.Output` mixin trait that overrides `finalize` with default implementation using `saveVectorTiles(vectorTiles, zoom, pipeline.baseOutputURI)` 27 | 28 | ### Changed 29 | 30 | - `VectorPipe.Options` to support for any square layout level (not just from ZoomedLayoutScheme) 31 | - `Pipeline#baseOutputURI` moved to `Pipeline.Output#baseOutputURI` 32 | - Updated Geotrellis dependency to 3.5.1 33 | - Improve robustness of functions in `vectorpipe.sources.ChangesetSource` 34 | 35 | ### Fixed 36 | 37 | ## [2.1.3] - 2019-12-18 38 | 39 | ### Fixed 40 | 41 | - Catch 403 S3Exceptions when checking minutely diffs in AugmentedDiffSource 42 | 43 | ## [2.1.2] - 2019-12-17 44 | 45 | ### Fixed 46 | 47 | - Catch proper AWS SDK v2 NoSuchKeyException when checking minutely diffs in AugmentedDiffSource 48 | 49 | ## [2.1.1] - 2019-12-16 50 | 51 | ### Fixed 52 | 53 | - AugmentedDiffSource failed to properly decode from JSON sources 54 | - MicroBatchReader null pointer exception when reading baseURI from DataSourceOptions 55 | 56 | ## [2.1.0] - 2019-12-12 57 | 58 | ### Added 59 | 60 | - `vectorpipe.examples`: VectorPipe examples moved from https://github.com/azavea/osmesa 61 | - `VectorPipe.defaultSparkSessionWithJTS` method to construct a VectorPipe tailored `SparkSession`. Users with more complicated use cases will still want to manually construct their own session. 62 | 63 | ## [2.0.0] - 2019-11-29 64 | 65 | This is the first release to depend on GeoTrellis 3.0. 66 | 67 | ### Changed 68 | 69 | - Streaming sources now fallback to the current remote sequence if no database 70 | checkpoint or option can be found 71 | - Depend on Spark 2.4.4 72 | - Depend on GeoTrellis 3.1.0 73 | 74 | ## [1.1.0] - 2019-09-26 75 | 76 | ### Added 77 | 78 | - `useCaching` option to VectorPipe.Options allows for persisting to disk. 79 | Helps avoid repeated computations. 80 | - Functions for converting sequence numbers to timestamps and back for both 81 | changeset replications and augmented diff replications. See `ChangesetSource` 82 | and `AugmentedDiffSource` in `vectorpipe.sources`. 83 | 84 | ### Changed 85 | 86 | - Improved empty geometry handling in UDFs 87 | 88 | ### Fixed 89 | 90 | ## [1.0.0] - 2019-07-09 91 | 92 | ### Added 93 | 94 | - RELEASING.md - Instructions for releasing new versions of this project 95 | - Support for semicolon-delimited tag values in UDFs, e.g. `shop=bakery;dairy` 96 | - Support for `nds` in augmented diff GeoJSON (matching 97 | [`osm-replication-streams@^0.7.0`](https://github.com/mojodna/osm-replication-streams/tree/v0.7.0) 98 | output) 99 | - "Uninteresting" tags are dropped when processing OSM inputs; this will result 100 | in fewer point features being generated (as those nodes previously had tags 101 | applied). 102 | 103 | ### Changed 104 | 105 | - Sync with [id-area-keys@2.13.0](https://github.com/osmlab/id-area-keys/blob/v2.13.0/areaKeys.json) for determining area-ness of a way. 
106 | - Fetch gzipped augmented diff JSON (produced by [overpass-diff-publisher](https://github.com/mojodna/overpass-diff-publisher)) 107 | - Preserve the last-known coordinates of deleted nodes 108 | - Better handling of falsy boolean values in tag UDFs 109 | - Adds `riverbank`, `stream_end`, `dam`, `weir`, `waterfall`, and `pressurised` 110 | to the list of waterway features 111 | - Populates `nds` and `members` for deleted elements from the previous version 112 | 113 | ### Fixed 114 | 115 | - Resolve commons-io deprecation warnings 116 | - Convert coordinates to Doubles (expected by VP internals) when pre-processing 117 | 118 | ## [1.0.0-RC3] - 2019-04-24 119 | 120 | ### Fixed 121 | 122 | - Mark all logger vals and some UDF vals as @transient lazy to avoid Spark serialization issues 123 | - Properly strip leading and trailing slashes from S3 URIs when exporting vector tiles 124 | -------------------------------------------------------------------------------- /CONTRIBUTING.org: -------------------------------------------------------------------------------- 1 | #+TITLE: Contributing to VectorPipe 2 | #+AUTHOR: Colin 3 | #+HTML_HEAD: 4 | 5 | ** Prerequisite Knowledge 6 | 7 | *** GeoTrellis 8 | 9 | GeoTrellis sublibraries and types are used heavily throughout ~vectorpipe~, 10 | particularly its ~vector~ and ~vectortile~ packages. 11 | 12 | *** Apache Spark 13 | 14 | ~RDD~ usage is fairly prevalent, so knowledge of Spark internals may help 15 | you, depending on your task. 16 | 17 | *** Cats 18 | 19 | The Functional Programming library that adds certain necessities missing 20 | from vanilla Scala. This is not at all necessary for /using/ ~vectorpipe~, 21 | but is used here and there within its internal machinery. 22 | 23 | *** OpenStreetMap 24 | 25 | Knowledge of how OpenStreetMap data is formatted will help you immensely. Terms: 26 | 27 | - Element 28 | - Node 29 | - Way 30 | - Relation 31 | 32 | ** Development Dependencies 33 | 34 | - [[http://www.scala-sbt.org/][SBT]] 35 | - [[https://spark.apache.org/][Apache Spark]] (a local install on your machine) 36 | - [[https://jekyllrb.com/][Jekyll]] (if editing the microsite) 37 | 38 | Otherwise, all Scala dependencies (including compilers) will be 39 | automatically downloaded by sbt. 40 | 41 | ** Style Guide 42 | 43 | When contributing code changes to ~vectorpipe~, bear in mind that we make a 44 | few stylistic choices in order to minimize code complexity: 45 | 46 | *** Code and Directory Layout 47 | 48 | - Code mechanics relevant to the workings of the library but irrelevant to the 49 | user should be relegated to a module under ~vectorpipe.*.internal~, where 50 | the ~*~ is whatever parent module you're working in. 51 | 52 | - Type aliases live in *package objects*: 53 | 54 | #+BEGIN_SRC scala 55 | package vectorpipe 56 | 57 | package object foo { 58 | type Bar = Int 59 | } 60 | #+END_SRC 61 | 62 | - Typeclass instances live in the companion object of the class they're for: 63 | 64 | #+BEGIN_SRC scala 65 | import cats._ 66 | 67 | case class Foo[T](t: T) 68 | 69 | object Foo { 70 | implicit val fooFunctor: Functor[Foo] = new Functor[Foo] { 71 | def map[A, B](fa: Foo[A])(f: A => B): Foo[B] = ??? 72 | } 73 | } 74 | #+END_SRC 75 | 76 | This is to give immediate "visibility" of instances to their corresponding 77 | types. Just by importing ~Foo~, you have access to all its instances without 78 | having to think about them. This decreases ~import~ confusion. 
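For example, here is a small sketch of that "visibility" in action. It reuses the
hypothetical ~Foo~ and its ~Functor~ instance from above; the ~rename~ helper is
likewise an illustrative name, not part of the codebase (and since the example
instance's ~map~ is left as ~???~, this is for compile-time illustration only):

#+BEGIN_SRC scala
import cats.Functor

/* No separate "instances" import is needed: implicit resolution finds the
 * Functor[Foo] defined in Foo's companion object on its own. */
def rename[F[_]: Functor](fa: F[String]): F[String] =
  Functor[F].map(fa)(_.toUpperCase)

val shouted: Foo[String] = rename(Foo("bar"))
#+END_SRC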
79 | 80 | *** Scala Features to Avoid 81 | 82 | **** Method Overloading and Default Arguments 83 | 84 | We [[https://stackoverflow.com/a/2512001/643684][avoid method overloading]]: 85 | 86 | #+BEGIN_SRC scala 87 | case class Foo[T](t: T) { 88 | def bar(a: Int): Bar = ??? 89 | 90 | // avoid 91 | def bar(a: Int, b: Int): Bar = ??? 92 | } 93 | #+END_SRC 94 | 95 | We avoid default arguments: 96 | 97 | #+BEGIN_SRC scala 98 | case class Foo[T](t: T) { 99 | // avoid 100 | def bar(a: Int, b: Option[Int] = None): Bar = ??? 101 | } 102 | #+END_SRC 103 | 104 | Since this is method overloading in disguise. 105 | 106 | **** Exceptions 107 | 108 | We avoid throwing Exceptions: 109 | 110 | #+BEGIN_SRC scala 111 | /* Surely this function will obey its contract... */ 112 | def innocent(path: String): Foo 113 | 114 | sbt> innocent("/wrong/file/path/or/bad/data.txt") 115 | java.lang.YouCouldntHaveForeseenThisException 116 | #+END_SRC 117 | 118 | Exceptions were intentionally left out of new languages like [[https://golang.org/doc/faq#exceptions][Golang]], [[https://www.rust-lang.org/en-US/faq.html#error-handling][Rust]], and Elm. 119 | In Scala, we can use vanilla ~Try~ and ~Either~, or ~EitherT~ from [[http://typelevel.org/cats/][Cats]] or [[https://github.com/scalaz/scalaz][ScalaZ]] 120 | to model potential errors: 121 | 122 | #+BEGIN_SRC scala 123 | def innocent(path: String): Either[String, Foo] 124 | 125 | /* "Mixing Contexts", i.e. the ability to run concurrently and to fail safely */ 126 | def innocentIO(path: String): EitherT[Future, String, Foo] 127 | #+END_SRC 128 | 129 | **** Non-data Classes 130 | 131 | We [[https://www.youtube.com/watch?v=o9pEzgHorH0][avoid classes that don't represent data]]: 132 | 133 | #+BEGIN_SRC scala 134 | class Fooifizer(val bestArg: Type) { 135 | def work(arg: Type): Unit = { ??? } 136 | } 137 | #+END_SRC 138 | 139 | Instead, we call a spade a spade and write a stand-alone function: 140 | 141 | #+BEGIN_SRC scala 142 | /* Put this in an appropriate companion object, or the package object */ 143 | def fooifize(bestArg: Type, arg: Type): Unit = { ??? } 144 | #+END_SRC 145 | 146 | **** Miscellaneous 147 | 148 | We avoid ~.apply~ returning a type other than the parent object: 149 | 150 | #+BEGIN_SRC scala 151 | object Foo { 152 | // avoid 153 | def apply(...): Bar = ... 154 | } 155 | 156 | // Or else you can write code like: 157 | val x = Foo(...) // hard to know what x's type is. 158 | #+END_SRC 159 | 160 | We [[https://github.com/circe/circe/blame/master/DESIGN.md#L77][avoid implicit conversions]]: 161 | 162 | #+BEGIN_SRC scala 163 | case class Foo(...) 164 | 165 | case class Bar(...) { 166 | def bar: ??? = ... 167 | } 168 | 169 | object Foo { 170 | // avoid 171 | implicit def foo2Bar(foo: Foo): Bar = ... 172 | } 173 | 174 | // Or else you can write code like: 175 | val x = Foo(...).bar // where did `bar` come from? 176 | #+END_SRC 177 | 178 | Typeclasses should be implemented via the implicit-val-within-companion-object 179 | pattern. 180 | 181 | ** Updating the Microsite 182 | 183 | All content files can be found in ~src/main/tut/~. After making your desired 184 | changes, you can confirm them by running the following in sbt: 185 | 186 | #+BEGIN_EXAMPLE 187 | sbt> makeMicrosite 188 | #+END_EXAMPLE 189 | 190 | This will build the site as well as compile every Scala example. If 191 | something about the API has changed and the examples are no longer valid, 192 | these docs will fail to build. This is a good thing! Just make the 193 | appropriate extra changes and rebuild. 
194 | 195 | To view your built site locally, navigate to ~target/site/~ and run ~jekyll 196 | serve~. Be careful: The main content of the site will be visible at 197 | [[http://127.0.0.1:4000/vectorpipe/][127.0.0.1:4000/vectorpipe/]]. Without 198 | the ~vectorpipe~ on the end, you won't see anything. 199 | 200 | If you have write permission to the main VectorPipe repo on GitHub, then 201 | your updated microsite can be published to 202 | [[https://geotrellis.github.io/vectorpipe/]] via: 203 | 204 | #+BEGIN_EXAMPLE 205 | sbt> publishMicrosite 206 | #+END_EXAMPLE 207 | ** Publishing to Bintray 208 | 209 | Provided you have permissions to publish to [[https://bintray.com/azavea][Azavea's Bintray]], all that's necessary 210 | to proceed is: 211 | 212 | #+BEGIN_EXAMPLE 213 | sbt> publish 214 | #+END_EXAMPLE 215 | 216 | in your SBT shell. 217 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This software is licensed under the Apache 2 license, quoted below. 2 | 3 | Copyright 2011-2017 Azavea [http://www.azavea.com] 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); you may not 6 | use this file except in compliance with the License. You may obtain a copy of 7 | the License at 8 | 9 | [http://www.apache.org/licenses/LICENSE-2.0] 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14 | License for the specific language governing permissions and limitations under 15 | the License. 16 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | # Publishing a release 2 | 3 | 1. Create a new release branch from up-to-date master named `release/x.y.z` 4 | 1. Review CHANGELOG.md. Move `[Unreleased]` header to empty section and replace with `[x.y.z]` header plus release date. 5 | 1. Update the version numbers in the build.sbt and spark-shell examples in the README's "Getting Started" section. 6 | 1. Commit these changes as a single commit, with the message "Release vx.y.z" 7 | 1. Push branch and make a PR on GitHub 8 | 1. Ensure CI succeeds 9 | 1. Ensure there are no new commits on master. If there are new commits, rebase this branch on master and start over at step 2 if you wish to include them. Otherwise, merge. 10 | 1. Tag the merge commit on the master branch: `git tag -a vx.y.z -m "Release x.y.z"` 11 | 1. Push the new tag: `git push --tags`; if you have multiple remotes, you may need to target the proper upstream repo: `git push <remote> --tags`. 12 | 1. Review the CircleCI build status to ensure that the tag was successfully published to Sonatype.
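For reference, the tagging and pushing steps above might look like the following for a hypothetical release `1.2.3`, assuming the main repository's remote is named `origin` (adjust the remote name to your setup):

```bash
# After the release PR has been merged, from an up-to-date master:
git checkout master && git pull
git tag -a v1.2.3 -m "Release 1.2.3"
git push origin --tags
```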
13 | -------------------------------------------------------------------------------- /bench/src/main/scala/vectorpipe/Bench.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import geotrellis.vector.{Extent, Line, Point} 6 | import org.openjdk.jmh.annotations._ 7 | 8 | // --- // 9 | 10 | @BenchmarkMode(Array(Mode.AverageTime)) 11 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 12 | @State(Scope.Thread) 13 | class LineBench { 14 | val extent = Extent(0, 0, 5, 5) 15 | 16 | var line: Line = _ 17 | 18 | @Setup 19 | def setup: Unit = { 20 | line = Line( 21 | List.range(4, -100, -2).map(n => Point(n, 1)) ++ List(Point(-3,4), Point(-1,4), Point(2,4), Point(4,4)) 22 | ) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /data/linestring.mvt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/data/linestring.mvt -------------------------------------------------------------------------------- /data/onepoint.mvt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/data/onepoint.mvt -------------------------------------------------------------------------------- /data/polygon.mvt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/data/polygon.mvt -------------------------------------------------------------------------------- /data/roads.mvt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/data/roads.mvt -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | val awscala = "com.github.seratch" %% "awscala" % Version.awscala 5 | val decline = "com.monovore" %% "decline" % Version.decline 6 | val spark = "org.apache.spark" %% "spark-core" % Version.spark 7 | val sparkSql = "org.apache.spark" %% "spark-sql" % Version.spark 8 | val sparkHive = "org.apache.spark" %% "spark-hive" % Version.spark 9 | val sparkStreaming = "org.apache.spark" %% "spark-streaming" % Version.spark 10 | val sparkJts = "org.locationtech.geomesa" %% "geomesa-spark-jts" % Version.geomesa 11 | val gtGeomesa = "org.locationtech.geotrellis" %% "geotrellis-geomesa" % Version.geotrellis 12 | val gtGeotools = "org.locationtech.geotrellis" %% "geotrellis-geotools" % Version.geotrellis 13 | val gtS3 = "org.locationtech.geotrellis" %% "geotrellis-s3" % Version.geotrellis 14 | 15 | val gtS3Spark = "org.locationtech.geotrellis" %% "geotrellis-s3-spark" % Version.geotrellis 16 | val gtSpark = "org.locationtech.geotrellis" %% "geotrellis-spark" % Version.geotrellis 17 | val gtSparkTestKit = "org.locationtech.geotrellis" %% "geotrellis-spark-testkit" % Version.geotrellis % "test" 18 | val gtVector = "org.locationtech.geotrellis" %% "geotrellis-vector" % Version.geotrellis 19 | val gtShapefile = "org.locationtech.geotrellis" %% "geotrellis-shapefile" % Version.geotrellis 20 | val gtVectorTile = 
"org.locationtech.geotrellis" %% "geotrellis-vectortile" % Version.geotrellis 21 | val cats = "org.typelevel" %% "cats-core" % Version.cats 22 | val scalactic = "org.scalactic" %% "scalactic" % Version.scalactic 23 | val scalatest = "org.scalatest" %% "scalatest" % Version.scalatest % "test" 24 | val jaiCore = "javax.media" % "jai_core" % "1.1.3" from "https://repo.osgeo.org/repository/release/javax/media/jai_core/1.1.3/jai_core-1.1.3.jar" 25 | val hbaseCommon = "org.apache.hbase" % "hbase-common" % "1.3.1" 26 | val hbaseClient = "org.apache.hbase" % "hbase-client" % "1.3.1" 27 | val hbaseServer = "org.apache.hbase" % "hbase-server" % "1.3.1" 28 | val geomesaHbaseDatastore = "org.locationtech.geomesa" % "geomesa-hbase-datastore_2.11" % Version.geomesa 29 | val kryo = "com.esotericsoftware" % "kryo-shaded" % Version.kryo 30 | val circeCore = "io.circe" %% "circe-core" % Version.circe 31 | val circeGeneric = "io.circe" %% "circe-generic" % Version.circe 32 | val circeExtras = "io.circe" %% "circe-generic-extras" % Version.circe 33 | val circeParser = "io.circe" %% "circe-parser" % Version.circe 34 | val circeOptics = "io.circe" %% "circe-optics" % Version.circe 35 | val circeJava8 = "io.circe" %% "circe-java8" % Version.circe 36 | val circeYaml = "io.circe" %% "circe-yaml" % "0.9.0" 37 | val commonsIO = "commons-io" % "commons-io" % Version.commonsIO 38 | val scalaj = "org.scalaj" %% "scalaj-http" % Version.scalaj 39 | } 40 | -------------------------------------------------------------------------------- /project/Version.scala: -------------------------------------------------------------------------------- 1 | object Version { 2 | val awscala = "0.8.1" 3 | val geotrellis = "3.5.1" 4 | val scala2_11 = "2.11.12" 5 | val scala2_12 = "2.12.12" 6 | val geomesa = "2.2.1" 7 | val decline = "0.6.1" 8 | val cats = "1.6.1" 9 | val scalactic = "3.0.6" 10 | val scalatest = "3.0.3" 11 | val spark = "2.4.4" 12 | val kryo = "4.0.2" 13 | val circe = "0.11.0" 14 | val scalaLogging = "3.9.2" 15 | val commonsIO = "2.6" 16 | val scalaj = "2.4.1" 17 | } 18 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.2.8 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.47deg" % "sbt-microsites" % "0.7.4") 2 | 3 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.2.27") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 6 | 7 | addCompilerPlugin("org.scalamacros" % "paradise" % "2.1.0" cross CrossVersion.full) 8 | 9 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.5") 10 | 11 | addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.0") 12 | 13 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") 14 | -------------------------------------------------------------------------------- /scripts/cibuild: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ -n "${VECTORPIPE_DEBUG}" ]]; then 6 | set -x 7 | fi 8 | 9 | function usage() { 10 | echo -n \ 11 | "Usage: $(basename 
"$0") 12 | Execute tests. 13 | " 14 | } 15 | 16 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 17 | if [[ "${1:-}" == "--help" ]]; then 18 | usage 19 | else 20 | SCALA_VERSION="${SCALA_VERSION:-2.11.12}" ./scripts/test 21 | fi 22 | fi 23 | -------------------------------------------------------------------------------- /scripts/cipublish: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ -n "${VECTORPIPE_DEBUG}" ]]; then 6 | set -x 7 | fi 8 | 9 | function usage() { 10 | echo -n \ 11 | "Usage: $(basename "$0") 12 | Publish artifacts to Sonatype. 13 | " 14 | } 15 | 16 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 17 | if [[ "${1:-}" == "--help" ]]; then 18 | usage 19 | else 20 | if [[ -n "${CIRCLE_TAG}" ]]; then 21 | echo "Publishing artifacts to Sonatype" 22 | ./sbt ";++${SCALA_VERSION:-2.11.12};sonatypeOpen ${CIRCLE_BUILD_NUM};publish;sonatypeRelease" 23 | else 24 | echo "Publishing artifacts to default location" 25 | ./sbt "++${SCALA_VERSION:-2.11.12}" publish 26 | fi 27 | fi 28 | fi 29 | -------------------------------------------------------------------------------- /scripts/test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ -n "${VECTORPIPE_DEBUG}" ]]; then 6 | set -x 7 | fi 8 | 9 | function usage() { 10 | echo -n \ 11 | "Usage: $(basename "$0") 12 | Update Scala dependencies and execute tests. 13 | " 14 | } 15 | 16 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 17 | if [[ "${1:-}" == "--help" ]]; then 18 | usage 19 | else 20 | echo "Executing Scala test suite" 21 | ./sbt "++${SCALA_VERSION:-2.11.12}" test 22 | fi 23 | fi 24 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | vectorpipe.sources.AugmentedDiffProvider 2 | vectorpipe.sources.ChangeProvider 3 | vectorpipe.sources.ChangesetProvider 4 | -------------------------------------------------------------------------------- /src/main/resources/microsite/data/menu.yml: -------------------------------------------------------------------------------- 1 | options: 2 | 3 | - title: Usage 4 | url: usage.html 5 | menu_type: usage 6 | menu_section: usage 7 | 8 | - title: Concepts 9 | url: usage/concepts.html 10 | menu_type: usage 11 | menu_section: concepts 12 | 13 | - title: Reading OpenStreetMap Data 14 | url: usage/osm.html 15 | menu_type: usage 16 | menu_section: osm 17 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/OSM.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.functions._ 7 | import geotrellis.vector._ 8 | import vectorpipe.functions.osm.removeUninterestingTags 9 | import vectorpipe.internal._ 10 | 11 | object OSM { 12 | /** 13 | * Convert a raw OSM dataframe into a frame containing JTS geometries for each unique id/changeset. 14 | * 15 | * This currently produces Points for nodes containing "interesting" tags, LineStrings and Polygons for ways 16 | * (according to OSM rules for defining areas), MultiPolygons for multipolygon and boundary relations, and 17 | * LineStrings / MultiLineStrings for route relations. 
18 | * 19 | * @param input DataFrame containing node, way, and relation elements 20 | * @return DataFrame containing geometries. 21 | */ 22 | def toGeometry(input: DataFrame): DataFrame = { 23 | import input.sparkSession.implicits._ 24 | 25 | val st_pointToGeom = org.apache.spark.sql.functions.udf { pt: Point => pt.asInstanceOf[Geometry] } 26 | 27 | val elements = input 28 | .withColumn("tags", removeUninterestingTags('tags)) 29 | 30 | val nodes = preprocessNodes(elements) 31 | 32 | val nodeGeoms = constructPointGeometries(nodes) 33 | .withColumn("minorVersion", lit(0)) 34 | .withColumn("geom", st_pointToGeom('geom)) 35 | 36 | val wayGeoms = reconstructWayGeometries(elements, nodes) 37 | 38 | val relationGeoms = reconstructRelationGeometries(elements, wayGeoms) 39 | 40 | nodeGeoms 41 | .union(wayGeoms.where(size('tags) > 0).drop('geometryChanged)) 42 | .union(relationGeoms) 43 | } 44 | 45 | /** 46 | * Snapshot pre-processed elements. 47 | * 48 | * A Time Pin is stuck through a set of elements that have been augmented with a 'validUntil column to identify all 49 | * that were valid at a specific point in time (i.e. updated before the target timestamp and valid after it). 50 | * 51 | * @param df Elements (including 'validUntil column) 52 | * @param timestamp Optional timestamp to snapshot at 53 | * @return DataFrame containing valid elements at timestamp (or now) 54 | */ 55 | def snapshot(df: DataFrame, timestamp: Timestamp = null): DataFrame = { 56 | import df.sparkSession.implicits._ 57 | 58 | df 59 | .where( 60 | 'updated <= coalesce(lit(timestamp), current_timestamp) 61 | and coalesce(lit(timestamp), current_timestamp) < coalesce('validUntil, date_add(current_timestamp, 1))) 62 | } 63 | 64 | /** 65 | * Augment geometries with user metadata. 66 | * 67 | * When 'changeset is included, user (name and 'uid) metadata is joined from a DataFrame containing changeset 68 | * metadata. 69 | * 70 | * @param geoms Geometries to augment. 71 | * @param changesets Changesets DataFrame with user metadata. 72 | * @return Geometries augmented with user metadata. 
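   *
   * A hypothetical sketch of wiring this together (source options elided; `geoms`,
   * `changesetOptions`, and `spark` are assumed, not defined in this file):
   * {{{
   *   import vectorpipe.sources.Source
   *   val changesets = spark.read.format(Source.Changesets).options(changesetOptions).load
   *   val withUsers  = OSM.addUserMetadata(geoms, changesets)
   * }}}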
73 | */ 74 | def addUserMetadata(geoms: DataFrame, changesets: DataFrame): DataFrame = { 75 | import geoms.sparkSession.implicits._ 76 | 77 | geoms 78 | .join(changesets.select('id as 'changeset, 'uid, 'user), Seq("changeset")) 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/examples/AugmentedDiffProcessor.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.examples 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import vectorpipe.VectorPipe 9 | import vectorpipe.model.AugmentedDiff 10 | import vectorpipe.sources.Source 11 | 12 | /* 13 | * Usage example: 14 | * 15 | * sbt assembly 16 | * 17 | * spark-submit \ 18 | * --class vectorpipe.examples.AugmentedDiffProcessor \ 19 | * target/scala-2.11/vectorpipe.jar \ 20 | * --augmented-diff-source s3://somewhere/diffs/ 21 | */ 22 | object AugmentedDiffProcessor 23 | extends CommandApp( 24 | name = "augmented-diff-processor", 25 | header = "Read from augmented diffs", 26 | main = { 27 | val augmentedDiffSourceOpt = Opts.option[URI]( 28 | "augmented-diff-source", 29 | short = "a", 30 | metavar = "uri", 31 | help = "Location of augmented diffs to process" 32 | ) 33 | val startSequenceOpt = Opts 34 | .option[Int]( 35 | "start-sequence", 36 | short = "s", 37 | metavar = "sequence", 38 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 39 | ) 40 | .orNone 41 | val endSequenceOpt = Opts 42 | .option[Int]( 43 | "end-sequence", 44 | short = "e", 45 | metavar = "sequence", 46 | help = "Ending sequence. If absent, the current (remote) sequence will be used." 47 | ) 48 | .orNone 49 | 50 | (augmentedDiffSourceOpt, startSequenceOpt, endSequenceOpt) 51 | .mapN { 52 | (augmentedDiffSource, startSequence, endSequence) => 53 | implicit val ss: SparkSession = 54 | VectorPipe.defaultSparkSessionWithJTS("AugmentedDiffProcessor") 55 | 56 | import ss.implicits._ 57 | 58 | val options = Map(Source.BaseURI -> augmentedDiffSource.toString) ++ 59 | startSequence 60 | .map(s => Map(Source.StartSequence -> s.toString)) 61 | .getOrElse(Map.empty[String, String]) ++ 62 | endSequence 63 | .map(s => Map(Source.EndSequence -> s.toString)) 64 | .getOrElse(Map.empty[String, String]) 65 | 66 | val geoms = 67 | ss.read.format(Source.AugmentedDiffs).options(options).load 68 | 69 | // aggregations are triggered when an event with a later timestamp ("event time") is received 70 | // geoms.select('sequence).distinct.show 71 | geoms.as[AugmentedDiff].show 72 | 73 | ss.stop() 74 | } 75 | } 76 | ) 77 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/examples/AugmentedDiffStreamProcessor.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.examples 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import geotrellis.vector.{Feature, Geometry} 8 | import org.apache.spark.sql._ 9 | import vectorpipe.VectorPipe 10 | import vectorpipe.model.ElementWithSequence 11 | import vectorpipe.sources.Source 12 | 13 | /* 14 | * Usage example: 15 | * 16 | * sbt assembly 17 | * 18 | * spark-submit \ 19 | * --class vectorpipe.examples.AugmentedDiffStreamProcessor \ 20 | * target/scala-2.11/vectorpipe.jar \ 21 | * --augmented-diff-source s3://somewhere/diffs/ 22 | */ 23 | object 
AugmentedDiffStreamProcessor 24 | extends CommandApp( 25 | name = "augmented-diff-stream-processor", 26 | header = "Read OSM augmented diffs as an open stream", 27 | main = { 28 | type AugmentedDiffFeature = Feature[Geometry, ElementWithSequence] 29 | 30 | val augmentedDiffSourceOpt = Opts.option[URI]( 31 | "augmented-diff-source", 32 | short = "a", 33 | metavar = "uri", 34 | help = "Location of augmented diffs to process" 35 | ) 36 | val startSequenceOpt = Opts 37 | .option[Int]( 38 | "start-sequence", 39 | short = "s", 40 | metavar = "sequence", 41 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 42 | ) 43 | .orNone 44 | val endSequenceOpt = Opts 45 | .option[Int]( 46 | "end-sequence", 47 | short = "e", 48 | metavar = "sequence", 49 | help = "Ending sequence. If absent, this will be an infinite stream." 50 | ) 51 | .orNone 52 | 53 | (augmentedDiffSourceOpt, startSequenceOpt, endSequenceOpt) 54 | .mapN { 55 | (augmentedDiffSource, startSequence, endSequence) => 56 | implicit val ss: SparkSession = 57 | VectorPipe.defaultSparkSessionWithJTS("AugmentedDiffStreamProcessor") 58 | 59 | val options = Map(Source.BaseURI -> augmentedDiffSource.toString, 60 | Source.ProcessName -> "AugmentedDiffStreamProcessor") ++ 61 | startSequence 62 | .map(s => Map(Source.StartSequence -> s.toString)) 63 | .getOrElse(Map.empty[String, String]) ++ 64 | endSequence 65 | .map(s => Map(Source.EndSequence -> s.toString)) 66 | .getOrElse(Map.empty[String, String]) 67 | 68 | val geoms = 69 | ss.readStream.format(Source.AugmentedDiffs).options(options).load 70 | 71 | // aggregations are triggered when an event with a later timestamp ("event time") is received 72 | val query = geoms.writeStream 73 | .format("console") 74 | .start 75 | 76 | query.awaitTermination() 77 | 78 | ss.stop() 79 | } 80 | } 81 | ) 82 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/examples/ChangeProcessor.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.examples 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import vectorpipe.VectorPipe 9 | import vectorpipe.model.Change 10 | import vectorpipe.sources.Source 11 | 12 | /* 13 | * Usage example: 14 | * 15 | * sbt assembly 16 | * 17 | * spark-submit \ 18 | * --class vectorpipe.examples.ChangeProcessor \ 19 | * target/scala-2.11/vectorpipe.jar 20 | */ 21 | object ChangeProcessor 22 | extends CommandApp( 23 | name = "change-processor", 24 | header = "Read minutely changes from start sequence to end sequence", 25 | main = { 26 | val changeSourceOpt = Opts 27 | .option[URI]("change-source", 28 | short = "d", 29 | metavar = "uri", 30 | help = "Location of minutely diffs to process") 31 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 32 | val startSequenceOpt = Opts 33 | .option[Int]( 34 | "start-sequence", 35 | short = "s", 36 | metavar = "sequence", 37 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 38 | ) 39 | .orNone 40 | val endSequenceOpt = Opts 41 | .option[Int]( 42 | "end-sequence", 43 | short = "e", 44 | metavar = "sequence", 45 | help = "Ending sequence. If absent, this will be an infinite stream." 
46 | ) 47 | .orNone 48 | 49 | (changeSourceOpt, startSequenceOpt, endSequenceOpt) 50 | .mapN { 51 | (changeSource, startSequence, endSequence) => 52 | implicit val ss: SparkSession = 53 | VectorPipe.defaultSparkSessionWithJTS("ChangeProcessor") 54 | 55 | import ss.implicits._ 56 | 57 | val options = Map(Source.BaseURI -> changeSource.toString) ++ 58 | startSequence 59 | .map(s => Map(Source.StartSequence -> s.toString)) 60 | .getOrElse(Map.empty[String, String]) ++ 61 | endSequence 62 | .map(s => Map(Source.EndSequence -> s.toString)) 63 | .getOrElse(Map.empty[String, String]) 64 | 65 | val changes = 66 | ss.read.format(Source.Changes).options(options).load 67 | 68 | // aggregations are triggered when an event with a later timestamp ("event time") is received 69 | // changes.select('sequence).distinct.show 70 | changes.as[Change].show 71 | 72 | ss.stop() 73 | } 74 | } 75 | ) 76 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/examples/ChangeStreamProcessor.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.examples 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import vectorpipe.VectorPipe 9 | import vectorpipe.sources.Source 10 | 11 | /* 12 | * Usage example: 13 | * 14 | * sbt assembly 15 | * 16 | * spark-submit \ 17 | * --class vectorpipe.examples.ChangeStreamProcessor \ 18 | * target/scala-2.11/vectorpipe.jar \ 19 | * --augmented-diff-source s3://somewhere/diffs/ 20 | */ 21 | object ChangeStreamProcessor 22 | extends CommandApp( 23 | name = "change-stream-processor", 24 | header = "Read OSM minutely diffs as a stream", 25 | main = { 26 | val changeSourceOpt = Opts 27 | .option[URI]("change-source", 28 | short = "d", 29 | metavar = "uri", 30 | help = "Location of minutely diffs to process") 31 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 32 | val startSequenceOpt = Opts 33 | .option[Int]( 34 | "start-sequence", 35 | short = "s", 36 | metavar = "sequence", 37 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 38 | ) 39 | .orNone 40 | val endSequenceOpt = Opts 41 | .option[Int]( 42 | "end-sequence", 43 | short = "e", 44 | metavar = "sequence", 45 | help = "Ending sequence. If absent, this will be an infinite stream." 
46 | ) 47 | .orNone 48 | val partitionCountOpt = Opts 49 | .option[Int]("partitions", 50 | short = "p", 51 | metavar = "partition count", 52 | help = "Change partition count.") 53 | .orNone 54 | 55 | (changeSourceOpt, startSequenceOpt, endSequenceOpt, partitionCountOpt) 56 | .mapN { 57 | (changeSource, startSequence, endSequence, partitionCount) => 58 | implicit val ss: SparkSession = 59 | VectorPipe.defaultSparkSessionWithJTS("ChangeStreamProcessor") 60 | 61 | val options = Map(Source.BaseURI -> changeSource.toString, Source.ProcessName -> "ChangeStreamProcessor") ++ 62 | startSequence.map(s => Map(Source.StartSequence -> s.toString)) 63 | .getOrElse(Map.empty[String, String]) ++ 64 | endSequence.map(s => Map(Source.EndSequence -> s.toString)) 65 | .getOrElse(Map.empty[String, String]) ++ 66 | partitionCount.map(s => Map(Source.PartitionCount -> s.toString)) 67 | .getOrElse(Map.empty[String, String]) 68 | 69 | val changes = 70 | ss.readStream.format(Source.Changes).options(options).load 71 | 72 | val query = changes.writeStream 73 | .format("console") 74 | .start 75 | 76 | query.awaitTermination() 77 | 78 | ss.stop() 79 | } 80 | } 81 | ) 82 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/examples/ChangesetProcessor.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.examples 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import vectorpipe.VectorPipe 9 | import vectorpipe.model.Changeset 10 | import vectorpipe.sources.Source 11 | 12 | /* 13 | * Usage example: 14 | * 15 | * sbt assembly 16 | * 17 | * spark-submit \ 18 | * --class vectorpipe.examples.ChangesetProcessor \ 19 | * target/scala-2.11/vectorpipe.jar 20 | */ 21 | object ChangesetProcessor 22 | extends CommandApp( 23 | name = "changeset-processor", 24 | header = "Read changesets between start sequence and end sequence", 25 | main = { 26 | val changesetSourceOpt = 27 | Opts.option[URI]("changeset-source", 28 | short = "c", 29 | metavar = "uri", 30 | help = "Location of changesets to process" 31 | ).withDefault(new URI("https://planet.osm.org/replication/changesets/")) 32 | val startSequenceOpt = Opts 33 | .option[Int]( 34 | "start-sequence", 35 | short = "s", 36 | metavar = "sequence", 37 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 38 | ) 39 | .orNone 40 | val endSequenceOpt = Opts 41 | .option[Int]( 42 | "end-sequence", 43 | short = "e", 44 | metavar = "sequence", 45 | help = "Ending sequence. If absent, this will be an infinite stream." 
46 | ) 47 | .orNone 48 | 49 | (changesetSourceOpt, startSequenceOpt, endSequenceOpt) 50 | .mapN { 51 | (changesetSource, startSequence, endSequence) => 52 | implicit val ss: SparkSession = 53 | VectorPipe.defaultSparkSessionWithJTS("ChangesetProcessor") 54 | 55 | import ss.implicits._ 56 | 57 | val options = Map(Source.BaseURI -> changesetSource.toString) ++ 58 | startSequence 59 | .map(s => Map(Source.StartSequence -> s.toString)) 60 | .getOrElse(Map.empty[String, String]) ++ 61 | endSequence 62 | .map(s => Map(Source.EndSequence -> s.toString)) 63 | .getOrElse(Map.empty[String, String]) 64 | 65 | val changes = 66 | ss.read.format(Source.Changesets).options(options).load 67 | 68 | // aggregations are triggered when an event with a later timestamp ("event time") is received 69 | // changes.select('sequence).distinct.show 70 | changes.as[Changeset].show 71 | 72 | ss.stop() 73 | } 74 | } 75 | ) 76 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/examples/ChangesetStreamProcessor.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.examples 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import vectorpipe.VectorPipe 9 | import vectorpipe.sources.Source 10 | 11 | /* 12 | * Usage example: 13 | * 14 | * sbt assembly 15 | * 16 | * spark-submit \ 17 | * --class vectorpipe.examples.ChangesetStreamProcessor \ 18 | * target/scala-2.11/vectorpipe.jar \ 19 | * --augmented-diff-source s3://somewhere/diffs/ 20 | */ 21 | object ChangesetStreamProcessor 22 | extends CommandApp( 23 | name = "changeset-stream-processor", 24 | header = "Read OSM changesets from start sequence to end sequence as a stream", 25 | main = { 26 | val changesetSourceOpt = 27 | Opts.option[URI]("changeset-source", 28 | short = "c", 29 | metavar = "uri", 30 | help = "Location of changesets to process" 31 | ).withDefault(new URI("https://planet.osm.org/replication/changesets/")) 32 | val startSequenceOpt = Opts 33 | .option[Int]( 34 | "start-sequence", 35 | short = "s", 36 | metavar = "sequence", 37 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 38 | ) 39 | .orNone 40 | val endSequenceOpt = Opts 41 | .option[Int]( 42 | "end-sequence", 43 | short = "e", 44 | metavar = "sequence", 45 | help = "Ending sequence. If absent, this will be an infinite stream." 
46 | ) 47 | .orNone 48 | val batchSizeOpt = Opts 49 | .option[Int]("batch-size", 50 | short = "b", 51 | metavar = "batch size", 52 | help = "Change batch size.") 53 | .orNone 54 | 55 | (changesetSourceOpt, startSequenceOpt, endSequenceOpt, batchSizeOpt) 56 | .mapN { 57 | (changesetSource, startSequence, endSequence, batchSize) => 58 | implicit val ss: SparkSession = 59 | VectorPipe.defaultSparkSessionWithJTS("ChangesetStreamProcessor") 60 | 61 | val options = Map(Source.BaseURI -> changesetSource.toString, Source.ProcessName -> "ChangesetStreamProcessor") ++ 62 | startSequence.map(s => Map(Source.StartSequence -> s.toString)) 63 | .getOrElse(Map.empty[String, String]) ++ 64 | endSequence.map(s => Map(Source.EndSequence -> s.toString)) 65 | .getOrElse(Map.empty[String, String]) ++ 66 | batchSize.map(s => Map(Source.BatchSize -> s.toString)) 67 | .getOrElse(Map.empty[String, String]) 68 | 69 | val changesets = 70 | ss.readStream.format(Source.Changesets).options(options).load 71 | 72 | val query = changesets.writeStream 73 | .format("console") 74 | .start 75 | 76 | query.awaitTermination() 77 | 78 | ss.stop() 79 | } 80 | } 81 | ) 82 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/functions/package.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe 2 | 3 | import org.apache.spark.sql.Column 4 | import org.apache.spark.sql.expressions.UserDefinedFunction 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types.{DoubleType, FloatType} 7 | import vectorpipe.util._ 8 | 9 | package object functions { 10 | // A brief note about style 11 | // Spark functions are typically defined using snake_case, therefore so are the UDFs 12 | // internal helper functions use standard Scala naming conventions 13 | 14 | @transient lazy val merge_counts: UserDefinedFunction = udf(_mergeCounts) 15 | 16 | @transient lazy val sum_counts: UserDefinedFunction = udf { counts: Iterable[Map[String, Int]] => 17 | counts.reduce(_mergeCounts(_, _)) 18 | } 19 | 20 | // Convert BigDecimals to doubles 21 | // Reduces size taken for representation at the expense of some precision loss. 22 | def asDouble(value: Column): Column = 23 | when(value.isNotNull, value.cast(DoubleType)) 24 | .otherwise(lit(Double.NaN)) as s"asDouble($value)" 25 | 26 | // Convert BigDecimals to floats 27 | // Reduces size taken for representation at the expense of more precision loss. 
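  // A hypothetical usage sketch (column names assumed):
  //   df.select(asFloat(col("lat")) as "lat", asFloat(col("lon")) as "lon")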
28 | def asFloat(value: Column): Column = 29 | when(value.isNotNull, value.cast(FloatType)) 30 | .otherwise(lit(Float.NaN)) as s"asFloat($value)" 31 | 32 | @transient lazy val count_values: UserDefinedFunction = udf { 33 | (_: Seq[String]).groupBy(identity).mapValues(_.size) 34 | } 35 | 36 | @transient lazy val flatten: UserDefinedFunction = udf { 37 | (_: Seq[Seq[String]]).flatten 38 | } 39 | 40 | @transient lazy val flatten_set: UserDefinedFunction = udf { 41 | (_: Seq[Seq[String]]).flatten.distinct 42 | } 43 | 44 | @transient lazy val merge_sets: UserDefinedFunction = udf { (a: Iterable[String], b: Iterable[String]) => 45 | (Option(a).getOrElse(Set.empty).toSet ++ Option(b).getOrElse(Set.empty).toSet).toArray 46 | } 47 | 48 | @transient lazy val without: UserDefinedFunction = udf { (list: Seq[String], without: String) => 49 | list.filterNot(x => x == without) 50 | } 51 | 52 | private val _mergeCounts = (a: Map[String, Int], b: Map[String, Int]) => 53 | mergeMaps(Option(a).getOrElse(Map.empty[String, Int]), 54 | Option(b).getOrElse(Map.empty[String, Int]))(_ + _) 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/model/Actions.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.model 2 | 3 | object Actions { 4 | type Action = Byte 5 | 6 | val Create: Action = 1.byteValue 7 | val Modify: Action = 2.byteValue 8 | val Delete: Action = 3.byteValue 9 | 10 | def fromString(str: String): Action = 11 | str.toLowerCase match { 12 | case "create" => Actions.Create 13 | case "delete" => Actions.Delete 14 | case "modify" => Actions.Modify 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/model/AugmentedDiff.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.model 2 | 3 | import java.sql.Timestamp 4 | 5 | import geotrellis.vector._ 6 | 7 | case class AugmentedDiff(sequence: Int, 8 | `type`: Byte, 9 | id: Long, 10 | prevGeom: Option[Geometry], 11 | geom: Geometry, 12 | prevTags: Option[Map[String, String]], 13 | tags: Map[String, String], 14 | prevNds: Option[Seq[Long]], 15 | nds: Seq[Long], 16 | prevChangeset: Option[Long], 17 | changeset: Long, 18 | prevUid: Option[Long], 19 | uid: Long, 20 | prevUser: Option[String], 21 | user: String, 22 | prevUpdated: Option[Timestamp], 23 | updated: Timestamp, 24 | prevVisible: Option[Boolean], 25 | visible: Boolean, 26 | prevVersion: Option[Int], 27 | version: Int, 28 | minorVersion: Boolean) 29 | 30 | object AugmentedDiff { 31 | def apply(sequence: Int, 32 | prev: Option[Feature[Geometry, ElementWithSequence]], 33 | curr: Feature[Geometry, ElementWithSequence]): AugmentedDiff = { 34 | val `type` = Member.typeFromString(curr.data.`type`) 35 | val minorVersion = prev.map(_.data.version).getOrElse(Int.MinValue) == curr.data.version 36 | 37 | AugmentedDiff( 38 | sequence, 39 | `type`, 40 | curr.data.id, 41 | prev.map(_.geom), 42 | curr.geom, 43 | prev.map(_.data.tags), 44 | curr.data.tags, 45 | prev.map(_.data.nds), 46 | curr.data.nds, 47 | prev.map(_.data.changeset), 48 | curr.data.changeset, 49 | prev.map(_.data.uid), 50 | curr.data.uid, 51 | prev.map(_.data.user), 52 | curr.data.user, 53 | prev.map(_.data.timestamp), 54 | curr.data.timestamp, 55 | prev.map(_.data.visible.getOrElse(true)), 56 | curr.data.visible.getOrElse(true), 57 | prev.map(_.data.version), 58 | curr.data.version, 59 | minorVersion 
60 | ) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/model/Change.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.model 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.joda.time.DateTime 6 | import org.xml.sax 7 | import org.xml.sax.helpers.DefaultHandler 8 | 9 | import scala.collection.mutable 10 | import scala.collection.mutable.ListBuffer 11 | 12 | // TODO at some point user metadata (changeset, uid, user, timestamp?) should become options, as they may not be 13 | // available 14 | case class Change(id: Long, 15 | `type`: String, 16 | tags: Map[String, String], 17 | lat: Option[Double], 18 | lon: Option[Double], 19 | nds: Option[Seq[Nd]], 20 | members: Option[Seq[Member]], 21 | changeset: Long, 22 | timestamp: Timestamp, 23 | uid: Long, 24 | user: String, 25 | version: Long, 26 | visible: Boolean, 27 | sequence: Int) 28 | 29 | object Change { 30 | implicit def stringToTimestamp(s: String): Timestamp = 31 | Timestamp.from(DateTime.parse(s).toDate.toInstant) 32 | 33 | class ChangeHandler(sequence: Int) extends DefaultHandler { 34 | final val ActionLabels: Set[String] = Set("create", "delete", "modify") 35 | final val ElementLabels: Set[String] = Set("node", "way", "relation") 36 | 37 | private val changeSeq: ListBuffer[Change] = ListBuffer.empty 38 | private val tags: mutable.Map[String, String] = mutable.Map.empty 39 | private val nds: ListBuffer[Nd] = ListBuffer.empty 40 | private val members: ListBuffer[Member] = ListBuffer.empty 41 | private var action: Actions.Action = _ 42 | private var attrs: Map[String, String] = _ 43 | 44 | def changes: Seq[Change] = changeSeq 45 | 46 | override def startElement(uri: String, 47 | localName: String, 48 | qName: String, 49 | attributes: sax.Attributes): Unit = { 50 | val attrs = 51 | (for { 52 | i <- Range(0, attributes.getLength) 53 | } yield attributes.getQName(i) -> attributes.getValue(i)).toMap 54 | 55 | qName.toLowerCase match { 56 | case label if ActionLabels.contains(label) => 57 | action = Actions.fromString(qName) 58 | 59 | case label if ElementLabels.contains(label) => 60 | reset() 61 | 62 | this.attrs = attrs 63 | 64 | case "tag" => 65 | tags.update(attrs("k"), attrs("v")) 66 | 67 | case "nd" => 68 | nds.append(Nd(attrs("ref").toLong)) 69 | 70 | case "member" => 71 | members.append( 72 | Member(Member.typeFromString(attrs("type")), attrs("ref").toLong, attrs("role"))) 73 | 74 | case _ => () // no-op 75 | } 76 | } 77 | 78 | def reset(): Unit = { 79 | tags.clear() 80 | nds.clear() 81 | members.clear() 82 | } 83 | 84 | override def endElement(uri: String, localName: String, qName: String): Unit = { 85 | if (ElementLabels.contains(qName.toLowerCase)) { 86 | changeSeq.append( 87 | Change( 88 | attrs("id").toLong, 89 | qName, 90 | tags.toMap, 91 | attrs.get("lat").map(_.toDouble), 92 | attrs.get("lon").map(_.toDouble), 93 | Option(nds).filter(_.nonEmpty), 94 | Option(members).filter(_.nonEmpty).map(_.toSeq), 95 | attrs.get("changeset").map(_.toLong).getOrElse(-1L), 96 | stringToTimestamp(attrs.getOrElse("timestamp", "1970-01-01T00:00:00Z")), 97 | attrs.get("uid").map(_.toLong).getOrElse(-1L), 98 | attrs.getOrElse("user", ""), 99 | attrs.get("version").map(_.toLong).getOrElse(-1L), 100 | action != Actions.Delete, 101 | sequence 102 | )) 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- 
/src/main/scala/vectorpipe/model/Changeset.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.model 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.joda.time.DateTime 6 | 7 | import scala.util.Try 8 | 9 | case class Changeset(id: Long, 10 | tags: Map[String, String], 11 | createdAt: Timestamp, 12 | open: Boolean, 13 | closedAt: Option[Timestamp], 14 | commentsCount: Int, 15 | minLat: Option[Double], 16 | maxLat: Option[Double], 17 | minLon: Option[Double], 18 | maxLon: Option[Double], 19 | numChanges: Int, 20 | uid: Long, 21 | user: String, 22 | comments: Seq[ChangesetComment], 23 | sequence: Int) 24 | 25 | object Changeset { 26 | implicit def stringToTimestamp(s: String): Timestamp = 27 | Timestamp.from(DateTime.parse(s).toDate.toInstant) 28 | 29 | implicit def stringToOptionalTimestamp(s: String): Option[Timestamp] = 30 | s match { 31 | case "" => None 32 | case ts => Some(ts) 33 | } 34 | 35 | implicit def stringToOptionalDouble(s: String): Option[Double] = 36 | s match { 37 | case "" => None 38 | case c => Some(c.toDouble) 39 | } 40 | 41 | def fromXML(node: scala.xml.Node, sequence: Int): Changeset = { 42 | val id = (node \@ "id").toLong 43 | // Old changesets lack the appropriate field 44 | val commentsCount = Try((node \@ "comments_count").toInt).toOption.getOrElse(0) 45 | val uid = (node \@ "uid").toLong 46 | val user = node \@ "user" 47 | val numChanges = Try((node \@ "num_changes").toInt).toOption.getOrElse(0) 48 | val open = (node \@ "open").toBoolean 49 | val closedAt = node \@ "closed_at" 50 | val createdAt = node \@ "created_at" 51 | 52 | val maxLon = node \@ "max_lon" 53 | val minLon = node \@ "min_lon" 54 | val maxLat = node \@ "max_lat" 55 | val minLat = node \@ "min_lat" 56 | val tags = 57 | (node \ "tag").map(tag => (tag \@ "k", tag \@ "v")).toMap 58 | val comments = (node \ "discussion" \ "comment").map(ChangesetComment.fromXML) 59 | 60 | Changeset( 61 | id, 62 | tags, 63 | createdAt, 64 | open, 65 | closedAt, 66 | commentsCount, 67 | minLat, 68 | maxLat, 69 | minLon, 70 | maxLon, 71 | numChanges, 72 | uid, 73 | user, 74 | comments, 75 | sequence 76 | ) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/model/ChangesetComment.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.model 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.joda.time.DateTime 6 | 7 | case class ChangesetComment(date: Timestamp, user: String, uid: Long, body: String) 8 | 9 | object ChangesetComment { 10 | implicit def stringToTimestamp(s: String): Timestamp = 11 | Timestamp.from(DateTime.parse(s).toDate.toInstant) 12 | 13 | def fromXML(node: scala.xml.Node): ChangesetComment = { 14 | val date = node \@ "date" 15 | val user = node \@ "user" 16 | val uid = (node \@ "uid").toLong 17 | val body = (node \ "text").text 18 | 19 | ChangesetComment(date, user, uid, body) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/model/ElementWithSequence.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.model 2 | 3 | import vectorpipe.model 4 | 5 | import org.joda.time.format.ISODateTimeFormat 6 | 7 | import io.circe._ 8 | import cats.syntax.either._ 9 | 10 | import java.sql.Timestamp 11 | 12 | 13 | // TODO is this an AugmentedDiff or an OSM Element w/ a sequence property? 
14 | // an AugmentedDiff may be (Option[Element with Sequence], Element with Sequence) 15 | case class ElementWithSequence(id: Long, 16 | `type`: String, 17 | tags: Map[String, String], 18 | nds: Seq[Long], 19 | changeset: Long, 20 | timestamp: Timestamp, 21 | uid: Long, 22 | user: String, 23 | version: Int, 24 | visible: Option[Boolean], 25 | sequence: Option[Long]) { 26 | // TODO extract this; it's used in MakeTiles and elsewhere 27 | val elementId: String = `type` match { 28 | case "node" => s"n$id" 29 | case "way" => s"w$id" 30 | case "relation" => s"r$id" 31 | case _ => id.toString 32 | } 33 | } 34 | 35 | object ElementWithSequence { 36 | implicit val decodeFoo: Decoder[ElementWithSequence] = new Decoder[ElementWithSequence] { 37 | final def apply(c: HCursor): Decoder.Result[ElementWithSequence] = 38 | for { 39 | id <- c.downField("id").as[Long] 40 | `type` <- c.downField("type").as[String] 41 | tags <- c.downField("tags").as[Map[String, String]] 42 | nds <- c.downField("nds").as[Option[Seq[Long]]] 43 | changeset <- c.downField("changeset").as[Long] 44 | timestampS <- c.downField("timestamp").as[String] 45 | uid <- c.downField("uid").as[Long] 46 | user <- c.downField("user").as[String] 47 | version <- c.downField("version").as[Int] 48 | visible <- c.downField("visible").as[Option[Boolean]] 49 | sequence <- c.downField("augmentedDiff").as[Option[Long]] 50 | } yield { 51 | val timestamp = 52 | Timestamp.from( 53 | ISODateTimeFormat 54 | .dateTimeParser() 55 | .parseDateTime(timestampS) 56 | .toDate 57 | .toInstant 58 | ) 59 | model.ElementWithSequence( 60 | id, 61 | `type`, 62 | tags, 63 | nds.getOrElse(Seq.empty[Long]), 64 | changeset, 65 | timestamp, 66 | uid, 67 | user, 68 | version, 69 | visible, 70 | sequence 71 | ) 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/model/Member.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.model 2 | 3 | import vectorpipe.internal.{NodeType, RelationType, WayType} 4 | 5 | import scala.xml.Node 6 | 7 | case class Member(`type`: Byte, ref: Long, role: String) 8 | 9 | object Member { 10 | def typeFromString(str: String): Byte = str match { 11 | case "node" => NodeType 12 | case "way" => WayType 13 | case "relation" => RelationType 14 | case _ => null.asInstanceOf[Byte] 15 | } 16 | 17 | def stringFromByte(b: Byte): String = b match { 18 | case NodeType => "node" 19 | case WayType => "way" 20 | case RelationType => "relation" 21 | } 22 | 23 | def fromXML(node: Node): Member = { 24 | val `type` = typeFromString(node \@ "type") 25 | val ref = (node \@ "ref").toLong 26 | val role = node \@ "role" 27 | 28 | Member(`type`, ref, role) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/model/Nd.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.model 2 | import scala.xml.Node 3 | 4 | case class Nd(ref: Long) 5 | 6 | object Nd { 7 | def fromXML(node: Node): Nd = 8 | Nd((node \@ "ref").toLong) 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/relations/MultiPolygons.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.relations 2 | import java.sql.Timestamp 3 | 4 | import org.locationtech.jts.geom.prep.PreparedGeometryFactory 5 | import 
org.locationtech.jts.geom.{Geometry, LineString, Polygon, TopologyException} 6 | import org.apache.log4j.Logger 7 | import vectorpipe.internal.WayType 8 | 9 | object MultiPolygons { 10 | @transient private lazy val logger = Logger.getLogger(getClass) 11 | val prepGeomFactory = new PreparedGeometryFactory 12 | 13 | def build(id: Long, 14 | version: Int, 15 | timestamp: Timestamp, 16 | types: Seq[Byte], 17 | roles: Seq[String], 18 | _geoms: Seq[Geometry]): Option[Geometry] = { 19 | if (types.zip(_geoms).exists { case (t, g) => t == WayType && Option(g).isEmpty }) { 20 | // bail early if null values are present where they should exist (members w/ type=way) 21 | logger.debug(s"Incomplete relation: $id @ $version ($timestamp)") 22 | None 23 | } else if (types.isEmpty) { 24 | // empty relation 25 | None 26 | } else { 27 | val geomCount = _geoms.map(Option(_)).count(_.isDefined) 28 | 29 | logger.debug(s"$id @ $version ($timestamp) ${geomCount.formatted("%,d")} geoms") 30 | val geoms = _geoms.map { 31 | case geom: Polygon => Some(geom.getExteriorRing) 32 | case geom: LineString => Some(geom) 33 | case _ => None 34 | } 35 | 36 | val vertexCount = geoms.filter(_.isDefined).map(_.get).map(_.getNumPoints).sum 37 | logger.warn(s"${vertexCount.formatted("%,d")} vertices (${geomCount 38 | .formatted("%,d")} geoms) from ${types.size} members in $id @ $version ($timestamp)") 39 | 40 | val members: Seq[(String, LineString)] = roles 41 | .zip(geoms) 42 | .filter(_._2.isDefined) 43 | .map(x => (x._1, x._2.get)) 44 | 45 | val (complete, partial) = 46 | members.foldLeft((Vector.empty[Polygon], Vector.empty[LineString])) { 47 | case ((c, p), (role, line: LineString)) => 48 | role match { 49 | case "outer" if line.isClosed && line.getNumPoints >= 4 => 50 | (c :+ geometryFactory.createPolygon(line.getCoordinates), p) 51 | case "outer" => 52 | (c, p :+ line) 53 | case "inner" if line.isClosed && line.getNumPoints >= 4 => 54 | (c :+ geometryFactory.createPolygon(line.getCoordinates), p) 55 | case "inner" => (c, p :+ line) 56 | case "" if line.isClosed && line.getNumPoints >= 4 => 57 | (c :+ geometryFactory.createPolygon(line.getCoordinates), p) 58 | case "" => 59 | (c, p :+ line) 60 | case _ => 61 | (c, p) 62 | } 63 | } 64 | 65 | try { 66 | val rings = complete ++ formRings(partial.sortWith(_.getNumPoints > _.getNumPoints)) 67 | val preparedRings = rings.map(prepGeomFactory.create) 68 | 69 | // reclassify rings according to their topology (ignoring roles) 70 | val (classifiedOuters, classifiedInners) = rings.sortWith(_.getArea > _.getArea) match { 71 | case Seq(h, t @ _*) => 72 | t.foldLeft((Array(h), Array.empty[Polygon])) { 73 | case ((os, is), ring) => 74 | // check the number of containing elements 75 | preparedRings.count(r => r.getGeometry != ring && r.contains(ring)) % 2 match { 76 | // if even, it's an outer ring 77 | case 0 => (os :+ ring, is) 78 | // if odd, it's an inner ring 79 | case 1 => (os, is :+ ring) 80 | } 81 | } 82 | case rs if rs.isEmpty => (Array.empty[Polygon], Array.empty[Polygon]) 83 | } 84 | 85 | val (dissolvedOuters, addlInners) = 86 | dissolveRings(classifiedOuters) 87 | val (dissolvedInners, addlOuters) = 88 | dissolveRings( 89 | classifiedInners 90 | .map(_.getExteriorRing.getCoordinates) 91 | .map(geometryFactory.createPolygon) ++ addlInners) 92 | 93 | val (polygons, _) = 94 | (dissolvedOuters ++ addlOuters) 95 | // sort by size (descending) to use rings as part of the largest available polygon 96 | .sortWith(_.getArea > _.getArea) 97 | // only use inners once if they're contained by 
multiple outer rings 98 | .foldLeft((Vector.empty[Polygon], dissolvedInners)) { 99 | case ((ps, is), outer) => 100 | val preparedOuter = prepGeomFactory.create(outer) 101 | (ps :+ geometryFactory.createPolygon( 102 | geometryFactory.createLinearRing(outer.getExteriorRing.getCoordinates), 103 | is.filter(inner => preparedOuter.contains(inner)) 104 | .map({ x => geometryFactory.createLinearRing(x.getExteriorRing.getCoordinates) 105 | }) 106 | .toArray 107 | ), 108 | is.filterNot(inner => preparedOuter.contains(inner))) 109 | } 110 | 111 | polygons match { 112 | case v @ Vector(p: Polygon) if v.length == 1 => Some(p) 113 | case ps => Some(geometryFactory.createMultiPolygon(ps.toArray)) 114 | } 115 | } catch { 116 | case e @ (_: AssemblyException | _: IllegalArgumentException | _: TopologyException) => 117 | logger.warn( 118 | s"Could not reconstruct relation $id @ $version ($timestamp): ${e.getMessage}") 119 | None 120 | case e: Throwable => 121 | logger.warn(s"Could not reconstruct relation $id @ $version ($timestamp): $e") 122 | e.getStackTrace.foreach(logger.warn) 123 | None 124 | } 125 | } 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/relations/Routes.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.relations 2 | import java.sql.Timestamp 3 | 4 | import geotrellis.vector._ 5 | import org.locationtech.jts.geom.TopologyException 6 | import org.apache.log4j.Logger 7 | import vectorpipe.internal.WayType 8 | 9 | object Routes { 10 | @transient private lazy val logger = Logger.getLogger(getClass) 11 | 12 | def build(id: Long, 13 | version: Int, 14 | timestamp: Timestamp, 15 | types: Seq[Byte], 16 | roles: Seq[String], 17 | geoms: Seq[Geometry]): Option[Seq[(String, Geometry)]] = { 18 | if (types.zip(geoms).exists { case (t, g) => t == WayType && Option(g).isEmpty }) { 19 | // bail early if null values are present where they should exist (members w/ type=way) 20 | logger.debug(s"Incomplete relation: $id @ $version ($timestamp)") 21 | None 22 | } else if (types.isEmpty) { 23 | // empty relation 24 | None 25 | } else { 26 | 27 | try { 28 | val res = roles 29 | .zip(geoms.map(Option.apply)) 30 | .filter(_._2.isDefined) 31 | .map(x => (x._1, x._2.get)) 32 | .groupBy { 33 | case (role, _) => role 34 | } 35 | .mapValues(_.map(_._2)) 36 | .mapValues(connectSegments) 37 | .map { 38 | case (role, lines) => 39 | lines match { 40 | case Seq(line) => (role, line) 41 | case _ => (role, geometryFactory.createMultiLineString(lines.toArray)) 42 | } 43 | } 44 | .toSeq 45 | 46 | Some(res) 47 | } catch { 48 | case e @ (_: AssemblyException | _: IllegalArgumentException | _: TopologyException) => 49 | logger.warn( 50 | s"Could not reconstruct route relation $id @ $version ($timestamp): ${e.getMessage}") 51 | None 52 | case e: Throwable => 53 | logger.warn(s"Could not reconstruct route relation $id @ $version ($timestamp): $e") 54 | e.getStackTrace.foreach(logger.warn) 55 | None 56 | } 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/relations/package.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe 2 | 3 | import org.locationtech.jts.geom._ 4 | import vectorpipe.relations.utils.{ 5 | PartialCoordinateSequence, 6 | ReversedCoordinateSequence, 7 | VirtualCoordinateSequence, 8 | isEqual 9 | } 10 | 11 | import 
scala.annotation.tailrec 12 | import scala.collection.GenTraversable 13 | 14 | package object relations { 15 | 16 | // join segments together 17 | @tailrec 18 | def connectSegments(segments: GenTraversable[VirtualCoordinateSequence], 19 | lines: Seq[CoordinateSequence] = Vector.empty[CoordinateSequence]) 20 | : GenTraversable[CoordinateSequence] = { 21 | segments match { 22 | case Nil => 23 | lines 24 | case Seq(h, t @ _*) => 25 | val x = h.getX(h.size - 1) 26 | val y = h.getY(h.size - 1) 27 | 28 | t.find(line => x == line.getX(0) && y == line.getY(0)) match { 29 | case Some(next) => 30 | connectSegments(h.append(new PartialCoordinateSequence(next, 1)) +: t.filterNot(line => 31 | isEqual(line, next)), 32 | lines) 33 | case None => 34 | t.find(line => x == line.getX(line.size - 1) && y == line.getY(line.size - 1)) match { 35 | case Some(next) => 36 | connectSegments(h.append( 37 | new PartialCoordinateSequence( 38 | new ReversedCoordinateSequence(next), 39 | 1)) +: t.filterNot(line => isEqual(line, next)), 40 | lines) 41 | case None => connectSegments(t, lines :+ h) 42 | } 43 | } 44 | } 45 | } 46 | 47 | def connectSegments(segments: GenTraversable[Geometry])( 48 | implicit geometryFactory: GeometryFactory): GenTraversable[LineString] = 49 | connectSegments( 50 | segments 51 | .flatMap { 52 | case geom: LineString => Some(geom.getCoordinateSequence) 53 | case _ => None 54 | } 55 | .map(s => new VirtualCoordinateSequence(Seq(s))) 56 | ).map(geometryFactory.createLineString) 57 | 58 | // since GeoTrellis's GeometryFactory is unavailable 59 | implicit val geometryFactory: GeometryFactory = new GeometryFactory() 60 | 61 | // join segments together into rings 62 | @tailrec 63 | def formRings(segments: GenTraversable[VirtualCoordinateSequence], 64 | rings: Seq[CoordinateSequence] = Vector.empty[CoordinateSequence]) 65 | : GenTraversable[CoordinateSequence] = { 66 | segments match { 67 | case Nil => 68 | rings 69 | case Seq(h, t @ _*) if h.getX(0) == h.getX(h.size - 1) && h.getY(0) == h.getY(h.size - 1) => 70 | formRings(t, rings :+ h) 71 | case Seq(h, t @ _*) => 72 | val x = h.getX(h.size - 1) 73 | val y = h.getY(h.size - 1) 74 | 75 | formRings( 76 | t.find(line => x == line.getX(0) && y == line.getY(0)) match { 77 | case Some(next) => 78 | h.append(new PartialCoordinateSequence(next, 1)) +: t.filterNot(line => 79 | isEqual(line, next)) 80 | case None => 81 | t.find(line => x == line.getX(line.size - 1) && y == line.getY(line.size - 1)) match { 82 | case Some(next) => 83 | h.append(new PartialCoordinateSequence(new ReversedCoordinateSequence(next), 1)) +: t 84 | .filterNot(line => isEqual(line, next)) 85 | case None => throw new AssemblyException("Unable to connect segments.") 86 | } 87 | }, 88 | rings 89 | ) 90 | } 91 | } 92 | 93 | def formRings(segments: GenTraversable[LineString])( 94 | implicit geometryFactory: GeometryFactory): GenTraversable[Polygon] = { 95 | val csf = geometryFactory.getCoordinateSequenceFactory 96 | formRings(segments.map(_.getCoordinateSequence).map(s => new VirtualCoordinateSequence(Seq(s)))) 97 | .map(csf.create(_)) 98 | .map(geometryFactory.createPolygon) 99 | } 100 | 101 | def dissolveRings(rings: Array[Polygon]): (Seq[Polygon], Seq[Polygon]) = { 102 | Option(geometryFactory.createGeometryCollection(rings.asInstanceOf[Array[Geometry]]).union) match { 103 | case Some(mp) => 104 | val polygons = for (i <- 0 until mp.getNumGeometries) yield { 105 | mp.getGeometryN(i).asInstanceOf[Polygon] 106 | } 107 | 108 | 
(polygons.map(_.getExteriorRing.getCoordinates).map(geometryFactory.createPolygon), 109 | polygons.flatMap(getInteriorRings).map(geometryFactory.createPolygon)) 110 | case None => 111 | (Vector.empty[Polygon], Vector.empty[Polygon]) 112 | } 113 | } 114 | 115 | def getInteriorRings(p: Polygon): Seq[LinearRing] = 116 | for (i <- 0 until p.getNumInteriorRing) 117 | yield geometryFactory.createLinearRing(p.getInteriorRingN(i).getCoordinates) 118 | 119 | class AssemblyException(msg: String) extends Exception(msg) 120 | } 121 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/relations/utils/PartialCoordinateSequence.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.relations.utils 2 | import org.locationtech.jts.geom.{Coordinate, CoordinateSequence, Envelope} 3 | 4 | class PartialCoordinateSequence(sequence: CoordinateSequence, offset: Int) 5 | extends CoordinateSequence { 6 | private lazy val _size: Int = sequence.size() - offset 7 | 8 | private lazy val coordinates: Array[Coordinate] = { 9 | val coords = new Array[Coordinate](size()) 10 | 11 | for (i <- 0 until size) { 12 | coords(i) = getCoordinate(i) 13 | } 14 | 15 | coords 16 | } 17 | 18 | override def getDimension: Int = sequence.getDimension 19 | 20 | override def getCoordinate(i: Int): Coordinate = sequence.getCoordinate(offset + i) 21 | 22 | override def getCoordinateCopy(i: Int): Coordinate = sequence.getCoordinateCopy(offset + i) 23 | 24 | override def getCoordinate(index: Int, coord: Coordinate): Unit = 25 | sequence.getCoordinate(offset + index, coord) 26 | 27 | override def getOrdinate(index: Int, ordinateIndex: Int): Double = 28 | sequence.getOrdinate(offset + index, ordinateIndex) 29 | 30 | override def setOrdinate(index: Int, ordinateIndex: Int, value: Double): Unit = 31 | sequence.setOrdinate(offset + index, ordinateIndex, value) 32 | 33 | override def toCoordinateArray: Array[Coordinate] = coordinates 34 | 35 | override def expandEnvelope(env: Envelope): Envelope = { 36 | for (i <- 0 until size) { 37 | env.expandToInclude(getX(i), getY(i)) 38 | } 39 | 40 | env 41 | } 42 | 43 | override def getX(index: Int): Double = sequence.getX(offset + index) 44 | 45 | override def getY(index: Int): Double = sequence.getY(offset + index) 46 | 47 | override def size(): Int = _size 48 | 49 | override def clone(): AnyRef = new PartialCoordinateSequence(sequence, offset) 50 | 51 | override def copy(): PartialCoordinateSequence = new PartialCoordinateSequence(sequence.copy, offset) 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/relations/utils/ReversedCoordinateSequence.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.relations.utils 2 | import org.locationtech.jts.geom.{Coordinate, CoordinateSequence, Envelope} 3 | 4 | class ReversedCoordinateSequence(sequence: CoordinateSequence) extends CoordinateSequence { 5 | private lazy val coordinates: Array[Coordinate] = { 6 | val coords = new Array[Coordinate](size()) 7 | 8 | for (i <- size - 1 to 0 by -1) { 9 | coords(i) = getCoordinate(i) 10 | } 11 | 12 | coords 13 | } 14 | 15 | override def getDimension: Int = sequence.getDimension 16 | 17 | override def getCoordinate(i: Int): Coordinate = sequence.getCoordinate(getIndex(i)) 18 | 19 | override def getCoordinateCopy(i: Int): Coordinate = sequence.getCoordinateCopy(getIndex(i)) 20 | 21 | override def 
getCoordinate(index: Int, coord: Coordinate): Unit = 22 | sequence.getCoordinate(getIndex(index), coord) 23 | 24 | private def getIndex(i: Int): Int = size - 1 - i 25 | 26 | override def size(): Int = sequence.size 27 | 28 | override def getX(index: Int): Double = sequence.getX(getIndex(index)) 29 | 30 | override def getY(index: Int): Double = sequence.getY(getIndex(index)) 31 | 32 | override def getOrdinate(index: Int, ordinateIndex: Int): Double = 33 | sequence.getOrdinate(getIndex(index), ordinateIndex) 34 | 35 | override def setOrdinate(index: Int, ordinateIndex: Int, value: Double): Unit = 36 | sequence.setOrdinate(getIndex(index), ordinateIndex, value) 37 | 38 | override def toCoordinateArray: Array[Coordinate] = coordinates 39 | 40 | override def expandEnvelope(env: Envelope): Envelope = sequence.expandEnvelope(env) 41 | 42 | override def clone(): AnyRef = new ReversedCoordinateSequence(sequence) 43 | 44 | override def copy(): ReversedCoordinateSequence = new ReversedCoordinateSequence(sequence.copy) 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/relations/utils/VirtualCoordinateSequence.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.relations.utils 2 | import com.google.common.collect.{Range, RangeMap, TreeRangeMap} 3 | import org.locationtech.jts.geom.{Coordinate, CoordinateSequence, Envelope} 4 | 5 | // rather than being a nested set of CoordinateSequences, this is a mutable wrapper to avoid deep call stacks 6 | class VirtualCoordinateSequence(sequences: Seq[CoordinateSequence]) extends CoordinateSequence { 7 | // TODO this should be invalidated after append (but it doesn't actually matter because all of the appending will 8 | // occur ahead of time) 9 | private lazy val coordinates: Array[Coordinate] = { 10 | val coords = new Array[Coordinate](size()) 11 | 12 | for (i <- 0 until size) { 13 | coords(i) = getCoordinate(i) 14 | } 15 | 16 | coords 17 | } 18 | 19 | private val rangeMap: RangeMap[Integer, CoordinateSequence] = { 20 | val rm = TreeRangeMap.create[Integer, CoordinateSequence] 21 | 22 | sequences 23 | .zip(sequences.map(_.size).scanLeft(0)(_ + _).dropRight(1)) 24 | .map { 25 | case (seq, offset) => (seq, Range.closed(offset: Integer, offset + seq.size - 1: Integer)) 26 | } 27 | .foreach { case (seq, range) => rm.put(range, seq) } 28 | 29 | rm 30 | } 31 | 32 | private var dimension: Int = sequences.map(_.getDimension).min 33 | 34 | private var _size: Int = sequences.map(_.size).sum 35 | 36 | def append(sequence: CoordinateSequence): VirtualCoordinateSequence = { 37 | val upperEndpoint = rangeMap.span.upperEndpoint 38 | val range = Range.closed(upperEndpoint + 1: Integer, upperEndpoint + sequence.size: Integer) 39 | rangeMap.put(range, sequence) 40 | 41 | dimension = Math.min(dimension, sequence.getDimension) 42 | _size += sequence.size 43 | 44 | this 45 | } 46 | 47 | override def getDimension: Int = dimension 48 | 49 | override def getCoordinate(i: Int): Coordinate = { 50 | val (sequence, index) = getSequence(i) 51 | 52 | // bypass PackedCoordinateSequence.getCoordinate to prevent caching and associated allocation 53 | new Coordinate(sequence.getX(index), sequence.getY(index)) 54 | } 55 | 56 | private def getSequence(i: Int): (CoordinateSequence, Int) = { 57 | val entry = rangeMap.getEntry(i: Integer) 58 | 59 | (entry.getValue, i - entry.getKey.lowerEndpoint) 60 | } 61 | 62 | override def getCoordinateCopy(i: Int): Coordinate = { 63 | 
val (sequence, index) = getSequence(i) 64 | 65 | sequence.getCoordinateCopy(index) 66 | } 67 | 68 | override def getCoordinate(i: Int, coord: Coordinate): Unit = { 69 | val (sequence, index) = getSequence(i) 70 | 71 | sequence.getCoordinate(index, coord) 72 | } 73 | 74 | override def getOrdinate(i: Int, ordinateIndex: Int): Double = { 75 | val (sequence, index) = getSequence(i) 76 | 77 | sequence.getOrdinate(index, ordinateIndex) 78 | } 79 | 80 | override def setOrdinate(i: Int, ordinateIndex: Int, value: Double): Unit = { 81 | val (sequence, index) = getSequence(i) 82 | 83 | sequence.setOrdinate(index, ordinateIndex, value) 84 | } 85 | 86 | override def toCoordinateArray: Array[Coordinate] = coordinates 87 | 88 | override def expandEnvelope(env: Envelope): Envelope = { 89 | for (i <- 0 until size) { 90 | env.expandToInclude(getX(i), getY(i)) 91 | } 92 | 93 | env 94 | } 95 | 96 | override def getX(i: Int): Double = { 97 | val (sequence, index) = getSequence(i) 98 | 99 | sequence.getX(index) 100 | } 101 | 102 | override def getY(i: Int): Double = { 103 | val (sequence, index) = getSequence(i) 104 | 105 | sequence.getY(index) 106 | } 107 | 108 | override def size(): Int = _size 109 | 110 | override def clone(): AnyRef = { 111 | // we're already playing fast and loose 112 | this 113 | } 114 | 115 | override def copy(): VirtualCoordinateSequence = new VirtualCoordinateSequence(sequences.map(_.copy)) 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/relations/utils/package.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.relations 2 | 3 | import org.locationtech.jts.geom.CoordinateSequence 4 | 5 | package object utils { 6 | 7 | /** 8 | * Tests whether two {@link CoordinateSequence}s are equal. 9 | * To be equal, the sequences must be the same length. 10 | * They do not need to be of the same dimension, 11 | * but the ordinate values for the smallest dimension of the two 12 | * must be equal. 13 | * Two NaN ordinates values are considered to be equal. 
14 | * 15 | * Ported to Scala from JTS 1.15.0 16 | * 17 | * @param cs1 a CoordinateSequence 18 | * @param cs2 a CoordinateSequence 19 | * @return true if the sequences are equal in the common dimensions 20 | */ 21 | def isEqual(cs1: CoordinateSequence, cs2: CoordinateSequence): Boolean = { 22 | if (cs1.size != cs2.size) { 23 | false 24 | } else { 25 | val dim = Math.min(cs1.getDimension, cs2.getDimension) 26 | (0 until cs1.size).forall(i => { 27 | (0 until dim).forall(d => { 28 | val v1 = cs1.getOrdinate(i, d) 29 | val v2 = cs2.getOrdinate(i, d) 30 | 31 | v1 == v2 || (v1.isNaN && v2.isNaN) 32 | }) 33 | }) 34 | } 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/AugmentedDiffMicroBatchReader.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.net.URI 4 | import java.util 5 | 6 | import org.apache.spark.internal.Logging 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.sql.sources.v2.DataSourceOptions 9 | import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader} 10 | import vectorpipe.model.AugmentedDiff 11 | 12 | import scala.collection.JavaConverters._ 13 | import scala.compat.java8.OptionConverters._ 14 | 15 | case class AugmentedDiffStreamBatchTask(baseURI: URI, sequences: Seq[Int], handler: (Int, AugmentedDiffSource.RF) => Unit) 16 | extends InputPartition[InternalRow] { 17 | override def createPartitionReader(): InputPartitionReader[InternalRow] = 18 | AugmentedDiffStreamBatchReader(baseURI, sequences, handler) 19 | } 20 | 21 | case class AugmentedDiffStreamBatchReader(baseURI: URI, sequences: Seq[Int], handler: (Int, AugmentedDiffSource.RF) => Unit) 22 | extends ReplicationStreamBatchReader[AugmentedDiff](baseURI, sequences) { 23 | 24 | override def getSequence(baseURI: URI, sequence: Int): Seq[AugmentedDiff] = 25 | AugmentedDiffSource.getSequence(baseURI, sequence, handler) 26 | } 27 | 28 | case class AugmentedDiffMicroBatchReader(options: DataSourceOptions, checkpointLocation: String) 29 | extends ReplicationStreamMicroBatchReader[AugmentedDiff](options, checkpointLocation) 30 | with Logging { 31 | 32 | override def getCurrentSequence: Option[Int] = 33 | AugmentedDiffSource.getCurrentSequence(baseURI) 34 | 35 | private def baseURI: URI = 36 | options 37 | .get(Source.BaseURI) 38 | .asScala 39 | .map(new URI(_)) 40 | .getOrElse( 41 | throw new RuntimeException( 42 | s"${Source.BaseURI} is a required option for ${Source.AugmentedDiffs}" 43 | ) 44 | ) 45 | 46 | private def errorHandler: AugmentedDiffSourceErrorHandler = { 47 | val handlerClass = options 48 | .get(Source.ErrorHandler) 49 | .asScala 50 | .getOrElse("vectorpipe.sources.AugmentedDiffSourceErrorHandler") 51 | 52 | val handler = Class.forName(handlerClass).newInstance.asInstanceOf[AugmentedDiffSourceErrorHandler] 53 | handler.setOptions(options.asMap.asScala.toMap) 54 | handler 55 | } 56 | 57 | override def planInputPartitions(): util.List[InputPartition[InternalRow]] = 58 | sequenceRange 59 | .map(seq => 60 | AugmentedDiffStreamBatchTask(baseURI, Seq(seq), errorHandler.handle).asInstanceOf[InputPartition[InternalRow]]) 61 | .asJava 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/AugmentedDiffProvider.scala: -------------------------------------------------------------------------------- 1 | package 
vectorpipe.sources 2 | 3 | import java.util.Optional 4 | 5 | import org.apache.spark.sql.sources.DataSourceRegister 6 | import org.apache.spark.sql.sources.v2.reader.DataSourceReader 7 | import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader 8 | import org.apache.spark.sql.sources.v2.{ 9 | DataSourceOptions, 10 | DataSourceV2, 11 | MicroBatchReadSupport, 12 | ReadSupport 13 | } 14 | import org.apache.spark.sql.types.StructType 15 | 16 | class AugmentedDiffProvider 17 | extends DataSourceV2 18 | with ReadSupport 19 | with MicroBatchReadSupport 20 | with DataSourceRegister { 21 | override def createMicroBatchReader( 22 | schema: Optional[StructType], 23 | checkpointLocation: String, 24 | options: DataSourceOptions 25 | ): MicroBatchReader = { 26 | if (schema.isPresent) { 27 | throw new IllegalStateException( 28 | "The augmented diff source does not support a user-specified schema." 29 | ) 30 | } 31 | 32 | AugmentedDiffMicroBatchReader(options, checkpointLocation) 33 | } 34 | 35 | override def shortName(): String = Source.AugmentedDiffs 36 | override def createReader(options: DataSourceOptions): DataSourceReader = 37 | AugmentedDiffReader(options) 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/AugmentedDiffReader.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.net.URI 4 | import java.util 5 | 6 | import geotrellis.vector.Geometry 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.sql.sources.v2.DataSourceOptions 9 | import org.apache.spark.sql.sources.v2.reader.InputPartition 10 | import vectorpipe.model.{AugmentedDiff, ElementWithSequence} 11 | import vectorpipe.util.RobustFeature 12 | 13 | import scala.collection.JavaConverters._ 14 | import scala.compat.java8.OptionConverters._ 15 | import scala.util.Random 16 | 17 | case class AugmentedDiffReader(options: DataSourceOptions) 18 | extends ReplicationReader[AugmentedDiff](options) { 19 | override def planInputPartitions(): util.List[InputPartition[InternalRow]] = { 20 | // prevent sequential diffs from being assigned to the same task 21 | val sequences = Random.shuffle((startSequence to endSequence).toList) 22 | 23 | sequences 24 | .grouped(Math.max(1, sequences.length / partitionCount)) 25 | .toList 26 | .map( 27 | AugmentedDiffStreamBatchTask(baseURI, _, errorHandler.handle) 28 | .asInstanceOf[InputPartition[InternalRow]] 29 | ) 30 | .asJava 31 | } 32 | 33 | private def baseURI: URI = 34 | options 35 | .get(Source.BaseURI) 36 | .asScala 37 | .map(new URI(_)) 38 | .getOrElse( 39 | throw new RuntimeException( 40 | s"${Source.BaseURI} is a required option for ${Source.AugmentedDiffs}" 41 | ) 42 | ) 43 | 44 | 45 | private def errorHandler: AugmentedDiffSourceErrorHandler = { 46 | val handlerClass = options 47 | .get(Source.ErrorHandler) 48 | .asScala 49 | .getOrElse("vectorpipe.sources.AugmentedDiffSourceErrorHandler") 50 | 51 | val handler = Class.forName(handlerClass).newInstance.asInstanceOf[AugmentedDiffSourceErrorHandler] 52 | handler.setOptions(options.asMap.asScala.toMap) 53 | handler 54 | } 55 | 56 | override def getCurrentSequence: Option[Int] = AugmentedDiffSource.getCurrentSequence(baseURI) 57 | } 58 | 59 | 60 | class AugmentedDiffSourceErrorHandler extends Serializable { 61 | def setOptions(options: Map[String, String]): Unit = () 62 | 63 | def handle(sequence: Int, feature: RobustFeature[Geometry, 
ElementWithSequence]): Unit = () 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/AugmentedDiffSource.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.io.{BufferedInputStream, File} 4 | import java.net.URI 5 | import java.nio.charset.StandardCharsets 6 | import java.sql.Timestamp 7 | import java.time.Instant 8 | import java.util.zip.GZIPInputStream 9 | 10 | import geotrellis.store.s3._ 11 | import geotrellis.vector._ 12 | 13 | import vectorpipe.model.{AugmentedDiff, ElementWithSequence} 14 | import vectorpipe.util._ 15 | //import vectorpipe.util.RobustFeatureFormats._ 16 | 17 | import org.apache.commons.io.IOUtils 18 | import org.apache.spark.internal.Logging 19 | import org.apache.spark.sql.Column 20 | import org.apache.spark.sql.functions.{floor, from_unixtime, to_timestamp, unix_timestamp} 21 | 22 | import _root_.io.circe._ 23 | import _root_.io.circe.generic.auto._ 24 | import cats.implicits._ 25 | 26 | import software.amazon.awssdk.services.s3.model.{GetObjectRequest, NoSuchKeyException, S3Exception} 27 | import software.amazon.awssdk.services.s3.S3Client 28 | import com.softwaremill.macmemo.memoize 29 | import org.joda.time.DateTime 30 | 31 | import scala.concurrent.duration.{Duration, _} 32 | 33 | 34 | object AugmentedDiffSource extends Logging { 35 | type RF = RobustFeature[Geometry, ElementWithSequence] 36 | 37 | private lazy val s3: S3Client = S3ClientProducer.get() 38 | val Delay: Duration = 15.seconds 39 | 40 | private implicit val dateTimeDecoder: Decoder[DateTime] = 41 | Decoder.instance(a => a.as[String].map(DateTime.parse)) 42 | 43 | def getFeatures(baseURI: URI, sequence: Int): Seq[Map[String, RF]] = { 44 | val bucket = baseURI.getHost 45 | val prefix = new File(baseURI.getPath.drop(1)).toPath 46 | // left-pad sequence 47 | val s = f"$sequence%09d" 48 | val key = prefix.resolve(s"${s.slice(0, 3)}/${s.slice(3, 6)}/${s.slice(6, 9)}.json.gz").toString 49 | 50 | logDebug(s"Fetching sequence $sequence") 51 | 52 | val obj = s3.getObject( 53 | GetObjectRequest 54 | .builder 55 | .bucket(bucket) 56 | .key(key) 57 | .build 58 | ) 59 | 60 | val bis = new BufferedInputStream(obj) 61 | val gzis = new GZIPInputStream(bis) 62 | 63 | try { 64 | IOUtils 65 | .toString(gzis, StandardCharsets.UTF_8) 66 | .lines 67 | .map { line => 68 | // Spark doesn't like RS-delimited JSON; perhaps Spray doesn't either 69 | line 70 | .replace("\u001e", "") 71 | .parseGeoJson[JsonRobustFeatureCollectionMap] 72 | .getAll[RF] 73 | } 74 | .toSeq 75 | } finally { 76 | gzis.close() 77 | bis.close() 78 | } 79 | } 80 | 81 | /** 82 | * Fetch all augmented diffs from a sequence number. 83 | * 84 | * This function collects the data in an augmented diff sequence file into 85 | * vectorpipe.model.AugmentedDiff objects. These diff files are expected to be 86 | * stored on S3 in .json.gz files. This method provides the option to process errors 87 | * generated when the new geometry in the diff is faulty. If `waitUntilAvailable` is 88 | * set to true, the process will block, in 15 second increments, until the sequence 89 | * file is available. 
90 | */ 91 | def getSequence(baseURI: URI, sequence: Int, badGeometryHandler: (Int, RF) => Unit, waitUntilAvailable: Boolean): Seq[AugmentedDiff] = { 92 | logDebug(s"Fetching sequence $sequence") 93 | 94 | try { 95 | val robustFeatureMaps = getFeatures(baseURI, sequence) 96 | 97 | robustFeatureMaps.map{ m => 98 | if (m.contains("new") && !m("new").geom.isDefined) badGeometryHandler(sequence, m("new")) 99 | AugmentedDiff(sequence, m.get("old").map(_.toFeature), m("new").toFeature) 100 | } 101 | } catch { 102 | case e: S3Exception if e.isInstanceOf[NoSuchKeyException] || e.statusCode == 403 => 103 | logInfo(s"Encountered missing sequence (baseURI = ${baseURI}, sequence = ${sequence}), comparing with current for validity") 104 | getCurrentSequence(baseURI) match { 105 | case Some(s) if s > sequence => 106 | logInfo(s"$sequence is missing, continuing") 107 | Seq.empty[AugmentedDiff] 108 | case _ => 109 | if (waitUntilAvailable) { 110 | logInfo(s"$sequence is not yet available, sleeping.") 111 | Thread.sleep(Delay.toMillis) 112 | getSequence(baseURI, sequence, waitUntilAvailable) 113 | } else 114 | throw e 115 | } 116 | case t: Throwable => 117 | if (waitUntilAvailable) { 118 | logError(s"sequence $sequence caused an error", t) 119 | Thread.sleep(Delay.toMillis) 120 | getSequence(baseURI, sequence) 121 | } else 122 | throw t 123 | } 124 | } 125 | 126 | def getSequence(baseURI: URI, sequence: Int): Seq[AugmentedDiff] = 127 | getSequence(baseURI, sequence, {(_: Int, _: RF) => ()}, true) 128 | 129 | def getSequence(baseURI: URI, sequence: Int, waitUntilAvailable: Boolean): Seq[AugmentedDiff] = 130 | getSequence(baseURI, sequence, {(_: Int, _: RF) => ()}, waitUntilAvailable) 131 | 132 | def getSequence(baseURI: URI, sequence: Int, badGeometryHandler: (Int, RF) => Unit): Seq[AugmentedDiff] = 133 | getSequence(baseURI, sequence, badGeometryHandler, true) 134 | 135 | @memoize(maxSize = 1, expiresAfter = 30 seconds) 136 | def getCurrentSequence(baseURI: URI): Option[Int] = { 137 | val bucket = baseURI.getHost 138 | val prefix = new File(baseURI.getPath.drop(1)).toPath 139 | val key = prefix.resolve("state.yaml").toString 140 | 141 | try { 142 | val request = GetObjectRequest.builder() 143 | .bucket(bucket) 144 | .key(key) 145 | .build() 146 | val response = s3.getObjectAsBytes(request) 147 | 148 | val body = IOUtils 149 | .toString(response.asInputStream, StandardCharsets.UTF_8.toString) 150 | 151 | val state = yaml.parser 152 | .parse(body) 153 | .leftMap(err => err: Error) 154 | .flatMap(_.as[State]) 155 | .valueOr(throw _) 156 | 157 | logDebug(s"$baseURI state: ${state.sequence} @ ${state.last_run}") 158 | 159 | Some(state.sequence) 160 | } catch { 161 | case err: Throwable => 162 | logError("Error fetching / parsing changeset state.", err) 163 | 164 | None 165 | } 166 | } 167 | 168 | def timestampToSequence(timestamp: Timestamp): Int = 169 | ((timestamp.toInstant.getEpochSecond - 1347432900) / 60).toInt 170 | 171 | def timestampToSequence(timestamp: Column): Column = 172 | floor((unix_timestamp(timestamp) - 1347432900) / 60) 173 | 174 | def sequenceToTimestamp(sequence: Int): Timestamp = 175 | Timestamp.from(Instant.ofEpochSecond(sequence.toLong * 60 + 1347432900L)) 176 | 177 | def sequenceToTimestamp(sequence: Column): Column = 178 | to_timestamp(from_unixtime(sequence * 60 + 1347432900)) 179 | 180 | case class State(last_run: DateTime, sequence: Int) 181 | } 182 | -------------------------------------------------------------------------------- 
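AugmentedDiffSource can also be driven directly, without going through the Spark data source. The sketch below is a hypothetical example of fetching a single sequence and converting between timestamps and sequence numbers; the S3 base URI is a placeholder, and the handler merely logs faulty "new" geometries rather than silently ignoring them, which is the default behaviour of the two- and three-argument overloads.

// Hypothetical sketch: reading one augmented diff sequence outside of Spark.
import java.net.URI
import java.sql.Timestamp
import vectorpipe.sources.AugmentedDiffSource

object AugmentedDiffExample {
  def main(args: Array[String]): Unit = {
    val baseURI  = new URI("s3://my-bucket/augmented-diffs/") // placeholder location
    val sequence = AugmentedDiffSource.timestampToSequence(Timestamp.valueOf("2019-01-01 00:00:00"))

    // Log faulty "new" geometries; do not block waiting for a sequence file that is not yet available.
    val diffs = AugmentedDiffSource.getSequence(
      baseURI,
      sequence,
      (seq: Int, _: AugmentedDiffSource.RF) => println(s"faulty new geometry in sequence $seq"),
      waitUntilAvailable = false)

    println(s"${diffs.length} diffs at ${AugmentedDiffSource.sequenceToTimestamp(sequence)}")
  }
}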
/src/main/scala/vectorpipe/sources/ChangeMicroBatchReader.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.net.URI 4 | import java.util 5 | 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.sources.v2.DataSourceOptions 8 | import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader} 9 | import vectorpipe.model.Change 10 | 11 | import scala.collection.JavaConverters._ 12 | 13 | case class ChangeStreamBatchTask(baseURI: URI, sequences: Seq[Int]) extends InputPartition[InternalRow] { 14 | override def createPartitionReader(): InputPartitionReader[InternalRow] = 15 | new ChangeStreamBatchReader(baseURI, sequences) 16 | } 17 | 18 | class ChangeStreamBatchReader(baseURI: URI, sequences: Seq[Int]) 19 | extends ReplicationStreamBatchReader[Change](baseURI, sequences) { 20 | 21 | override def getSequence(baseURI: URI, sequence: Int): Seq[Change] = 22 | ChangeSource.getSequence(baseURI, sequence) 23 | } 24 | 25 | case class ChangeMicroBatchReader(options: DataSourceOptions, checkpointLocation: String) 26 | extends ReplicationStreamMicroBatchReader[Change](options, checkpointLocation) { 27 | private lazy val baseURI = new URI( 28 | options 29 | .get(Source.BaseURI) 30 | .orElse("https://planet.osm.org/replication/minute/") 31 | ) 32 | 33 | override def getCurrentSequence: Option[Int] = 34 | ChangeSource.getCurrentSequence(baseURI) 35 | 36 | override def planInputPartitions(): util.List[InputPartition[InternalRow]] = 37 | sequenceRange 38 | .map( 39 | seq => ChangeStreamBatchTask(baseURI, Seq(seq)).asInstanceOf[InputPartition[InternalRow]] 40 | ) 41 | .asJava 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ChangeProvider.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.util.Optional 4 | 5 | import org.apache.spark.sql.sources.DataSourceRegister 6 | import org.apache.spark.sql.sources.v2.reader.DataSourceReader 7 | import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader 8 | import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, MicroBatchReadSupport, ReadSupport} 9 | import org.apache.spark.sql.types.StructType 10 | 11 | class ChangeProvider 12 | extends DataSourceV2 13 | with ReadSupport 14 | with MicroBatchReadSupport 15 | with DataSourceRegister { 16 | override def createMicroBatchReader( 17 | schema: Optional[StructType], 18 | checkpointLocation: String, 19 | options: DataSourceOptions 20 | ): MicroBatchReader = { 21 | if (schema.isPresent) { 22 | throw new IllegalStateException( 23 | "The changes source does not support a user-specified schema." 
24 | ) 25 | } 26 | 27 | ChangeMicroBatchReader(options, checkpointLocation) 28 | } 29 | 30 | override def shortName(): String = Source.Changes 31 | override def createReader(options: DataSourceOptions): DataSourceReader = 32 | ChangeReader(options) 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ChangeReader.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.net.URI 4 | import java.util 5 | 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.sources.v2.DataSourceOptions 8 | import org.apache.spark.sql.sources.v2.reader.InputPartition 9 | import vectorpipe.model.Change 10 | 11 | import scala.collection.JavaConverters._ 12 | import scala.util.Random 13 | 14 | case class ChangeReader(options: DataSourceOptions) extends ReplicationReader[Change](options) { 15 | override def planInputPartitions(): util.List[InputPartition[InternalRow]] = { 16 | // prevent sequential diffs from being assigned to the same task 17 | val sequences = Random.shuffle((startSequence to endSequence).toList) 18 | 19 | sequences 20 | .grouped(Math.max(1, sequences.length / partitionCount)) 21 | .toList 22 | .map( 23 | ChangeStreamBatchTask(baseURI, _) 24 | .asInstanceOf[InputPartition[InternalRow]] 25 | ) 26 | .asJava 27 | } 28 | 29 | private def baseURI = 30 | new URI( 31 | options 32 | .get(Source.BaseURI) 33 | .orElse("https://planet.osm.org/replication/minute/")) 34 | 35 | override def getCurrentSequence: Option[Int] = ChangeSource.getCurrentSequence(baseURI) 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ChangeSource.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.io.{ByteArrayInputStream, IOException, StringReader} 4 | import java.net.URI 5 | import java.util.Properties 6 | import java.util.zip.GZIPInputStream 7 | 8 | import com.softwaremill.macmemo.memoize 9 | import javax.xml.parsers.SAXParserFactory 10 | import org.apache.spark.internal.Logging 11 | import org.joda.time.DateTime 12 | import vectorpipe.model 13 | import vectorpipe.model.Change 14 | import scalaj.http.Http 15 | 16 | import scala.concurrent.duration.{Duration, _} 17 | 18 | object ChangeSource extends Logging { 19 | val Delay: Duration = 15 seconds 20 | private val saxParserFactory = SAXParserFactory.newInstance 21 | 22 | def getSequence(baseURI: URI, sequence: Int): Seq[Change] = { 23 | val s = f"$sequence%09d" 24 | val path = s"${s.slice(0, 3)}/${s.slice(3, 6)}/${s.slice(6, 9)}.osc.gz" 25 | 26 | logInfo(s"Fetching sequence $sequence") 27 | 28 | try { 29 | val response = 30 | Http(baseURI.resolve(path).toString).asBytes 31 | 32 | if (response.code == 404) { 33 | logInfo(s"$sequence is not yet available, sleeping.") 34 | Thread.sleep(Delay.toMillis) 35 | getSequence(baseURI, sequence) 36 | } else { 37 | val bais = new ByteArrayInputStream(response.body) 38 | val gzis = new GZIPInputStream(bais) 39 | val parser = saxParserFactory.newSAXParser 40 | val handler = new model.Change.ChangeHandler(sequence) 41 | try { 42 | parser.parse(gzis, handler) 43 | val changes = handler.changes 44 | 45 | logDebug(s"Received ${changes.length} changes from sequence $sequence") 46 | 47 | changes 48 | } finally { 49 | gzis.close() 50 | bais.close() 51 | } 52 | } 53 | } catch { 54 | case e: IOException => 
55 | logWarning(s"Error fetching change $sequence", e) 56 | Thread.sleep(Delay.toMillis) 57 | getSequence(baseURI, sequence) 58 | } 59 | } 60 | 61 | @memoize(maxSize = 1, expiresAfter = 30 seconds) 62 | def getCurrentSequence(baseURI: URI): Option[Int] = { 63 | try { 64 | val response = 65 | Http(baseURI.resolve("state.txt").toString).asString 66 | 67 | val state = new Properties 68 | state.load(new StringReader(response.body)) 69 | 70 | val sequence = state.getProperty("sequenceNumber").toInt 71 | val timestamp = DateTime.parse(state.getProperty("timestamp")) 72 | 73 | logDebug(s"$baseURI state: $sequence @ $timestamp") 74 | 75 | Some(sequence) 76 | } catch { 77 | case err: Throwable => 78 | logError("Error fetching or parsing changeset state.", err) 79 | logError(baseURI.toString) 80 | 81 | None 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ChangesetMicroBatchReader.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.net.URI 4 | import java.util 5 | 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.sources.v2.DataSourceOptions 8 | import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader} 9 | import vectorpipe.model.Changeset 10 | 11 | import scala.collection.JavaConverters._ 12 | 13 | case class ChangesetStreamBatchTask(baseURI: URI, sequences: Seq[Int]) 14 | extends InputPartition[InternalRow] { 15 | override def createPartitionReader(): InputPartitionReader[InternalRow] = 16 | new ChangesetStreamBatchReader(baseURI, sequences) 17 | } 18 | 19 | class ChangesetStreamBatchReader(baseURI: URI, sequences: Seq[Int]) 20 | extends ReplicationStreamBatchReader[Changeset](baseURI, sequences) { 21 | 22 | override def getSequence(baseURI: URI, sequence: Int): Seq[Changeset] = 23 | ChangesetSource.getChangeset(baseURI, sequence) 24 | } 25 | 26 | class ChangesetMicroBatchReader(options: DataSourceOptions, checkpointLocation: String) 27 | extends ReplicationStreamMicroBatchReader[Changeset](options, checkpointLocation) { 28 | private lazy val baseURI = new URI( 29 | options 30 | .get(Source.BaseURI) 31 | .orElse("https://planet.osm.org/replication/changesets/") 32 | ) 33 | 34 | override def getCurrentSequence: Option[Int] = 35 | ChangesetSource.getCurrentSequence(baseURI).map(_.sequence.toInt) 36 | 37 | override def planInputPartitions(): util.List[InputPartition[InternalRow]] = 38 | sequenceRange 39 | .map( 40 | seq => ChangesetStreamBatchTask(baseURI, Seq(seq)).asInstanceOf[InputPartition[InternalRow]] 41 | ) 42 | .asJava 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ChangesetProvider.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.util.Optional 4 | 5 | import org.apache.spark.sql.sources.DataSourceRegister 6 | import org.apache.spark.sql.sources.v2.reader.DataSourceReader 7 | import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader 8 | import org.apache.spark.sql.sources.v2.{ 9 | DataSourceOptions, 10 | DataSourceV2, 11 | MicroBatchReadSupport, 12 | ReadSupport 13 | } 14 | import org.apache.spark.sql.types.StructType 15 | 16 | class ChangesetProvider 17 | extends DataSourceV2 18 | with ReadSupport 19 | with MicroBatchReadSupport 20 | with DataSourceRegister { 21 | 
override def createMicroBatchReader( 22 | schema: Optional[StructType], 23 | checkpointLocation: String, 24 | options: DataSourceOptions 25 | ): MicroBatchReader = { 26 | if (schema.isPresent) { 27 | throw new IllegalStateException( 28 | "The changesets source does not support a user-specified schema." 29 | ) 30 | } 31 | 32 | new ChangesetMicroBatchReader(options, checkpointLocation) 33 | } 34 | 35 | override def shortName(): String = Source.Changesets 36 | override def createReader(options: DataSourceOptions): DataSourceReader = 37 | ChangesetReader(options) 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ChangesetReader.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.net.URI 4 | import java.util 5 | 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.sources.v2.DataSourceOptions 8 | import org.apache.spark.sql.sources.v2.reader.InputPartition 9 | import vectorpipe.model.Changeset 10 | 11 | import scala.collection.JavaConverters._ 12 | import scala.util.Random 13 | 14 | case class ChangesetReader(options: DataSourceOptions) 15 | extends ReplicationReader[Changeset](options) { 16 | override def planInputPartitions(): util.List[InputPartition[InternalRow]] = { 17 | // prevent sequential diffs from being assigned to the same task 18 | val sequences = Random.shuffle((startSequence to endSequence).toList) 19 | 20 | sequences 21 | .grouped(Math.max(1, sequences.length / partitionCount)) 22 | .toList 23 | .map( 24 | ChangesetStreamBatchTask(baseURI, _) 25 | .asInstanceOf[InputPartition[InternalRow]] 26 | ) 27 | .asJava 28 | } 29 | 30 | override protected def getCurrentSequence: Option[Int] = 31 | ChangesetSource.getCurrentSequence(baseURI).map(_.sequence.toInt) 32 | 33 | private def baseURI = 34 | new URI( 35 | options 36 | .get(Source.BaseURI) 37 | .orElse("https://planet.osm.org/replication/changesets/")) 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ChangesetSource.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.io.{ByteArrayInputStream, IOException} 4 | import java.net.URI 5 | import java.nio.charset.StandardCharsets 6 | import java.time.Instant 7 | import java.util.zip.GZIPInputStream 8 | 9 | import cats.implicits._ 10 | import io.circe.generic.auto._ 11 | import io.circe.{yaml, _} 12 | import org.apache.commons.io.IOUtils 13 | import org.apache.spark.internal.Logging 14 | import org.joda.time.DateTime 15 | import org.joda.time.format.DateTimeFormat 16 | import vectorpipe.model.Changeset 17 | import scalaj.http.Http 18 | 19 | import scala.concurrent.duration.{Duration, _} 20 | import scala.util.Try 21 | import scala.xml.XML 22 | 23 | object ChangesetSource extends Logging { 24 | val Delay: Duration = 15 seconds 25 | // state.yaml uses a custom date format 26 | private val formatter = DateTimeFormat.forPattern("y-M-d H:m:s.SSSSSSSSS Z") 27 | 28 | private implicit val dateTimeDecoder: Decoder[DateTime] = 29 | Decoder.instance(a => a.as[String].map(DateTime.parse(_, formatter))) 30 | 31 | def getChangeset(baseURI: URI, sequence: Int, retry: Boolean = true): Seq[Changeset] = { 32 | val s = f"$sequence%09d" 33 | val path = s"${s.slice(0, 3)}/${s.slice(3, 6)}/${s.slice(6, 9)}.osm.gz" 34 | 35 | logDebug(s"Fetching sequence 
$sequence") 36 | 37 | try { 38 | val response = 39 | Http(baseURI.resolve(path).toString).asBytes 40 | 41 | if (response.code == 404) { 42 | if (retry) { 43 | logDebug(s"$sequence is not yet available, sleeping.") 44 | Thread.sleep(Delay.toMillis) 45 | getChangeset(baseURI, sequence) 46 | } else { 47 | logDebug(s"$sequence is not yet available, skipping.") 48 | Seq() 49 | } 50 | } else { 51 | // NOTE: if diff bodies get really large, switch to a SAX parser to help with the memory footprint 52 | val bais = new ByteArrayInputStream(response.body) 53 | val gzis = new GZIPInputStream(bais) 54 | try { 55 | val data = XML.loadString(IOUtils.toString(gzis, StandardCharsets.UTF_8)) 56 | 57 | val changesets = (data \ "changeset").map(Changeset.fromXML(_, sequence)) 58 | 59 | logDebug(s"Received ${changesets.length} changesets") 60 | 61 | changesets 62 | } finally { 63 | gzis.close() 64 | bais.close() 65 | } 66 | } 67 | } catch { 68 | case e: IOException => 69 | logWarning(s"Error fetching changeset $sequence", e) 70 | Thread.sleep(Delay.toMillis) 71 | getChangeset(baseURI, sequence) 72 | } 73 | } 74 | 75 | case class Sequence(last_run: DateTime, sequence: Long) 76 | 77 | private def grabSequence(baseURI: URI, filename: String): Sequence = { 78 | val response = 79 | Http(baseURI.resolve(filename).toString).asString 80 | 81 | val state = yaml.parser 82 | .parse(response.body) 83 | .leftMap(err => err: Error) 84 | .flatMap(_.as[Sequence]) 85 | .valueOr(throw _) 86 | 87 | state 88 | } 89 | 90 | def getCurrentSequence(baseURI: URI): Option[Sequence] = { 91 | var state: Try[Sequence] = null 92 | 93 | for (i <- Range(0, 5)) { 94 | state = Try(grabSequence(baseURI, "state.yaml")) 95 | 96 | if (state.isSuccess) { 97 | logDebug(s"$baseURI state: ${state.get.sequence} @ ${state.get.last_run}") 98 | 99 | return Some(state.get) 100 | } 101 | 102 | Thread.sleep(5000) 103 | } 104 | 105 | logError("Error fetching / parsing changeset state.", state.failed.get) 106 | None 107 | } 108 | 109 | def getSequence(baseURI: URI, sequence: Long): Option[Sequence] = { 110 | val s = f"${sequence+1}%09d" 111 | val path = s"${s.slice(0, 3)}/${s.slice(3, 6)}/${s.slice(6, 9)}.state.txt" 112 | 113 | try { 114 | val state = grabSequence(baseURI, path) 115 | 116 | Some(state) 117 | } catch { 118 | case err: Throwable => 119 | logError("Error fetching / parsing changeset state.", err) 120 | 121 | None 122 | } 123 | } 124 | 125 | def estimateSequenceNumber(modifiedTime: Instant, baseURI: URI, maxIters: Int = 1000): Long = { 126 | val current = getCurrentSequence(baseURI) 127 | if (current.isDefined) { 128 | val diffMinutes = (current.get.last_run.getMillis/1000 - 129 | modifiedTime.getEpochSecond) / 60 130 | current.get.sequence - diffMinutes 131 | } else { 132 | // Some queries on the state.yaml fail, set up a failsafe 133 | // ###.state.txt may not be provided for all replications, so use changesets 134 | var i = 0 135 | var baseTime: Long = -1 136 | while (baseTime == -1 && i < maxIters) { 137 | baseTime = getChangeset(baseURI, i, false).map(_.createdAt.toInstant.getEpochSecond).sorted.lastOption.getOrElse(-1L) 138 | i += 1 139 | } 140 | if (i == maxIters) 141 | throw new IndexOutOfBoundsException(s"Couldn't find non-empty changeset in ${maxIters} attempts") 142 | 143 | val query = modifiedTime.getEpochSecond 144 | 145 | (query - baseTime) / 60 + i 146 | } 147 | } 148 | 149 | private def safeSequenceTime(baseURI: URI, sequence: Long): Option[Instant] = { 150 | val res = getSequence(baseURI, sequence) 151 | if (res.isDefined) { 152 | 
Some(Instant.parse(res.get.last_run.toString)) 153 | } else { 154 | getChangeset(baseURI, sequence.toInt, false).map(_.createdAt.toInstant).sortBy(_.getEpochSecond).lastOption.map{ inst => Instant.parse(inst.toString).plusSeconds(60) } 155 | } 156 | } 157 | 158 | def findSequenceFor(modifiedTime: Instant, baseURI: URI): Long = { 159 | var guess = estimateSequenceNumber(modifiedTime, baseURI) 160 | 161 | while (safeSequenceTime(baseURI, guess).map(_.isAfter(modifiedTime)).getOrElse(false)) { guess -= 1 } 162 | while (safeSequenceTime(baseURI, guess).map(_.isBefore(modifiedTime)).getOrElse(false)) { guess += 1 } 163 | 164 | guess 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ReplicationReader.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import org.apache.spark.SparkEnv 4 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 5 | import org.apache.spark.sql.internal.SQLConf 6 | import org.apache.spark.sql.sources.v2.DataSourceOptions 7 | import org.apache.spark.sql.sources.v2.reader.DataSourceReader 8 | import org.apache.spark.sql.types.StructType 9 | 10 | import scala.compat.java8.OptionConverters._ 11 | import scala.reflect.runtime.universe.TypeTag 12 | 13 | abstract class ReplicationReader[T <: Product: TypeTag](options: DataSourceOptions) 14 | extends DataSourceReader { 15 | private lazy val schema: StructType = ExpressionEncoder[T].schema 16 | 17 | val DefaultPartitionCount: Int = 18 | SparkEnv.get.conf 19 | .getInt(SQLConf.SHUFFLE_PARTITIONS.key, SQLConf.SHUFFLE_PARTITIONS.defaultValue.get) 20 | 21 | protected val partitionCount: Int = 22 | options.getInt(Source.PartitionCount, DefaultPartitionCount) 23 | 24 | protected var endSequence: Int = 25 | options 26 | .get(Source.EndSequence) 27 | .asScala 28 | .map(s => s.toInt - 1) 29 | .getOrElse(getCurrentSequence 30 | .getOrElse(throw new RuntimeException("Could not determine end sequence."))) 31 | 32 | override def readSchema(): StructType = schema 33 | 34 | protected def startSequence: Int = 35 | options 36 | .get(Source.StartSequence) 37 | .asScala 38 | .map(s => s.toInt) 39 | .getOrElse(getCurrentSequence 40 | .getOrElse(throw new RuntimeException("Could not determine start sequence."))) 41 | 42 | protected def getCurrentSequence: Option[Int] 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/ReplicationStreamBatchReader.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import java.net.URI 4 | 5 | import org.apache.spark.internal.Logging 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} 8 | import org.apache.spark.sql.sources.v2.reader.InputPartitionReader 9 | 10 | import scala.collection.parallel.ForkJoinTaskSupport 11 | import scala.concurrent.forkjoin.ForkJoinPool 12 | import scala.reflect.runtime.universe.TypeTag 13 | 14 | abstract class ReplicationStreamBatchReader[T <: Product: TypeTag](baseURI: URI, 15 | sequences: Seq[Int]) 16 | extends InputPartitionReader[InternalRow] 17 | with Logging { 18 | org.apache.spark.sql.jts.registerTypes() 19 | private lazy val rowEncoder = RowEncoder(encoder.schema).resolveAndBind() 20 | protected var index: Int = -1 21 | protected var items: Vector[T] = _ 22 | val Concurrency: Int 
= 8 23 | private lazy val encoder = ExpressionEncoder[T] 24 | 25 | override def next(): Boolean = { 26 | index += 1 27 | 28 | if (Option(items).isEmpty) { 29 | val parSequences = sequences.par 30 | val taskSupport = new ForkJoinTaskSupport(new ForkJoinPool(Concurrency)) 31 | parSequences.tasksupport = taskSupport 32 | 33 | items = parSequences.flatMap(seq => getSequence(baseURI, seq)).toVector 34 | 35 | taskSupport.environment.shutdown() 36 | } 37 | 38 | index < items.length 39 | } 40 | 41 | override def get(): InternalRow = encoder.toRow(items(index)) 42 | 43 | override def close(): Unit = Unit 44 | 45 | protected def getSequence(baseURI: URI, sequence: Int): Seq[T] 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/SequenceOffset.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | import org.apache.spark.sql.sources.v2.reader.streaming.Offset 3 | 4 | case class SequenceOffset(sequence: Int, pending: Boolean = false) 5 | extends Offset 6 | with Ordered[SequenceOffset] { 7 | override val json: String = s"[$sequence,${pending.compare(false)}]" 8 | 9 | def +(increment: Int): SequenceOffset = SequenceOffset(sequence + increment) 10 | def -(decrement: Int): SequenceOffset = SequenceOffset(sequence - decrement) 11 | def next: SequenceOffset = SequenceOffset(sequence, pending = true) 12 | 13 | override def compare(that: SequenceOffset): Int = 14 | sequence.compare(that.sequence) match { 15 | case 0 => pending.compare(that.pending) 16 | case x => x 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/sources/Source.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | object Source { 4 | val AugmentedDiffs: String = "augmented-diffs" 5 | val Changes: String = "changes" 6 | val Changesets: String = "changesets" 7 | 8 | val BaseURI: String = "base_uri" 9 | val BatchSize: String = "batch_size" 10 | val DatabaseURI: String = "db_uri" 11 | val PartitionCount: String = "partition_count" 12 | val ProcessName: String = "proc_name" 13 | val StartSequence: String = "start_sequence" 14 | val EndSequence: String = "end_sequence" 15 | 16 | val ErrorHandler: String = "error_handler" 17 | val ErrorCodes: Set[Int] = Set(403, 404) 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/Auth.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.util 2 | 3 | import java.net.URI 4 | 5 | case class Auth(user: Option[String], password: Option[String]) { 6 | def isDefined: Boolean = (user.isDefined && password.isDefined) 7 | } 8 | 9 | object Auth { 10 | def fromUri(uri: URI, userParam: String = "user", passwordParam: String = "password"): Auth = { 11 | val auth = getUriUserInfo(uri) 12 | if (auth.isDefined) { 13 | auth 14 | } else { 15 | val params = getUriParams(uri) 16 | auth.copy( 17 | user = auth.user.orElse(params.get(userParam)), 18 | password = auth.password.orElse(params.get(passwordParam)) 19 | ) 20 | } 21 | } 22 | 23 | /** Parse only the URI auth section */ 24 | def getUriUserInfo(uri: URI): Auth = { 25 | val info = uri.getUserInfo 26 | if (null == info) 27 | Auth(None, None) 28 | else { 29 | val chunk = info.split(":") 30 | if (chunk.length == 1) 31 | Auth(Some(chunk(0)), None) 32 | else 33 | 
Auth(Some(chunk(0)), Some(chunk(1))) 34 | } 35 | } 36 | 37 | /** Parse URI parameters */ 38 | def getUriParams(uri: URI): Map[String, String] = { 39 | val query = uri.getQuery 40 | if (null == query) 41 | Map.empty[String, String] 42 | else { 43 | query.split("&").map{ param => 44 | val arr = param.split("=") 45 | arr(0) -> arr(1) 46 | }.toMap 47 | } 48 | } 49 | } -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/DBUtils.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.util 2 | 3 | import java.net.URI 4 | import java.sql.{Connection, DriverManager} 5 | 6 | object DBUtils { 7 | def getJdbcConnection(uri: URI): Connection = { 8 | 9 | val cleanUri = new URI( 10 | uri.getScheme, 11 | Option(uri.getHost).getOrElse("localhost") + (if (uri.getPort > 0) ":" + uri.getPort else ""), 12 | uri.getPath, 13 | null.asInstanceOf[String], 14 | null.asInstanceOf[String] 15 | ) 16 | // also drops UserInfo 17 | 18 | val auth = Auth.fromUri(uri) 19 | (auth.user, auth.password) match { 20 | case (Some(user), Some(pass)) => 21 | DriverManager.getConnection(s"jdbc:${cleanUri.toString}", user, pass) 22 | case _ => 23 | DriverManager.getConnection(s"jdbc:${cleanUri.toString}") 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/Geocode.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.util 2 | 3 | import geotrellis.vector._ 4 | import geotrellis.vector.io.json._ 5 | 6 | import org.apache.spark.sql._ 7 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.sql.types._ 10 | 11 | import org.locationtech.jts.geom.prep._ 12 | import org.locationtech.jts.index.ItemVisitor 13 | 14 | import _root_.io.circe.{Encoder => CirceEncoder, Decoder => CirceDecoder, _} 15 | import cats.syntax.either._ 16 | 17 | object Geocode { 18 | 19 | case class CountryId(code: String) 20 | 21 | object CountryIdCodecs { 22 | implicit val encodeCountryId: CirceEncoder[CountryId] = new CirceEncoder[CountryId] { 23 | final def apply(a: CountryId): Json = Json.obj( 24 | ("code", Json.fromString(a.code)) 25 | ) 26 | } 27 | implicit val decodeCountryId: CirceDecoder[CountryId] = new CirceDecoder[CountryId] { 28 | final def apply(c: HCursor): CirceDecoder.Result[CountryId] = 29 | for { 30 | code <- c.downField("ADM0_A3").as[String] 31 | } yield { 32 | CountryId(code) 33 | } 34 | } 35 | } 36 | 37 | import CountryIdCodecs._ 38 | 39 | object Countries { 40 | lazy val all: Vector[MultiPolygonFeature[CountryId]] = { 41 | val collection = 42 | Resource("countries.geojson"). 43 | parseGeoJson[JsonFeatureCollection] 44 | 45 | val polys = 46 | collection. 47 | getAllPolygonFeatures[CountryId]. 48 | map(_.mapGeom(MultiPolygon(_))) 49 | 50 | val mps = 51 | collection. 52 | getAllMultiPolygonFeatures[CountryId] 53 | 54 | polys ++ mps 55 | } 56 | 57 | def indexed: SpatialIndex[MultiPolygonFeature[CountryId]] = 58 | SpatialIndex.fromExtents(all) { mpf => mpf.geom.extent } 59 | } 60 | 61 | class CountryLookup() extends Serializable { 62 | private val index = 63 | geotrellis.vector.SpatialIndex.fromExtents( 64 | Countries.all. 
65 | map { mpf => 66 | (PreparedGeometryFactory.prepare(mpf.geom), mpf.data) 67 | } 68 | ) { case (pg, _) => pg.getGeometry().extent } 69 | 70 | def lookup(geom: geotrellis.vector.Geometry): Traversable[CountryId] = { 71 | val t = 72 | new Traversable[(PreparedGeometry, CountryId)] { 73 | override def foreach[U](f: ((PreparedGeometry, CountryId)) => U): Unit = { 74 | val visitor = new ItemVisitor { 75 | override def visitItem(obj: AnyRef): Unit = 76 | f(obj.asInstanceOf[(PreparedGeometry, CountryId)]) 77 | } 78 | index.rtree.query(geom.getEnvelopeInternal, visitor) 79 | } 80 | } 81 | 82 | t. 83 | filter(_._1.intersects(geom)). 84 | map(_._2) 85 | } 86 | } 87 | 88 | def apply(geoms: DataFrame): DataFrame = { 89 | val newSchema = StructType(geoms.schema.fields :+ StructField( 90 | "countries", ArrayType(StringType, containsNull = false), nullable = true)) 91 | implicit val encoder: Encoder[Row] = RowEncoder(newSchema) 92 | 93 | geoms 94 | .mapPartitions { partition => 95 | val countryLookup = new CountryLookup() 96 | 97 | partition.map { row => 98 | val countryCodes = Option(row.getAs[Geometry]("geom")) match { 99 | case Some(geom) => countryLookup.lookup(geom).map(x => x.code) 100 | case None => Seq.empty[String] 101 | } 102 | 103 | Row.fromSeq(row.toSeq :+ countryCodes) 104 | } 105 | } 106 | } 107 | 108 | def regionsByChangeset(geomCountries: Dataset[Row]): DataFrame = { 109 | import geomCountries.sparkSession.implicits._ 110 | 111 | geomCountries 112 | .where('country.isNotNull) 113 | .groupBy('changeset) 114 | .agg(collect_set('country) as 'countries) 115 | 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/Implicits.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.util 2 | 3 | import geotrellis.vector._ 4 | import _root_.io.circe._ 5 | 6 | import scala.reflect.ClassTag 7 | 8 | object Implicits extends Implicits 9 | 10 | trait Implicits extends RobustFeatureFormats { 11 | implicit class RobustFeaturesToGeoJson[G <: Geometry: ClassTag, D: Encoder](features: Traversable[RobustFeature[G, D]]) { 12 | def toGeoJson(): String = { 13 | val fc = new JsonRobustFeatureCollection 14 | 15 | features.foreach(fc.add(_)) 16 | 17 | fc.asJson.noSpaces 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/JsonRobustFeatureCollection.scala: -------------------------------------------------------------------------------- 1 | 2 | package vectorpipe.util 3 | 4 | import cats.syntax.either._ 5 | import geotrellis.vector._ 6 | import _root_.io.circe._ 7 | import _root_.io.circe.syntax._ 8 | 9 | import scala.collection.immutable.VectorBuilder 10 | import scala.collection.mutable 11 | import scala.reflect.ClassTag 12 | 13 | class JsonRobustFeatureCollection(features: List[Json] = Nil) { 14 | private val buffer = mutable.ListBuffer(features: _*) 15 | 16 | def add[G <: Geometry: ClassTag, D: Encoder](feature: RobustFeature[G, D]) = 17 | buffer += RobustFeatureFormats.writeRobustFeatureJson(feature) 18 | 19 | def getAll[F: Decoder]: Vector[F] = { 20 | val ret = new VectorBuilder[F]() 21 | features.foreach{ _.as[F].foreach(ret += _) } 22 | ret.result() 23 | } 24 | 25 | def getAllRobustFeatures[F <: RobustFeature[_, _] :Decoder]: Vector[F] = 26 | getAll[F] 27 | 28 | def getAllPointFeatures[D: Decoder]() = getAll[RobustFeature[Point, D]] 29 | def getAllLineStringFeatures[D: Decoder]() = 
getAll[RobustFeature[LineString, D]] 30 | def getAllPolygonFeatures[D: Decoder]() = getAll[RobustFeature[Polygon, D]] 31 | def getAllMultiPointFeatures[D: Decoder]() = getAll[RobustFeature[MultiPoint, D]] 32 | def getAllMultiLineStringFeatures[D: Decoder]() = getAll[RobustFeature[MultiLineString, D]] 33 | def getAllMultiPolygonFeatures[D: Decoder]() = getAll[RobustFeature[MultiPolygon, D]] 34 | 35 | def getAllGeometries(): Vector[Geometry] = 36 | getAll[Point] ++ getAll[LineString] ++ getAll[Polygon] ++ 37 | getAll[MultiPoint] ++ getAll[MultiLineString] ++ getAll[MultiPolygon] 38 | 39 | def asJson: Json = { 40 | val bboxOption = getAllGeometries.map(_.extent).reduceOption(_ combine _) 41 | bboxOption match { 42 | case Some(bbox) => 43 | Json.obj( 44 | "type" -> "FeatureCollection".asJson, 45 | "bbox" -> Extent.listEncoder(bbox), 46 | "features" -> buffer.toVector.asJson 47 | ) 48 | case _ => 49 | Json.obj( 50 | "type" -> "FeatureCollection".asJson, 51 | "features" -> buffer.toVector.asJson 52 | ) 53 | } 54 | } 55 | } 56 | 57 | object JsonRobustFeatureCollection { 58 | def apply() = new JsonRobustFeatureCollection() 59 | 60 | def apply[G <: Geometry: ClassTag, D: Encoder](features: Traversable[RobustFeature[G, D]]) = { 61 | val fc = new JsonRobustFeatureCollection() 62 | features.foreach(fc.add(_)) 63 | fc 64 | } 65 | 66 | def apply(features: Traversable[Json])(implicit d: DummyImplicit): JsonRobustFeatureCollection = 67 | new JsonRobustFeatureCollection(features.toList) 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/JsonRobustFeatureCollectionMap.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.util 2 | 3 | import cats.syntax.either._ 4 | import geotrellis.vector._ 5 | import _root_.io.circe._ 6 | import _root_.io.circe.syntax._ 7 | 8 | import scala.collection.mutable 9 | import scala.reflect.ClassTag 10 | 11 | class JsonRobustFeatureCollectionMap(features: List[Json] = Nil) { 12 | private val buffer = mutable.ListBuffer(features:_*) 13 | 14 | def add[G <: Geometry: ClassTag, D: Encoder](featureMap: (String, RobustFeature[G, D])) = 15 | buffer += RobustFeatureFormats.writeRobustFeatureJsonWithID(featureMap) 16 | 17 | def asJson: Json = { 18 | val bboxOption = getAll[Geometry].map(_._2.extent).reduceOption(_ combine _) 19 | bboxOption match { 20 | case Some(bbox) => 21 | Json.obj( 22 | "type" -> "FeatureCollection".asJson, 23 | "bbox" -> Extent.listEncoder(bbox), 24 | "features" -> buffer.toVector.asJson 25 | ) 26 | case _ => 27 | Json.obj( 28 | "type" -> "FeatureCollection".asJson, 29 | "features" -> buffer.toVector.asJson 30 | ) 31 | } 32 | } 33 | 34 | private def getFeatureID(js: Json): String = { 35 | val c = js.hcursor 36 | val id = c.downField("id") 37 | id.as[String] match { 38 | case Right(i) => i 39 | case _ => 40 | id.as[Int] match { 41 | case Right(i) => i.toString 42 | case _ => throw DecodingFailure("Feature expected to have \"ID\" field", c.history) 43 | } 44 | } 45 | } 46 | 47 | def getAll[F: Decoder]: Map[String, F] = { 48 | var ret = Map[String, F]() 49 | features.foreach{ f => f.as[F].foreach(ret += getFeatureID(f) -> _) } 50 | ret 51 | } 52 | } 53 | 54 | object JsonRobustFeatureCollectionMap { 55 | def apply() = new JsonRobustFeatureCollectionMap() 56 | 57 | def apply[G <: Geometry: ClassTag, D: Encoder](features: Traversable[(String, RobustFeature[G, D])]) = { 58 | val fc = new JsonRobustFeatureCollectionMap() 59 | 
features.foreach(fc.add(_)) 60 | fc 61 | } 62 | 63 | def apply(features: Traversable[Json])(implicit d: DummyImplicit): JsonRobustFeatureCollectionMap = 64 | new JsonRobustFeatureCollectionMap(features.toList) 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/Resource.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.util 2 | 3 | import java.io.InputStream 4 | 5 | object Resource { 6 | def apply(name: String): String = { 7 | val stream: InputStream = getClass.getResourceAsStream(s"/$name") 8 | try { 9 | scala.io.Source.fromInputStream(stream).getLines.mkString(" ") 10 | } finally { 11 | stream.close() 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/RobustFeature.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.util 2 | 3 | import cats.syntax.either._ 4 | import geotrellis.vector._ 5 | import geotrellis.vector.io.json._ 6 | import _root_.io.circe._ 7 | import _root_.io.circe.syntax._ 8 | 9 | import scala.reflect.ClassTag 10 | import scala.util.{Try, Success, Failure} 11 | 12 | case class RobustFeature[+G <: Geometry: ClassTag, D](geom: Option[G], data: D) { 13 | def toFeature(): Feature[G, D] = { 14 | val g = geom match { 15 | case Some(gg) => gg 16 | case _ => MultiPoint.EMPTY 17 | } 18 | Feature(g.asInstanceOf[G], data) 19 | } 20 | } 21 | 22 | trait RobustFeatureFormats { 23 | def writeRobustFeatureJson[G <: Geometry: ClassTag, D: Encoder](obj: RobustFeature[G, D]): Json = { 24 | val feature = obj.toFeature 25 | Json.obj( 26 | "type" -> "Feature".asJson, 27 | "geometry" -> GeometryFormats.geometryEncoder(feature.geom), 28 | "bbox" -> Extent.listEncoder(feature.geom.extent), 29 | "properties" -> obj.data.asJson 30 | ) 31 | } 32 | 33 | def writeRobustFeatureJsonWithID[G <: Geometry: ClassTag, D: Encoder](idFeature: (String, RobustFeature[G, D])): Json = { 34 | val feature = idFeature._2.toFeature 35 | Json.obj( 36 | "type" -> "Feature".asJson, 37 | "geometry" -> GeometryFormats.geometryEncoder(feature.geom), 38 | "bbox" -> Extent.listEncoder(feature.geom.extent), 39 | "properties" -> idFeature._2.data.asJson, 40 | "id" -> idFeature._1.asJson 41 | ) 42 | } 43 | 44 | def readRobustFeatureJson[D: Decoder, G <: Geometry: Decoder: ClassTag](value: Json): RobustFeature[G, D] = { 45 | val c = value.hcursor 46 | (c.downField("type").as[String], c.downField("geometry").focus, c.downField("properties").focus) match { 47 | case (Right("Feature"), Some(geom), Some(data)) => 48 | //val g = Try(geom.convertTo[G]).toOption 49 | //val d = data.convertTo[D] 50 | (Try(geom.as[G].toOption).toOption.getOrElse(None), data.as[D].toOption) match { 51 | case (Some(g), Some(d)) if g isEmpty => RobustFeature(None, d) 52 | case (Some(g), Some(d)) => RobustFeature(Some(g), d) 53 | case (None, Some(d)) => RobustFeature(None, d) 54 | case (_, None) => throw new Exception(s"Feature expected well-formed data; got $data") 55 | } 56 | case _ => throw new Exception("Feature expected") 57 | } 58 | } 59 | 60 | def readRobustFeatureJsonWithID[D: Decoder, G <: Geometry: Decoder: ClassTag](value: Json): (String, RobustFeature[G, D]) = { 61 | val c = value.hcursor 62 | (c.downField("type").as[String], c.downField("geometry").focus, c.downField("properties").focus, c.downField("id").focus) match { 63 | case (Right("Feature"), Some(geom), 
Some(data), Some(id)) => 64 | //val g = Try(geom.convertTo[G]).toOption 65 | //val d = data.convertTo[D] 66 | (Try(geom.as[G].toOption).toOption.getOrElse(None), data.as[D].toOption, id.as[String].toOption) match { 67 | case (Some(g), Some(d), Some(i)) if g isEmpty => (i, RobustFeature(None, d)) 68 | case (Some(g), Some(d), Some(i)) => (i, RobustFeature(Some(g), d)) 69 | case (None, Some(d), Some(i)) => (i, RobustFeature(None, d)) 70 | case _ => throw new Exception(s"Feature expected well-formed id and data; got (${id}, ${data})") 71 | } 72 | case _ => throw new Exception("Feature expected") 73 | } 74 | } 75 | 76 | implicit def robustFeatureDecoder[G <: Geometry: Decoder: ClassTag, D: Decoder]: Decoder[RobustFeature[G, D]] = 77 | Decoder.decodeJson.emap { json: Json => 78 | Try(readRobustFeatureJson[D, G](json)) match { 79 | case Success(f) => Right(f) 80 | case Failure(e) => Left(e.getMessage) 81 | } 82 | } 83 | 84 | implicit def robustFeatureEncoder[G <: Geometry: Encoder: ClassTag, D: Encoder]: Encoder[RobustFeature[G, D]] = 85 | Encoder.encodeJson.contramap[RobustFeature[G, D]] { writeRobustFeatureJson } 86 | 87 | implicit val robustFeatureCollectionEncoder: Encoder[JsonRobustFeatureCollection] = 88 | Encoder.encodeJson.contramap[JsonRobustFeatureCollection] { _.asJson } 89 | 90 | implicit val robustFeatureCollectionDecoder: Decoder[JsonRobustFeatureCollection] = 91 | Decoder.decodeHCursor.emap { c: HCursor => 92 | (c.downField("type").as[String], c.downField("features").focus) match { 93 | case (Right("FeatureCollection"), Some(features)) => Right(JsonRobustFeatureCollection(features.asArray.toVector.flatten)) 94 | case _ => Left("FeatureCollection expected") 95 | } 96 | } 97 | 98 | implicit val robustFeatureCollectionMapEncoder: Encoder[JsonRobustFeatureCollectionMap] = 99 | Encoder.encodeJson.contramap[JsonRobustFeatureCollectionMap] { _.asJson } 100 | 101 | implicit val robustFeatureCollectionMapDecoder: Decoder[JsonRobustFeatureCollectionMap] = 102 | Decoder.decodeHCursor.emap { c: HCursor => 103 | (c.downField("type").as[String], c.downField("features").focus) match { 104 | case (Right("FeatureCollection"), Some(features)) => Right(JsonRobustFeatureCollectionMap(features.asArray.toVector.flatten)) 105 | case _ => Left("FeatureCollection expected") 106 | } 107 | } 108 | } 109 | 110 | object RobustFeatureFormats extends RobustFeatureFormats 111 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/util/package.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe 2 | 3 | package object util extends Implicits { 4 | def mergeMaps[K, V](m1: Map[K, V], m2: Map[K, V])(f: (V, V) => V): Map[K, V] = 5 | (m1.toSeq ++ m2.toSeq). 6 | groupBy(_._1). 
7 | map { case (k, vs) => 8 | (k, vs.map(_._2).reduce(f)) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/vectortile/Clipping.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.vectortile 2 | 3 | import geotrellis.layer.SpatialKey 4 | import geotrellis.layer.LayoutLevel 5 | import geotrellis.vector._ 6 | 7 | import scala.concurrent.ExecutionContext.Implicits.global 8 | 9 | object Clipping { 10 | def byLayoutCell(geom: Geometry, key: SpatialKey, layoutLevel: LayoutLevel): Geometry = { 11 | val ex = layoutLevel.layout.mapTransform.keyToExtent(key) 12 | 13 | // Preserve dimension of resultant geometry 14 | val clipped = geom match { 15 | case p: Point => p // points with the current key intersect the extent by definition 16 | case mp: MultiPoint => 17 | timedIntersect(mp, ex) match { 18 | case PointResult(pr) => pr 19 | case MultiPointResult(mpr) => mpr 20 | case NoResult => 21 | logger.warn(s"$geom was keyed to layout cell $key, but did not intersect $ex [zoom=${layoutLevel.zoom}]") 22 | geom 23 | case _ => // should never match here; just shut the compiler up 24 | geom 25 | } 26 | case l: LineString => 27 | timedIntersect(l, ex) match { 28 | case LineStringResult(lr) => lr 29 | case MultiLineStringResult(mlr) => mlr 30 | case GeometryCollectionResult(gcr) => 31 | gcr.getAll[LineString].length match { 32 | case 0 => MultiLineString() 33 | case 1 => gcr.getAll[LineString].head 34 | case _ => MultiLineString(gcr.getAll[LineString]) 35 | } 36 | case NoResult => 37 | logger.warn(s"$geom was keyed to layout cell $key, but did not intersect $ex [zoom=${layoutLevel.zoom}]") 38 | geom 39 | case _ => 40 | MultiLineString() // Discard (multi-)point results 41 | } 42 | case ml: MultiLineString => 43 | timedIntersect(ml, ex) match { 44 | case LineStringResult(lr) => lr 45 | case MultiLineStringResult(mlr) => mlr 46 | case GeometryCollectionResult(gcr) => 47 | (gcr.getAll[LineString].length, gcr.getAll[MultiLineString].length) match { 48 | case (0, 0) => MultiLineString() 49 | case (1, 0) => gcr.getAll[LineString].head 50 | case (0, 1) => gcr.getAll[MultiLineString].head 51 | case _ => MultiLineString(gcr.getAll[LineString] ++ gcr.getAll[MultiLineString].flatMap(_.lines.toSeq)) 52 | } 53 | case NoResult => 54 | logger.warn(s"$geom was keyed to layout cell $key, but did not intersect $ex [zoom=${layoutLevel.zoom}]") 55 | geom 56 | case _ => 57 | MultiLineString() // Discard (multi-)point results 58 | } 59 | case poly: Polygon => 60 | timedIntersect(poly, ex) match { 61 | case PolygonResult(pr) => pr 62 | case MultiPolygonResult(mpr) => mpr 63 | case GeometryCollectionResult(gcr) => 64 | gcr.getAll[Polygon].length match { 65 | case 0 => MultiPolygon() 66 | case 1 => gcr.getAll[Polygon].head 67 | case _ => MultiPolygon(gcr.getAll[Polygon]) 68 | } 69 | case NoResult => 70 | logger.warn(s"$geom was keyed to layout cell $key, but did not intersect $ex [zoom=${layoutLevel.zoom}]") 71 | geom 72 | case _ => MultiPolygon() // ignore point/line results 73 | } 74 | case mp: MultiPolygon => 75 | timedIntersect(mp, ex) match { 76 | case PolygonResult(pr) => pr 77 | case MultiPolygonResult(mpr) => mpr 78 | case GeometryCollectionResult(gcr) => 79 | (gcr.getAll[Polygon].length, gcr.getAll[MultiPolygon].length) match { 80 | case (0, 0) => MultiPolygon() 81 | case (1, 0) => gcr.getAll[Polygon].head 82 | case (0, 1) => gcr.getAll[MultiPolygon].head 83 | case _ => 
MultiPolygon(gcr.getAll[Polygon] ++ gcr.getAll[MultiPolygon].flatMap(_.polygons.toSeq)) 84 | } 85 | case NoResult => 86 | logger.warn(s"$geom was keyed to layout cell $key, but did not intersect $ex [zoom=${layoutLevel.zoom}]") 87 | geom 88 | case _ => MultiPolygon() // ignore point/line results 89 | } 90 | } 91 | clipped 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/vectortile/Simplify.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.vectortile 2 | 3 | import geotrellis.vector._ 4 | import geotrellis.layer._ 5 | import org.locationtech.jts.simplify.TopologyPreservingSimplifier 6 | 7 | object Simplify { 8 | 9 | /** 10 | * Simplifies geometry using JTS's topology-preserving simplifier. 11 | * 12 | * Note that there are known bugs with this simplifier. Please refer to the 13 | * JTS documentation. Faster simplifiers with fewer guarantees are available 14 | * there as well. 15 | */ 16 | def withJTS(g: Geometry, ld: LayoutDefinition): Geometry = { 17 | TopologyPreservingSimplifier.simplify(g, ld.cellSize.resolution) 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/vectortile/export/package.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.vectortile 2 | 3 | import geotrellis.layer.SpatialKey 4 | import geotrellis.spark.store.hadoop._ 5 | import geotrellis.spark.store.s3._ 6 | import geotrellis.vectortile._ 7 | import org.apache.spark.rdd.RDD 8 | 9 | import software.amazon.awssdk.services.s3.model.ObjectCannedACL 10 | 11 | import java.net.URI 12 | import java.io.ByteArrayOutputStream 13 | import java.util.zip.GZIPOutputStream 14 | 15 | package object export { 16 | def saveVectorTiles(vectorTiles: RDD[(SpatialKey, VectorTile)], zoom: Int, uri: URI): Unit = { 17 | uri.getScheme match { 18 | case "s3" => 19 | val path = uri.getPath 20 | val prefix = path.stripPrefix("/").stripSuffix("/") 21 | saveToS3(vectorTiles, zoom, uri.getAuthority, prefix) 22 | case _ => 23 | saveHadoop(vectorTiles, zoom, uri) 24 | } 25 | } 26 | 27 | private def saveToS3(vectorTiles: RDD[(SpatialKey, VectorTile)], zoom: Int, bucket: String, prefix: String) = { 28 | vectorTiles 29 | .mapValues { tile => 30 | val byteStream = new ByteArrayOutputStream() 31 | 32 | try { 33 | val gzipStream = new GZIPOutputStream(byteStream) 34 | try { 35 | gzipStream.write(tile.toBytes) 36 | } finally { 37 | gzipStream.close() 38 | } 39 | } finally { 40 | byteStream.close() 41 | } 42 | 43 | byteStream.toByteArray 44 | } 45 | .saveToS3( 46 | { sk: SpatialKey => s"s3://${bucket}/${prefix}/${zoom}/${sk.col}/${sk.row}.mvt" }, 47 | putObjectModifier = { request => 48 | request 49 | .toBuilder() 50 | .contentEncoding("gzip") 51 | .acl(ObjectCannedACL.PUBLIC_READ) 52 | .build() 53 | }) 54 | } 55 | 56 | private def saveHadoop(vectorTiles: RDD[(SpatialKey, VectorTile)], zoom: Int, uri: URI) = { 57 | vectorTiles 58 | .mapValues(_.toBytes) 59 | .saveToHadoop({ sk: SpatialKey => s"${uri}/${zoom}/${sk.col}/${sk.row}.mvt" }) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/vectorpipe/vectortile/package.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe 2 | 3 | import geotrellis.proj4._ 4 | import geotrellis.layer.SpatialKey 5 | import 
geotrellis.layer.LayoutDefinition 6 | import geotrellis.vector._ 7 | import geotrellis.vectortile._ 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 10 | import org.apache.spark.sql.functions._ 11 | 12 | import scala.concurrent._ 13 | import scala.concurrent.duration._ 14 | import scala.util.{Try, Success, Failure} 15 | 16 | package object vectortile { 17 | type VectorTileFeature[+G <: Geometry] = Feature[G, Map[String, Value]] 18 | 19 | def vtf2mvtf[G <: Geometry](vtf: VectorTileFeature[G]): MVTFeature[G] = 20 | MVTFeature(vtf.geom, vtf.data) 21 | 22 | sealed trait LayerMultiplicity { val name: String } 23 | case class SingleLayer(val name: String) extends LayerMultiplicity 24 | case class LayerNamesInColumn(val name: String) extends LayerMultiplicity 25 | 26 | @transient lazy val logger = org.apache.log4j.Logger.getRootLogger 27 | 28 | @transient lazy val st_reprojectGeom = udf { (g: Geometry, srcProj: String, destProj: String) => 29 | val trans = Proj4Transform(CRS.fromString(srcProj), CRS.fromString(destProj)) 30 | if (Option(g).isDefined) { 31 | if (g.isEmpty) 32 | g 33 | else { 34 | g.reproject(trans) 35 | } 36 | } else { 37 | null 38 | } 39 | } 40 | 41 | def keyTo(layout: LayoutDefinition) = udf { g: Geometry => 42 | if (Option(g).isDefined && !g.isEmpty) { 43 | layout.mapTransform.keysForGeometry(g).toArray 44 | } else { 45 | Array.empty[SpatialKey] 46 | } 47 | } 48 | 49 | def getSpatialKey(k: GenericRowWithSchema): SpatialKey = SpatialKey(k.getInt(0), k.getInt(1)) 50 | 51 | def getSpatialKey(row: Row, field: String): SpatialKey = { 52 | val k = row.getAs[Row](field) 53 | SpatialKey(k.getInt(0), k.getInt(1)) 54 | } 55 | 56 | // case class IdFeature[+G <: Geometry, +D](geom: Geometry, data: D, id: Int) extends Feature[G, D](geom, data) { 57 | // override def mapGeom[T <: Geometry](f: G => T): IdFeature[T, D] = 58 | // IdFeature(f(geom), data, id) 59 | 60 | // override def mapData[T](f: D => T): IdFeature[G, T] = 61 | // IdFeature(geom, f(data), id) 62 | // } 63 | 64 | def timedIntersect[G <: Geometry](geom: G, ex: Extent)(implicit ec: ExecutionContext) = { 65 | val future = Future { geom.&(ex) } 66 | Try(Await.result(future, 5000 milliseconds)) match { 67 | case Success(res) => res 68 | case Failure(_) => 69 | logger.warn(s"Could not intersect $geom with $ex in 5000 milliseconds") 70 | NoResult 71 | } 72 | } 73 | 74 | case class VTContents(points: List[VectorTileFeature[Point]] = Nil, 75 | multipoints: List[VectorTileFeature[MultiPoint]] = Nil, 76 | lines: List[VectorTileFeature[LineString]] = Nil, 77 | multilines: List[VectorTileFeature[MultiLineString]] = Nil, 78 | polygons: List[VectorTileFeature[Polygon]] = Nil, 79 | multipolygons: List[VectorTileFeature[MultiPolygon]] = Nil) { 80 | def +(other: VTContents) = VTContents(points ++ other.points, 81 | multipoints ++ other.multipoints, 82 | lines ++ other.lines, 83 | multilines ++ other.multilines, 84 | polygons ++ other.polygons, 85 | multipolygons ++ other.multipolygons) 86 | def +[G <: Geometry](other: VectorTileFeature[G]) = other.geom match { 87 | case p : Point => copy(points=other.asInstanceOf[VectorTileFeature[Point]] :: points) 88 | case mp: MultiPoint => copy(multipoints=other.asInstanceOf[VectorTileFeature[MultiPoint]] :: multipoints) 89 | case l : LineString => copy(lines=other.asInstanceOf[VectorTileFeature[LineString]] :: lines) 90 | case ml: MultiLineString => copy(multilines=other.asInstanceOf[VectorTileFeature[MultiLineString]] :: multilines) 91 | case p : 
Polygon => copy(polygons=other.asInstanceOf[VectorTileFeature[Polygon]] :: polygons) 92 | case mp: MultiPolygon => copy(multipolygons=other.asInstanceOf[VectorTileFeature[MultiPolygon]] :: multipolygons) 93 | } 94 | } 95 | object VTContents { 96 | def empty() = VTContents() 97 | } 98 | 99 | def buildLayer[G <: Geometry](features: Iterable[VectorTileFeature[G]], layerName: String, ex: Extent, tileWidth: Int): Layer = { 100 | val contents = features.foldLeft(VTContents.empty){ (accum, feature) => accum + feature } 101 | val VTContents(pts, mpts, ls, mls, ps, mps) = contents 102 | StrictLayer( 103 | name=layerName, 104 | tileWidth=tileWidth, 105 | version=2, 106 | tileExtent=ex, 107 | points=pts.map(vtf2mvtf), 108 | multiPoints=mpts.map(vtf2mvtf), 109 | lines=ls.map(vtf2mvtf), 110 | multiLines=mls.map(vtf2mvtf), 111 | polygons=ps.map(vtf2mvtf), 112 | multiPolygons=mps.map(vtf2mvtf) 113 | ) 114 | } 115 | 116 | def buildSortedLayer[G <: Geometry](features: Iterable[VectorTileFeature[G]], layerName: String, ex: Extent, tileWidth: Int): Layer = { 117 | val contents = features.foldLeft(VTContents.empty){ (accum, feature) => accum + feature } 118 | val VTContents(pts, mpts, ls, mls, ps, mps) = contents 119 | StrictLayer( 120 | name=layerName, 121 | tileWidth=tileWidth, 122 | version=2, 123 | tileExtent=ex, 124 | points=pts.map(vtf2mvtf), 125 | multiPoints=mpts.map(vtf2mvtf), 126 | lines=ls.map(vtf2mvtf), 127 | multiLines=mls.map(vtf2mvtf), 128 | polygons=ps.sortWith(_.getArea > _.getArea).map(vtf2mvtf), 129 | multiPolygons=mps.sortWith(_.getArea > _.getArea).map(vtf2mvtf) 130 | ) 131 | } 132 | 133 | def buildVectorTile[G <: Geometry]( 134 | features: Iterable[VectorTileFeature[G]], 135 | layerName: String, 136 | ex: Extent, 137 | tileWidth: Int, 138 | sorted: Boolean 139 | ): VectorTile = { 140 | val layer = 141 | if (sorted) 142 | buildSortedLayer(features, layerName, ex, tileWidth) 143 | else 144 | buildLayer(features, layerName, ex, tileWidth) 145 | VectorTile(Map(layerName -> layer), ex) 146 | } 147 | 148 | def buildVectorTile[G <: Geometry]( 149 | layerFeatures: Map[String, Iterable[VectorTileFeature[G]]], 150 | ex: Extent, 151 | tileWidth: Int, 152 | sorted: Boolean 153 | ): VectorTile = { 154 | VectorTile(layerFeatures.map{ case (layerName, features) => (layerName, 155 | if (sorted) 156 | buildSortedLayer(features, layerName, ex, tileWidth) 157 | else 158 | buildLayer(features, layerName, ex, tileWidth)) 159 | }, ex) 160 | } 161 | 162 | } 163 | -------------------------------------------------------------------------------- /src/main/tut/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: home 3 | title: "Home" 4 | section: "section_home" 5 | position: 1 6 | technologies: 7 | - first: ["GeoTrellis", "Geographic data processing engine for high performance applications"] 8 | - second: ["Apache Spark", "An engine for large-scale data processing"] 9 | - third: ["Scala", "Functional Programming on the JVM"] 10 | --- 11 | 12 | # VectorPipe 13 | 14 | VectorPipe is a Scala library for transforming vector data of arbitrary 15 | sources into [Mapbox Vector Tiles](https://www.mapbox.com/vector-tiles/). It 16 | uses the VectorTile codec from the [GeoTrellis library 17 | suite](https://geotrellis.io/), which in turn is powered by [Apache 18 | Spark](https://spark.apache.org/). 
19 | 20 | Currently VectorPipe can process: 21 | 22 | - OpenStreetMap XML / PBF* / ORC 23 | 24 | And produce: 25 | 26 | - Analytic Vector Tiles (AVTs) 27 | - Custom Vector Tile schemes (by writing a custom *Collator* function) 28 | 29 | Of course, you're not limited to just producing Vector Tiles. Once you've 30 | extracted your raw data into [GeoTrellis](https://geotrellis.io/) Geometries, 31 | you can do whatever you want with them (analytics, rasterizing, etc.). 32 | 33 | ### Dependencies 34 | 35 | - Scala 2.11 36 | - Apache Spark 2.1.0+ 37 | 38 | ### Getting Started 39 | 40 | To use VectorPipe, add the following to your `build.sbt`: 41 | 42 | ``` 43 | resolvers += Resolver.bintrayRepo("azavea", "maven") 44 | 45 | libraryDependencies += "com.azavea" %% "vectorpipe" % "0.1.0" 46 | ``` 47 | 48 | Now import the following, and you're good to go: 49 | 50 | ```tut:silent 51 | import vectorpipe._ 52 | ``` 53 | 54 | ### Performance 55 | 56 | Wow, fast! 57 | 58 | ### Related Projects 59 | 60 | - [OpenMapTiles](https://openmaptiles.org/) 61 | - [Mapbox](https://www.mapbox.com/) 62 | -------------------------------------------------------------------------------- /src/main/tut/outputs.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Outputs" 4 | section: "section4" 5 | position: 4 6 | --- 7 | 8 | Types of VectorTiles! 9 | 10 | - AVTs 11 | - OpenMapTiles 12 | - Custom! 13 | -------------------------------------------------------------------------------- /src/main/tut/sources.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Data Sources" 4 | section: "section3" 5 | position: 3 6 | --- 7 | 8 | Sources of Vector data! 9 | -------------------------------------------------------------------------------- /src/main/tut/usage.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: "Usage" 4 | section: "usage" 5 | position: 2 6 | --- 7 | 8 | {% include_relative usage/usage.md %} 9 | -------------------------------------------------------------------------------- /src/main/tut/usage/concepts.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: "Concepts" 4 | section: "usage" 5 | --- 6 | 7 | # Concepts 8 | 9 | VectorPipe strives to be straight-forward. With only a few simple function 10 | applications we can transform completely raw data into a grid of 11 | VectorTiles, ready for further processing. "Clipping" and "Collation" 12 | functions help us customize this process along the way. 13 | 14 | 15 | 16 | ### Data Sources 17 | 18 | Some source of Vector (re: geometric) data on the earth. Could come in any 19 | format (example: OpenStreetMap). 20 | 21 | For each data source that has first-class support, we expose a 22 | `vectorpipe.*` module with a matching name. Example: `vectorpipe.osm`. These 23 | modules expose all the types and functions necessary for transforming the 24 | raw data into the "Middle Ground" types. 25 | 26 | No first-class support for your favourite data source? Want to write it 27 | yourself, and maybe even keep it private? That's okay, just provide the 28 | function `YourData => RDD[Feature[G, D]]` and VectorPipe can handle the 29 | rest. 30 | 31 | ### The "Middle Ground" 32 | 33 | A collection of Geometries on the earth. The actual data can be distributed 34 | across multiple machines via Spark's `RDD` type. 
From this "middle ground", 35 | we can proceed with creating Vector Tiles, or (with the right supporting 36 | code) we could convert *back* into the format of the original source data. 37 | 38 | Note that via the method `VectorTile.toIterable`, the following conversion 39 | is possible: 40 | 41 | ```tut:silent 42 | import geotrellis.spark._ 43 | import geotrellis.vector._ 44 | import geotrellis.vectortile._ 45 | import org.apache.spark._ 46 | import org.apache.spark.rdd.RDD 47 | 48 | implicit val sc: SparkContext = new SparkContext( 49 | new SparkConf().setMaster("local[*]").setAppName("back-to-middle-ground") 50 | ) 51 | 52 | /* Mocked as `empty` for the example */ 53 | val tiles: RDD[(SpatialKey, VectorTile)] = sc.emptyRDD 54 | 55 | /* A VT layer converted back to the "middle ground", possibly for recollation */ 56 | val backToMiddle: RDD[(SpatialKey, Iterable[Feature[Geometry, Map[String, Value]]])] = 57 | tiles.mapValues(_.toIterable) 58 | 59 | /* Close up Spark nicely */ 60 | sc.stop() 61 | ``` 62 | 63 | ### Clipping Functions 64 | 65 | GeoTrellis has a consistent `RDD[(K, V)]` pattern for handling grids of 66 | tiled data, where `K` is the grid index and `V` is the actual value type. 67 | Before `RDD[(SpatialKey, VectorTile)]` can be achieved, we need to convert 68 | our gridless `RDD[Feature[G, D]]` into such a grid, such that each Feature's 69 | `Geometry` is reasonably clipped to the size of an individual tile. Depending 70 | on which clipping function you choose (from the `vectorpipe.Clip` object, or 71 | even your own custom one) the shape of the clipped Geometry will vary. See 72 | our Scaladocs for more detail on the available options. 73 | 74 | Admittedly, we sometimes can't guarantee the validity of incoming vector data. 75 | Clipping is known to occasionally fail on large, complex multipolygons, so 76 | we skip over these failures while optionally allowing to log them. Any logging 77 | framework can be used. 78 | 79 | ### Collation Functions 80 | 81 | Once clipped and gridded by `VectorPipe.toGrid`, we have a `RDD[(SpatialKey, 82 | Iterable[Feature[G, D]])]` that represents all the Geometry fragments 83 | present at each tiled location on the earth. This is the perfect shape to 84 | turn into a `VectorTile`. To do so, we need to choose a *Collator* function, 85 | which determines what VectorTile Layer each `Feature` should be placed into, 86 | and how (if at all) its corresponding metadata (the `D`) should be 87 | processed. 88 | 89 | Want to write your own Collator? The `Collate.generically` function will be 90 | of interest to you. 91 | 92 | ### Output Targets 93 | 94 | We can imagine two possible outputs for our completed grid of Vector Tiles: 95 | 96 | - A compressed GeoTrellis layer, saved to S3 [or 97 | elsewhere](https://geotrellis.readthedocs.io/en/latest/guide/tile-backends.html) 98 | - A dump of every tile as an `.mvt`, readable by other software 99 | 100 | Either option is simple, but outputting an `RDD[(SpatialKey, VectorTile)]` 101 | isn't actually the concern of VectorPipe - it can be handled entirely in 102 | client code via GeoTrellis functionality. An example of this can be found 103 | [in this repository](https://github.com/fosskers/vectorpipe-io). 
104 | -------------------------------------------------------------------------------- /src/main/tut/usage/osm.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: "Reading OpenStreetMap Data" 4 | section: "usage" 5 | --- 6 | 7 | ## From XML 8 | 9 | OSM XML files usually appear with the extension `.osm`. Since the data is all string-based, 10 | these files can be quite large compared to their PBF or ORC equivalents. 11 | 12 | ```tut:silent 13 | import org.apache.spark._ 14 | import scala.util.{Success, Failure} 15 | import vectorpipe._ 16 | 17 | implicit val sc: SparkContext = new SparkContext( 18 | new SparkConf().setMaster("local[*]").setAppName("xml-example") 19 | ) 20 | 21 | val path: String = "/some/path/on/your/machine/foo.osm" 22 | 23 | osm.fromLocalXML(path) match { 24 | case Failure(e) => { } /* Parsing failed somehow... is the filepath correct? */ 25 | case Success((ns,ws,rs)) => { } /* (RDD[(Long, Node)], RDD[(Long, Way)], RDD[(Long, Relation)]) */ 26 | } 27 | 28 | sc.stop() 29 | ``` 30 | 31 | ## From PBF 32 | 33 | For the time being, `.osm.pbf` files can be used by first converting them to `.orc` 34 | files using the [osm2orc](https://github.com/mojodna/osm2orc) tool, and then following 35 | VectorPipe's ORC instructions given below. 36 | 37 | ## From ORC 38 | 39 | You must first include an extra dependency to the `libraryDependencies` list in your `build.sbt`: 40 | 41 | ``` 42 | "org.apache.spark" %% "spark-hive" % "2.2.0" 43 | ``` 44 | 45 | And then we can read our OSM data in parallel via Spark. Notice the use of `SparkSession` 46 | instead of `SparkContext` here: 47 | 48 | ```tut:silent 49 | import org.apache.spark.sql._ 50 | import scala.util.{Success, Failure} 51 | import vectorpipe._ 52 | 53 | implicit val ss: SparkSession = 54 | SparkSession.builder.master("local[*]").appName("orc-example").enableHiveSupport.getOrCreate 55 | 56 | val path: String = "s3://bucket/key/foo.orc" 57 | 58 | osm.fromORC(path) match { 59 | case Failure(err) => { } /* Does the file exist? Do you have the right AWS credentials? */ 60 | case Success((ns,ws,rs)) => { } /* (RDD[(Long, Node)], RDD[(Long, Way)], RDD[(Long, Relation)]) */ 61 | } 62 | 63 | ss.stop() 64 | ``` 65 | 66 | This approach will be particularly efficient when run on an EMR cluster, since 67 | EMR clusters have privileged access to S3. 68 | -------------------------------------------------------------------------------- /src/main/tut/usage/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | Writing a small executable that uses VectorPipe is straight-forward. The 4 | entire `main` isn't much more than: 5 | 6 | ```tut:silent 7 | import geotrellis.proj4.WebMercator 8 | import geotrellis.spark._ 9 | import geotrellis.spark.tiling._ 10 | import geotrellis.vectortile.VectorTile 11 | import org.apache.spark._ 12 | import org.apache.spark.rdd.RDD 13 | import vectorpipe._ /* All types and functions. Also exposes the `osm` submodule used below. 
*/ 14 | 15 | /* Initialize a `SparkContext`, necessary for all `RDD` work */ 16 | implicit val sc: SparkContext = new SparkContext( 17 | new SparkConf().setMaster("local[*]").setAppName("vectorpipe-example") 18 | ) 19 | 20 | /* Describe the dimensions of your data area */ 21 | val layout: LayoutDefinition = 22 | ZoomedLayoutScheme.layoutForZoom(15, WebMercator.worldExtent, 512) 23 | 24 | /* From an OSM data source, mocked as "empty" for this example */ 25 | val (nodes, ways, relations): (RDD[(Long, osm.Node)], RDD[(Long, osm.Way)], RDD[(Long, osm.Relation)]) = 26 | (sc.emptyRDD, sc.emptyRDD, sc.emptyRDD) 27 | 28 | /* All OSM Elements lifted into GeoTrellis Geometry types. 29 | * Note: type OSMFeature = Feature[Geometry, ElementData] 30 | */ 31 | val features: RDD[osm.OSMFeature] = 32 | osm.features(nodes, ways, relations).geometries 33 | 34 | /* All Geometries clipped to your `layout` grid */ 35 | val featGrid: RDD[(SpatialKey, Iterable[osm.OSMFeature])] = 36 | grid(Clip.byHybrid, logToStdout, layout, features) 37 | 38 | /* A grid of Vector Tiles */ 39 | val tiles: RDD[(SpatialKey, VectorTile)] = 40 | vectortiles(Collate.byOSM, layout, featGrid) 41 | 42 | /* Further processing here, writing to S3, etc. */ 43 | 44 | /* Halt Spark nicely */ 45 | sc.stop() 46 | ``` 47 | 48 | A full example of processing some OSM XML [can be found 49 | here](https://github.com/fosskers/vectorpipe-io). 50 | -------------------------------------------------------------------------------- /src/test/resources/.gitignore: -------------------------------------------------------------------------------- 1 | !*.orc 2 | -------------------------------------------------------------------------------- /src/test/resources/isle-of-man-latest.osm.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/isle-of-man-latest.osm.orc -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.out 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | # log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c: %m%n 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 7 | log4j.logger.osmesa=DEBUG -------------------------------------------------------------------------------- /src/test/resources/relation-110564.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-110564.orc -------------------------------------------------------------------------------- /src/test/resources/relation-110564.wkt: -------------------------------------------------------------------------------- 1 | MULTIPOLYGON (((-85.982597 34.392855, -85.979971 34.392887, -85.979885 34.392888, -85.976397 34.392888, -85.969464 34.39293, -85.96946 34.392309, -85.969469 34.390934, -85.969527 34.38977, -85.969542 34.389184, -85.978326 34.389014, -85.980136 34.388975, -85.982477 34.38893, -85.982539 34.390479, -85.982597 34.392855)), ((-85.991477 34.381899, -85.991142 34.3819, -85.987173 34.381924, -85.987092 34.389011, -85.983147 
34.388938, -85.982574 34.388926, -85.982477 34.38893, -85.982563 34.387774, -85.982684 34.386311, -85.982673 34.385681, -85.982714 34.382035, -85.986874 34.381928, -85.987021 34.381777, -85.987076 34.38115, -85.98709 34.378146, -85.99014 34.378181, -85.990431 34.378189, -85.991498 34.378197, -85.991477 34.381899)), ((-85.969523 34.400126, -85.96934 34.400318, -85.969242 34.402769, -85.969203 34.403786, -85.968504 34.403761, -85.966925 34.403706, -85.966119 34.403681, -85.96504 34.403639, -85.965057 34.402798, -85.965119 34.400062, -85.969291 34.400119, -85.96938 34.396425, -85.972964 34.396491, -85.973719 34.396504, -85.97364 34.40018, -85.969523 34.400126)), ((-85.965119 34.400062, -85.962384 34.400035, -85.960656 34.400001, -85.960724 34.397709, -85.960747 34.397073, -85.96075 34.396963, -85.960772 34.396316, -85.963846 34.396282, -85.965205 34.396349, -85.965205 34.396921, -85.965119 34.400062)), ((-85.96938 34.396425, -85.965205 34.396349, -85.965206 34.394604, -85.965209 34.393635, -85.965212 34.392946, -85.967966 34.392944, -85.969464 34.39293, -85.96938 34.396425))) 2 | -------------------------------------------------------------------------------- /src/test/resources/relation-191199.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-191199.orc -------------------------------------------------------------------------------- /src/test/resources/relation-191199.wkt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-191199.wkt -------------------------------------------------------------------------------- /src/test/resources/relation-191204.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-191204.orc -------------------------------------------------------------------------------- /src/test/resources/relation-191204.wkt: -------------------------------------------------------------------------------- 1 | MULTIPOLYGON EMPTY 2 | -------------------------------------------------------------------------------- /src/test/resources/relation-1949938.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-1949938.orc -------------------------------------------------------------------------------- /src/test/resources/relation-1949938.wkt: -------------------------------------------------------------------------------- 1 | POLYGON ((-71.342046 41.8333126, -71.3423551 41.8336345, -71.3424089 41.8338235, -71.3423936 41.8340698, -71.3420938 41.8353984, -71.3418786 41.8357421, -71.3412098 41.8367385, -71.3411176 41.8371451, -71.3410484 41.8376434, -71.3411483 41.8384852, -71.3413405 41.8388403, -71.3415711 41.839264, -71.342063 41.8398481, -71.342432 41.840312, -71.3427856 41.8407071, -71.3432852 41.8412511, -71.3434005 41.8414115, -71.3435465 41.8415718, -71.3435542 41.8416863, -71.3434927 41.8417951, -71.3434159 41.8418352, -71.3432156 41.8418625, -71.3429086 41.8418868, -71.3418709 41.8419669, -71.3410945 41.8421101, -71.3408101 41.842196, -71.3403951 41.8422189, -71.3402874 
41.8422991, -71.3401875 41.842488, -71.3400107 41.8425739, -71.3396187 41.8426197, -71.3393266 41.8425281, -71.3391037 41.8425625, -71.3390038 41.8426255, -71.3387117 41.8426197, -71.3385657 41.8425567, -71.3383197 41.842425, -71.3375818 41.8420013, -71.3372128 41.8418639, -71.3371052 41.8417722, -71.3371206 41.8415432, -71.3372436 41.8413714, -71.3374204 41.8412511, -71.3374204 41.8410851, -71.3372897 41.8410106, -71.3369899 41.8410965, -71.336744 41.8411366, -71.3362751 41.8410736, -71.3359061 41.8409763, -71.3355372 41.8407415, -71.3352297 41.8404666, -71.335076 41.8402662, -71.335076 41.8400657, -71.3351221 41.839768, -71.3351913 41.839579, -71.3355372 41.8391438, -71.3358523 41.8390292, -71.3361367 41.8389548, -71.3365134 41.8389548, -71.3367824 41.8390063, -71.3369438 41.8390235, -71.3370975 41.8390063, -71.3372052 41.8389204, -71.3373051 41.8386685, -71.3373819 41.8384566, -71.3373973 41.8383936, -71.3375972 41.8380958, -71.337651 41.8378954, -71.3376663 41.8376434, -71.3375126 41.8375059, -71.337282 41.8374601, -71.3370745 41.8375117, -71.3367824 41.8377465, -71.3365979 41.8379698, -71.3363904 41.8381359, -71.3362136 41.8382676, -71.335983 41.8384165, -71.3358139 41.8384222, -71.3356525 41.8383649, -71.3355295 41.8382447, -71.3355141 41.8381359, -71.3356294 41.8379297, -71.3357524 41.8378553, -71.33586 41.8376949, -71.3358446 41.8374945, -71.3358446 41.8372482, -71.3359984 41.8370993, -71.3361214 41.8370134, -71.3365595 41.8369562, -71.3368746 41.8368416, -71.3370207 41.8367042, -71.3370053 41.8365037, -71.3367286 41.8364465, -71.3365979 41.8363434, -71.3365595 41.8362346, -71.3365518 41.8360456, -71.3366902 41.8359081, -71.336767 41.8357363, -71.3368746 41.8355588, -71.3370822 41.8353927, -71.337136 41.8352381, -71.3371667 41.8350491, -71.3371513 41.8348258, -71.3370591 41.8346654, -71.3370591 41.8345509, -71.3371206 41.8343905, -71.337259 41.834316, -71.3373281 41.8342072, -71.3373743 41.83415, -71.337159 41.8337376, -71.3371437 41.8336173, -71.3371975 41.83352, -71.3372974 41.8334856, -71.3375126 41.8335715, -71.3378969 41.8337033, -71.3380968 41.8336116, -71.3381583 41.8335314, -71.3382813 41.8333596, -71.3385426 41.833142, -71.3388501 41.8329244, -71.3391037 41.8327697, -71.3393266 41.8325406, -71.3395649 41.8324032, -71.3398263 41.8322715, -71.3400338 41.832197, -71.3402029 41.8322027, -71.3403874 41.8322829, -71.3415865 41.8330561, -71.3417633 41.8331592, -71.342046 41.8333126), (-71.3413636 41.8354672, -71.3414174 41.8353641, -71.3414942 41.835324, -71.3415942 41.8353984, -71.341525 41.8355073, -71.3414174 41.8356275, -71.3413482 41.8355817, -71.3413636 41.8354672), (-71.3411945 41.8359024, -71.3413482 41.8359425, -71.3413636 41.8360513, -71.3412559 41.8362346, -71.3411868 41.8364007, -71.3410715 41.8364407, -71.3410177 41.8363434, -71.3410484 41.8361372, -71.3411253 41.8359941, -71.3411945 41.8359024), (-71.3400799 41.841944, -71.3398954 41.8417379, -71.3398186 41.8415145, -71.3398186 41.8414286, -71.3400338 41.8413084, -71.3402106 41.8413943, -71.3402951 41.8415432, -71.3403643 41.8416749, -71.3403413 41.841778, -71.3402106 41.8419383, -71.3400799 41.841944)) 2 | -------------------------------------------------------------------------------- /src/test/resources/relation-2554903.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-2554903.orc 
-------------------------------------------------------------------------------- /src/test/resources/relation-2580685.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-2580685.orc -------------------------------------------------------------------------------- /src/test/resources/relation-2580685.wkt: -------------------------------------------------------------------------------- 1 | MULTIPOLYGON (((-71.4589656 41.799364, -71.4585222 41.7994796, -71.4586724 41.7990153, -71.4587368 41.7985914, -71.458887 41.7982555, -71.4592947 41.7980635, -71.4597882 41.7978955, -71.4601423 41.797712, -71.4604534 41.7974156, -71.4602925 41.7971037, -71.4600564 41.7967998, -71.4600564 41.7965438, -71.4601423 41.7962559, -71.4599599 41.7959519, -71.4596082 41.7958057, -71.4588853 41.7942745, -71.4589886 41.7938896, -71.4597229 41.7935902, -71.4599065 41.7935474, -71.4609162 41.7953609, -71.4611801 41.7962419, -71.461008 41.7966012, -71.4614784 41.797448, -71.4620408 41.7979202, -71.4622242 41.7985172, -71.4622109 41.799042, -71.46188 41.7993469, -71.4614585 41.7994322, -71.4610194 41.7993555, -71.460354 41.7990476, -71.4595852 41.7990133, -71.4589656 41.799364), (-71.4614062 41.7986518, -71.4612426 41.7988664, -71.4609089 41.7989786, -71.4606145 41.7989932, -71.4602024 41.7989005, -71.4600126 41.7988176, -71.4599407 41.7987152, -71.4600061 41.7985591, -71.4602285 41.7984079, -71.4606865 41.798286, -71.461151 41.7982421, -71.46138 41.7984079, -71.4614062 41.7986518)), ((-71.4584952 41.7995265, -71.4589656 41.799364, -71.4590804 41.8006898, -71.4592704 41.8011716, -71.4593557 41.8017247, -71.4592489 41.8021513, -71.4588976 41.8024816, -71.4593012 41.8030966, -71.4596196 41.8037689, -71.4610194 41.8044617, -71.4619832 41.8048123, -71.4625684 41.8046413, -71.4626372 41.8042735, -71.4634404 41.8040854, -71.4640944 41.8041196, -71.4647713 41.8037774, -71.4648861 41.8032814, -71.464278 41.8029307, -71.4645304 41.8026912, -71.4652532 41.8026912, -71.4659072 41.8028281, -71.4662973 41.802734, -71.4670087 41.801682, -71.4669743 41.800724, -71.4671578 41.8007155, -71.4673185 41.8015708, -71.4667907 41.8029649, -71.4662744 41.8035636, -71.4654483 41.8041281, -71.4648631 41.8048038, -71.463151 41.805115, -71.462361 41.804966, -71.4614784 41.8049748, -71.4605375 41.8044702, -71.4597114 41.8042307, -71.4589656 41.8034524, -71.4586903 41.8028537, -71.458677 41.8027838, -71.4586418 41.802703, -71.458621 41.8024886, -71.4586706 41.8023715, -71.4587706 41.8021353, -71.459046 41.801853, -71.4587132 41.8009806, -71.4586788 41.8001424, -71.4584952 41.7995265))) 2 | -------------------------------------------------------------------------------- /src/test/resources/relation-3080946.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-3080946.orc -------------------------------------------------------------------------------- /src/test/resources/relation-3105056.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-3105056.orc -------------------------------------------------------------------------------- /src/test/resources/relation-3105056.wkt: 
-------------------------------------------------------------------------------- 1 | POLYGON ((-71.3264288 41.5069067, -71.3263203 41.5069497, -71.3262056 41.5070241, -71.3261505 41.5070885, -71.326092 41.5071901, -71.3260291 41.5073693, -71.325963 41.5075585, -71.325922 41.5076731, -71.325883 41.5076947, -71.3257676 41.5076736, -71.3256132 41.5076263, -71.3255477 41.507603, -71.3254939 41.5075638, -71.3254589 41.5075179, -71.3254449 41.5074619, -71.3254597 41.5074, -71.3255173 41.5072859, -71.3255791 41.5071711, -71.3255736 41.5068152, -71.3255687 41.50674, -71.325611 41.5067513, -71.3256436 41.5067582, -71.3256705 41.5067586, -71.3256924 41.5067488, -71.3261404 41.50684, -71.3261668 41.5068473, -71.3261869 41.5068513, -71.3262246 41.5068589, -71.3262246 41.5068679, -71.3262354 41.506883, -71.3262579 41.5068973, -71.3262824 41.5069051, -71.3263218 41.506903, -71.3263529 41.5068893, -71.3264288 41.5069067)) 2 | -------------------------------------------------------------------------------- /src/test/resources/relation-333501.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-333501.orc -------------------------------------------------------------------------------- /src/test/resources/relation-333501.wkt: -------------------------------------------------------------------------------- 1 | POLYGON ((-71.3364289 41.4799458, -71.3366843 41.4790572, -71.3368865 41.478988, -71.3368439 41.4783601, -71.336663 41.4783761, -71.3365442 41.4781598, -71.3369364 41.4780481, -71.3381528 41.477708, -71.3383414 41.4776608, -71.3384978 41.4776906, -71.3386815 41.4778123, -71.339168 41.4781548, -71.3392847 41.4782367, -71.3387163 41.4788549, -71.3384134 41.4792098, -71.3385822 41.4793315, -71.3382744 41.4797708, -71.3382223 41.4798627, -71.3377854 41.4798081, -71.3377208 41.4796616, -71.3369141 41.4796889, -71.3369066 41.4798677, -71.3365343 41.4799322, -71.3364289 41.4799458), (-71.3370199 41.4782179, -71.3383813 41.4778507, -71.3389892 41.4782749, -71.3388056 41.4784946, -71.3382821 41.4791493, -71.3381738 41.4792934, -71.3383417 41.4794107, -71.3381156 41.4797654, -71.3378676 41.4797206, -71.3378061 41.4795872, -71.337443 41.4796016, -71.3374347 41.4795441, -71.3372111 41.4795585, -71.3372193 41.4796118, -71.3368911 41.4796057, -71.3368911 41.4795708, -71.3368378 41.479577, -71.3368234 41.4796241, -71.3367023 41.4796077, -71.3367844 41.4792856, -71.3370347 41.4792097, -71.3370199 41.4782179)) 2 | -------------------------------------------------------------------------------- /src/test/resources/relation-393502.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-393502.orc -------------------------------------------------------------------------------- /src/test/resources/relation-393502.wkt: -------------------------------------------------------------------------------- 1 | POLYGON ((-71.4562227 41.6485987, -71.4561617 41.6486401, -71.4560719 41.648621, -71.4559852 41.648499, -71.4560684 41.6483234, -71.4561891 41.6481462, -71.456312 41.6479512, -71.4561934 41.6478207, -71.4558499 41.6476399, -71.4556242 41.6475887, -71.4553623 41.6474994, -71.455278 41.6475307, -71.4554576 41.6477624, -71.4556047 41.6478445, -71.4554322 41.6479837, -71.4554172 41.6482545, -71.4551891 41.6484802, 
-71.4547765 41.6488669, -71.4544546 41.6490673, -71.45424 41.6491475, -71.4540314 41.649123, -71.4535844 41.6491252, -71.4529913 41.6493056, -71.4527857 41.6494481, -71.4527976 41.6497243, -71.4528423 41.6500026, -71.4529466 41.6503055, -71.4528852 41.650835, -71.4529712 41.6510973, -71.4529405 41.6516331, -71.4529913 41.6520714, -71.4528935 41.6522887, -71.4521329 41.6531935, -71.4518542 41.6534193, -71.4516462 41.653417, -71.4515167 41.6533123, -71.4513968 41.6533739, -71.4515094 41.6534801, -71.4514263 41.6535232, -71.4513257 41.653429, -71.4511044 41.6535533, -71.450994 41.6536187, -71.4510103 41.6537167, -71.4503749 41.6541715, -71.4501844 41.654423, -71.449888 41.6546385, -71.4497378 41.6547958, -71.4495944 41.6548359, -71.449477 41.6549516, -71.449357 41.6549751, -71.4492382 41.6549484, -71.4491915 41.6549919, -71.4491545 41.6553574, -71.4491894 41.6556444, -71.4488881 41.6563126, -71.4486687 41.6565095, -71.4485691 41.6565879, -71.448441 41.6566786, -71.4482743 41.6567632, -71.4475918 41.6569869, -71.4473949 41.6571615, -71.4472649 41.6574506, -71.4469975 41.6577686, -71.446965 41.6583328, -71.4467593 41.6585282, -71.4464288 41.6586792, -71.4463475 41.6586284, -71.446196 41.6587191, -71.4462188 41.6587466, -71.4457896 41.6589791, -71.4454817 41.6590878, -71.4453846 41.6592176, -71.4454959 41.6592606, -71.4453015 41.6593598, -71.4452371 41.6593168, -71.4449099 41.6596544, -71.4447699 41.6596156, -71.4447248 41.6598738, -71.4449286 41.6599069, -71.4449152 41.659957, -71.4447476 41.659935, -71.4446804 41.6602495, -71.4445897 41.6604577, -71.4445357 41.660459, -71.4445424 41.6608086, -71.4446044 41.6610053, -71.4445981 41.6610969, -71.444818 41.6610641, -71.4448737 41.6612109, -71.4449199 41.6612159, -71.4449065 41.6613427, -71.4449374 41.6614677, -71.4450876 41.661627, -71.4451607 41.6617069, -71.4451862 41.6617783, -71.4451318 41.6617983, -71.4452552 41.6620057, -71.4452773 41.6620019, -71.4452597 41.6622439, -71.4452016 41.6622652, -71.4450983 41.66267, -71.4449401 41.6629243, -71.4449405 41.6631424, -71.4451117 41.6631449, -71.4449562 41.6635917, -71.4448563 41.6640225, -71.4428492 41.664013, -71.4409667 41.6640028, -71.4410086 41.6635043, -71.4409704 41.6630459, -71.4406563 41.6627032, -71.4405231 41.6624939, -71.4405799 41.6621481, -71.4410582 41.6614303, -71.4410348 41.6611838, -71.4411192 41.6609324, -71.441648 41.6604529, -71.441765 41.6602931, -71.4417262 41.6601353, -71.441636 41.6599996, -71.441442 41.6597342, -71.4414015 41.6595652, -71.441427 41.6591274, -71.4415719 41.6588499, -71.4415607 41.6585328, -71.4420302 41.6575009, -71.4421048 41.6572404, -71.4421307 41.6567142, -71.4424257 41.6562739, -71.4432187 41.6557317, -71.4437089 41.65514, -71.4438941 41.6547956, -71.4441515 41.6545408, -71.4442445 41.6543352, -71.4446573 41.654132, -71.4458846 41.653978, -71.4465442 41.6537136, -71.4468603 41.6534841, -71.4470949 41.6532021, -71.4473923 41.6525943, -71.4478523 41.652443, -71.4483847 41.6523728, -71.448728 41.6521774, -71.4489359 41.6519299, -71.4489171 41.6518417, -71.4487246 41.651737, -71.4486033 41.6515712, -71.4486176 41.6513594, -71.4489403 41.6510273, -71.4492344 41.6508874, -71.449339 41.6508027, -71.4493249 41.6506844, -71.4492149 41.6503867, -71.4494166 41.6495338, -71.449869 41.6492269, -71.4503998 41.6486283, -71.4504804 41.6483137, -71.4503091 41.6480198, -71.450056 41.6477186, -71.4501647 41.6474817, -71.4502232 41.6471998, -71.4510559 41.6469478, -71.4512425 41.6469841, -71.451467 41.6469178, -71.4515282 41.646955, -71.4515222 41.6477833, -71.4516529 
41.6478644, -71.4517596 41.6478784, -71.4518637 41.6478163, -71.4521768 41.6474964, -71.4525273 41.6474225, -71.4530853 41.6470536, -71.4531583 41.6470422, -71.4532339 41.6470851, -71.453267 41.647215, -71.4532408 41.6472952, -71.4532515 41.6473503, -71.4533246 41.6474295, -71.4533823 41.647455, -71.4533743 41.6475001, -71.4535238 41.6476389, -71.4535612 41.6477262, -71.4537196 41.6478033, -71.4540525 41.6477985, -71.4541949 41.6476977, -71.4543566 41.647678, -71.4546696 41.6474009, -71.4553603 41.6472376, -71.4557675 41.6473042, -71.4561148 41.6475294, -71.4563012 41.6476389, -71.4565258 41.6476734, -71.4565082 41.6480817, -71.4563099 41.6482686, -71.4561667 41.6485057, -71.4562227 41.6485987)) -------------------------------------------------------------------------------- /src/test/resources/relation-5448156.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-5448156.orc -------------------------------------------------------------------------------- /src/test/resources/relation-5448691.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-5448691.orc -------------------------------------------------------------------------------- /src/test/resources/relation-5612959.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-5612959.orc -------------------------------------------------------------------------------- /src/test/resources/relation-61315.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-61315.orc -------------------------------------------------------------------------------- /src/test/resources/relation-61315.wkt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-61315.wkt -------------------------------------------------------------------------------- /src/test/resources/relation-6710544.orc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geotrellis/vectorpipe/a68f4cfae070371b5a16d9094c9ec4d87b7978b1/src/test/resources/relation-6710544.orc -------------------------------------------------------------------------------- /src/test/resources/view/cluster-view.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | Point cluster view 11 | 12 | 13 | 14 | 18 | 19 | 20 | 21 |
22 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/test/resources/view/layer-test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | Add a third party vector tile source 11 | 12 | 13 | 14 | 59 | 60 | 61 | 62 | 63 |
64 | 65 | 66 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/MultiPolygonRelationReconstructionSpec.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe 2 | 3 | import java.sql.Timestamp 4 | 5 | import geotrellis.spark.store.kryo.KryoRegistrator 6 | import geotrellis.vector._ 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.serializer.KryoSerializer 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.functions._ 11 | import org.scalatest.prop.{TableDrivenPropertyChecks, Tables} 12 | import org.scalatest.{Matchers, PropSpec} 13 | import vectorpipe.model.Member 14 | import org.locationtech.jts.io.WKTReader 15 | import org.locationtech.geomesa.spark.jts._ 16 | import vectorpipe.relations.MultiPolygons.build 17 | 18 | import scala.io.Source 19 | 20 | case class Fixture(id: Int, members: DataFrame, wkt: Seq[String]) 21 | 22 | trait SparkPoweredTables extends Tables { 23 | def wktReader = new WKTReader() 24 | 25 | val spark: SparkSession = SparkSession 26 | .builder 27 | .config( 28 | /* Settings compatible with both local and EMR execution */ 29 | new SparkConf() 30 | .setAppName(getClass.getName) 31 | .setIfMissing("spark.master", "local[*]") 32 | .setIfMissing("spark.serializer", classOf[KryoSerializer].getName) 33 | .setIfMissing("spark.kryo.registrator", classOf[KryoRegistrator].getName) 34 | .setIfMissing("spark.sql.orc.impl", "native") 35 | ).getOrCreate() 36 | spark.withJTS 37 | 38 | def relation(relation: Int): Fixture = Fixture(relation, orc(s"relation-$relation.orc"), readWktFile(s"relation-$relation.wkt")) 39 | 40 | def orc(filename: String): DataFrame = spark.read.orc(getClass.getResource("/" + filename).getPath) 41 | 42 | // osm2pgsql -c -d rhode_island -j -K -l rhode-island-latest.osm.pbf 43 | // select ST_AsText(way) from planet_osm_polygon where osm_id=-333501; 44 | 45 | def readWktFile(filename: String): Seq[String] = 46 | try { 47 | Source.fromInputStream(getClass.getResourceAsStream("/" + filename)).getLines.toSeq match { 48 | case expected if expected.isEmpty => 49 | Seq() 50 | case expected => 51 | expected 52 | } 53 | } catch { 54 | case _: Exception => Seq("[not provided]") 55 | } 56 | 57 | def asGeoms(relations: DataFrame): Seq[Geometry] = { 58 | import relations.sparkSession.implicits._ 59 | 60 | relations.select('geom).collect.map { row => 61 | row.getAs[Geometry]("geom") 62 | } 63 | } 64 | } 65 | 66 | // osm2pgsql -c -d rhode_island -j -K -l rhode-island-latest.osm.pbf 67 | // select ST_AsText(way) from planet_osm_polygon where osm_id=-333501; 68 | // to debug / visually validate (geoms won't match exactly), load WKT into geojson.io from Meta → Load WKT String 69 | // https://www.openstreetmap.org/relation/64420 70 | // to find multipolygons: select osm_id from planet_osm_polygon where osm_id < 0 and ST_GeometryType(way) = 'ST_MultiPolygon' order by osm_id desc; 71 | class MultiPolygonRelationExamples extends SparkPoweredTables { 72 | def examples = Table("multipolygon relation", 73 | relation(333501), // unordered, single polygon with 1 hole 74 | relation(393502), // single polygon, multiple outer parts, no holes 75 | relation(1949938), // unordered, single polygon with multiple holes 76 | relation(3105056), // multiple unordered outer parts in varying directions 77 | relation(2580685), // multipolygon: 2 polygons, one with 1 hole 78 | relation(3080946), // multipolygon: many polygons, no holes 
79 | relation(5448156), // multipolygon made up of parcels 80 | relation(5448691), // multipolygon made up of parcels 81 | relation(6710544), // complex multipolygon 82 | relation(191199), // 4 segments; 2 are components of another (thus duplicates) 83 | relation(61315), // incomplete member list (sourced from an extract of a neighboring state) 84 | relation(2554903), // boundary w/ admin_centre + label node members 85 | relation(191204), // no members 86 | /* relation(5612959), // pathological case for unioning --- removed test, too pathological (address later?) */ 87 | relation(110564) // touching but not dissolve-able 88 | ) 89 | } 90 | 91 | class MultiPolygonRelationReconstructionSpec extends PropSpec with TableDrivenPropertyChecks with Matchers { 92 | property("should match expected WKT") { 93 | new MultiPolygonRelationExamples { 94 | forAll(examples) { fixture => 95 | import fixture.members.sparkSession.implicits._ 96 | 97 | // TODO rewrite fixtures with additional columns added below 98 | val actual: Seq[Geometry] = asGeoms(fixture.members 99 | .withColumn("version", lit(1)) 100 | .withColumn("minorVersion", lit(0)) 101 | .withColumn("updated", lit(Timestamp.valueOf("2001-01-01 00:00:00"))) 102 | .withColumn("validUntil", lit(Timestamp.valueOf("2002-01-01 00:00:00"))) 103 | .withColumn("geometry", st_geomFromWKB('geom)) 104 | .groupByKey { row => 105 | (row.getAs[Long]("changeset"), row.getAs[Long]("id"), row.getAs[Integer]("version"), row.getAs[Integer] 106 | ("minorVersion"), row.getAs[Timestamp]("updated"), row.getAs[Timestamp]("validUntil")) 107 | } 108 | .mapGroups { 109 | case ((changeset, id, version, minorVersion, updated, validUntil), rows) => 110 | val members = rows.toVector 111 | // TODO store Bytes as the type in fixtures 112 | val types = members.map { x => Member.typeFromString(x.getAs[String]("type")) } 113 | val roles = members.map(_.getAs[String]("role")) 114 | val geoms = members.map(_.getAs[Geometry]("geometry")) 115 | val mp = build(id, version, updated, types, roles, geoms).orNull 116 | 117 | (changeset, id, version, minorVersion, updated, validUntil, mp) 118 | } 119 | .toDF("changeset", "id", "version", "minorVersion", "updated", "validUntil", "geom") 120 | ).flatMap(Option.apply(_)) 121 | 122 | val expected = fixture.wkt.map(wktReader.read) 123 | 124 | try { 125 | actual should ===(expected) 126 | } catch { 127 | case e: Throwable => 128 | println(s"${fixture.id} actual:") 129 | actual.foreach(println) 130 | println(s"${fixture.id} expected:") 131 | fixture.wkt.foreach(println) 132 | 133 | throw e 134 | } 135 | } 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/ProcessOSMTest.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe 2 | 3 | import org.scalatest._ 4 | import vectorpipe.{internal => ProcessOSM} 5 | 6 | class ProcessOSMTest extends FunSpec with TestEnvironment with Matchers { 7 | val orcFile = getClass.getResource("/isle-of-man-latest.osm.orc").getPath 8 | 9 | val elements = ss.read.orc(orcFile) 10 | val nodes = ProcessOSM.preprocessNodes(elements).cache 11 | val nodeGeoms = ProcessOSM.constructPointGeometries(nodes).cache 12 | val wayGeoms = ProcessOSM.reconstructWayGeometries(elements, nodes).cache 13 | val relationGeoms = ProcessOSM.reconstructRelationGeometries(elements, wayGeoms).cache 14 | 15 | it("parses isle of man nodes") { 16 | info(s"Nodes: ${nodeGeoms.count}") 17 | } 18 | 19 | it("parses isle 
of man ways") { 20 | info(s"Ways: ${wayGeoms.count}") 21 | } 22 | 23 | it("parses isle of man relations") { 24 | info(s"Relations: ${relationGeoms.count}") 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/TestEnvironment.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Azavea 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package vectorpipe 18 | 19 | import org.apache.spark.serializer.KryoSerializer 20 | import org.apache.spark.sql.SparkSession 21 | import org.scalatest._ 22 | 23 | object TestEnvironment { 24 | } 25 | 26 | /* 27 | * These set of traits handle the creation and deletion of test directories on the local fs and hdfs, 28 | * It uses commons-io in at least one case (recursive directory deletion) 29 | */ 30 | trait TestEnvironment extends BeforeAndAfterAll { self: Suite with BeforeAndAfterAll => 31 | implicit val ss: SparkSession = SparkSession.builder 32 | .master("local[*]") 33 | .appName("VectorPipe Test") 34 | .config("spark.ui.enabled", "false") 35 | .config("spark.default.parallelism","8") 36 | .config("spark.serializer", classOf[KryoSerializer].getName) 37 | .config("spark.kryo.registrationRequired", "false") 38 | .config("spark.kryoserializer.buffer.max", "500m") 39 | .config("spark.sql.orc.impl", "native") 40 | .getOrCreate() 41 | 42 | // get the name of the class which mixes in this trait 43 | val name = this.getClass.getName 44 | 45 | override def beforeAll() = { 46 | ss.sparkContext.setJobGroup(this.getClass.getName, "test") 47 | } 48 | 49 | override def afterAll() = { 50 | ss.sparkContext.clearJobGroup() 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/functions/osm/FunctionSpec.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.functions.osm 2 | 3 | import org.apache.spark.sql.Row 4 | import org.scalatest.{FunSpec, Matchers} 5 | import vectorpipe.TestEnvironment 6 | 7 | class FunctionSpec extends FunSpec with TestEnvironment with Matchers { 8 | 9 | import ss.implicits._ 10 | 11 | describe("isArea") { 12 | it("marks 'area=*' appropriately") { 13 | Seq( 14 | Map("area" -> "yes") -> true, 15 | Map("area" -> "YES") -> true, 16 | Map("area" -> "YeS") -> true, 17 | Map("area" -> "1") -> true, 18 | Map("area" -> "true") -> true, 19 | Map("area" -> "True") -> true, 20 | Map("area" -> "no") -> false, 21 | Map("area" -> "no") -> false, 22 | Map("area" -> "0") -> false, 23 | Map("area" -> "something") -> false, 24 | Map("area" -> "yes;no") -> true, 25 | Map("area" -> "yes; no") -> true, 26 | Map("area" -> "yes ; no") -> true, 27 | Map("area" -> "yes ;no") -> true 28 | ) 29 | .toDF("tags", "value") 30 | .where(isArea('tags) =!= 'value) 31 | .count should equal(0) 32 | } 33 | 34 | it("respects area-keys") { 35 | Seq( 36 | Map("office" -> 
"architect") -> true, 37 | Map("waterway" -> "riverbank") -> true, 38 | Map("waterway" -> "canal") -> false, 39 | Map("aeroway" -> "aerodrome;apron") -> true, 40 | Map("aeroway" -> "aerodrome ; runway") -> true, 41 | Map("aeroway" -> "taxiway;runway") -> false 42 | ) 43 | .toDF("tags", "value") 44 | .where(isArea('tags) =!= 'value) 45 | .count should equal(0) 46 | } 47 | } 48 | 49 | describe("isMultiPolygon") { 50 | it("marks multipolygons and boundaries appropriately") { 51 | Seq( 52 | Map("type" -> "multipolygon") -> true, 53 | Map("type" -> "boundary") -> true, 54 | Map("type" -> "route") -> false, 55 | Map("type" -> "multipolygon;boundary") -> true, 56 | Map("type" -> "multipolygon ; boundary") -> true 57 | ) 58 | .toDF("tags", "value") 59 | .where(isMultiPolygon('tags) =!= 'value) 60 | .count should equal(0) 61 | } 62 | } 63 | 64 | describe("isRoute") { 65 | it("marks routes appropriately") { 66 | Seq( 67 | Map("type" -> "multipolygon") -> false, 68 | Map("type" -> "boundary") -> false, 69 | Map("type" -> "route") -> true, 70 | Map("type" -> "route;boundary") -> true, 71 | Map("type" -> "route ; boundary") -> true 72 | ) 73 | .toDF("tags", "value") 74 | .where(isRoute('tags) =!= 'value) 75 | .count should equal(0) 76 | } 77 | } 78 | 79 | describe("isBuilding") { 80 | it("marks buildings appropriately") { 81 | Seq( 82 | Map("building" -> "yes") -> true, 83 | Map("building" -> "no") -> false, 84 | Map("building" -> "false") -> false, 85 | Map("building" -> "farm") -> true, 86 | Map("building" -> "farm;apartments") -> true 87 | ) 88 | .toDF("tags", "value") 89 | .where(isBuilding('tags) =!= 'value) 90 | .count should equal(0) 91 | } 92 | } 93 | 94 | describe("isPOI") { 95 | it("marks POIs appropriately") { 96 | Seq( 97 | Map("amenity" -> "cafe") -> true, 98 | Map("shop" -> "bakery") -> true, 99 | Map("craft" -> "bakery") -> true, 100 | Map("office" -> "architect") -> true, 101 | Map("leisure" -> "disc_golf_course") -> true, 102 | Map("aeroway" -> "aerodrome") -> true, 103 | Map("highway" -> "motorway") -> false, 104 | Map("shop" -> "bakery ; dairy") -> true 105 | ) 106 | .toDF("tags", "value") 107 | .where(isPOI('tags) =!= 'value) 108 | .count should equal(0) 109 | } 110 | } 111 | 112 | describe("isRoad") { 113 | it("marks roads appropriately") { 114 | Seq( 115 | Map("highway" -> "motorway") -> true, 116 | Map("highway" -> "path") -> true, 117 | Map("highway" -> "path ;footway") -> true, 118 | Map("building" -> "yes") -> false 119 | ) 120 | .toDF("tags", "value") 121 | .where(isRoad('tags) =!= 'value) 122 | .count should equal(0) 123 | } 124 | } 125 | 126 | describe("isCoastline") { 127 | it("marks coastline appropriately") { 128 | Seq( 129 | Map("natural" -> "coastline") -> true, 130 | Map("natural" -> "water") -> false, 131 | Map("natural" -> "coastline ; water") -> true 132 | ) 133 | .toDF("tags", "value") 134 | .where(isCoastline('tags) =!= 'value) 135 | .count should equal(0) 136 | } 137 | } 138 | 139 | describe("isWaterway") { 140 | it("marks waterways appropriately") { 141 | Seq( 142 | Map("waterway" -> "river") -> true, 143 | Map("waterway" -> "riverbank") -> true, 144 | Map("waterway" -> "canal") -> true, 145 | Map("waterway" -> "stream") -> true, 146 | Map("waterway" -> "brook") -> true, 147 | Map("waterway" -> "drain") -> true, 148 | Map("waterway" -> "ditch") -> true, 149 | Map("waterway" -> "dam") -> true, 150 | Map("waterway" -> "weir") -> true, 151 | Map("waterway" -> "waterfall") -> true, 152 | Map("waterway" -> "pressurised") -> true, 153 | Map("waterway" -> "fuel") -> 
false, 154 | Map("waterway" -> "canal ; stream") -> true, 155 | Map("waterway" -> "canal ; fuel") -> true 156 | ) 157 | .toDF("tags", "value") 158 | .where(isWaterway('tags) =!= 'value) 159 | .count should equal(0) 160 | } 161 | } 162 | 163 | describe("removeUninterestingTags") { 164 | it("drops uninteresting tags") { 165 | Seq( 166 | Map("building" -> "yes", "created_by" -> "JOSM") 167 | ) 168 | .toDF("tags") 169 | .withColumn("tags", removeUninterestingTags('tags)) 170 | .collect() should equal(Array(Row(Map("building" -> "yes")))) 171 | } 172 | 173 | it("drops uninteresting single tags") { 174 | Seq( 175 | Map("building" -> "yes", "colour" -> "grey"), 176 | Map("colour" -> "grey") 177 | ) 178 | .toDF("tags") 179 | .withColumn("tags", removeUninterestingTags('tags)) 180 | .collect() should equal(Array(Row(Map("building" -> "yes", "colour" -> "grey")), Row(Map.empty))) 181 | } 182 | 183 | it("drops uninteresting prefixed tags") { 184 | Seq( 185 | Map("highway" -> "motorway", "tiger:reviewed" -> "no"), 186 | Map("building" -> "yes", "CLC:something" -> "something") 187 | ) 188 | .toDF("tags") 189 | .withColumn("tags", removeUninterestingTags('tags)) 190 | .collect() should equal(Array(Row(Map("highway" -> "motorway")), Row(Map("building" -> "yes")))) 191 | } 192 | 193 | it("drops tags with invalid keys") { 194 | Seq( 195 | Map("highway" -> "motorway", "k=v" -> "value"), 196 | Map("building" -> "yes", "land use" -> "something") 197 | ) 198 | .toDF("tags") 199 | .withColumn("tags", removeUninterestingTags('tags)) 200 | .collect() should equal(Array(Row(Map("highway" -> "motorway")), Row(Map("building" -> "yes")))) 201 | } 202 | } 203 | 204 | describe("removeSemiInterestingTags") { 205 | it("drops semi-interesting tags") { 206 | Seq( 207 | Map("building" -> "yes", "source" -> "MassGIS") 208 | ) 209 | .toDF("tags") 210 | .withColumn("tags", removeSemiInterestingTags('tags)) 211 | .collect() should equal(Array(Row(Map("building" -> "yes")))) 212 | } 213 | 214 | it("drops semi-interesting prefixed tags") { 215 | Seq( 216 | Map("highway" -> "motorway", "source:geometry" -> "MassGIS") 217 | ) 218 | .toDF("tags") 219 | .withColumn("tags", removeSemiInterestingTags('tags)) 220 | .collect() should equal(Array(Row(Map("highway" -> "motorway")))) 221 | } 222 | } 223 | 224 | } 225 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/sources/AugmentedDiffSourceTest.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.sources 2 | 3 | import geotrellis.vector.Geometry 4 | import org.apache.spark.internal.Logging 5 | import org.scalatest.{FunSpec, Matchers} 6 | import vectorpipe.TestEnvironment 7 | import vectorpipe.model.ElementWithSequence 8 | import vectorpipe.util.RobustFeature 9 | 10 | class AugmentedDiffSourceSpec extends FunSpec with TestEnvironment with Matchers { 11 | 12 | import ss.implicits._ 13 | 14 | describe("Timestamp to sequence conversion") { 15 | it("should provide a round trip for simple conversion") { 16 | AugmentedDiffSource.timestampToSequence(AugmentedDiffSource.sequenceToTimestamp(3700047)) should be (3700047) 17 | } 18 | 19 | it("should provide a round trip for column functions") { 20 | val df = ss.createDataset(Seq(3700047)).toDF 21 | (df.select(AugmentedDiffSource.sequenceToTimestamp('value) as 'time) 22 | .select(AugmentedDiffSource.timestampToSequence('time) as 'value) 23 | .first 24 | .getLong(0)) should be (3700047) 25 | } 26 | } 27 | 28 | } 29 | 30 | class 
LogErrors extends AugmentedDiffSourceErrorHandler with Logging { 31 | override def handle(sequence: Int, feature: RobustFeature[Geometry, ElementWithSequence]) = { 32 | logWarning(s"Error in sequence ${sequence} for feature with metadata: ${feature.data}") 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/vectortile/LayerTestPipeline.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.vectortile 2 | 3 | import geotrellis.vector._ 4 | import org.apache.spark.sql.DataFrame 5 | import org.apache.spark.sql.functions 6 | import org.apache.spark.sql.functions.when 7 | 8 | import vectorpipe._ 9 | import vectorpipe.functions.osm._ 10 | 11 | case class LayerTestPipeline(geometryColumn: String, baseOutputURI: java.net.URI) extends Pipeline with Pipeline.Output { 12 | val layerMultiplicity = LayerNamesInColumn("layers") 13 | 14 | override def select(wayGeoms: DataFrame, targetZoom: Int, keyColumn: String): DataFrame = { 15 | import wayGeoms.sparkSession.implicits._ 16 | 17 | wayGeoms 18 | .withColumn("layers", when(isBuilding('tags), "buildings").when(isRoad('tags), "roads")) 19 | .where(functions.not(functions.isnull('layers))) 20 | } 21 | 22 | override def clip(geom: Geometry, key: geotrellis.layer.SpatialKey, layoutLevel: geotrellis.layer.LayoutLevel): Geometry = 23 | Clipping.byLayoutCell(geom, key, layoutLevel) 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/vectortile/PipelineSpec.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.vectortile 2 | 3 | import org.apache.spark.sql.functions 4 | import org.apache.spark.sql.functions.{isnull, lit} 5 | import org.locationtech.geomesa.spark.jts._ 6 | import org.scalatest._ 7 | import vectorpipe.{TestEnvironment, internal => vp, _} 8 | 9 | class PipelineSpec extends FunSpec with TestEnvironment with Matchers { 10 | import ss.implicits._ 11 | 12 | ss.withJTS 13 | val orcFile = getClass.getResource("/isle-of-man-latest.osm.orc").getPath 14 | val df = ss.read.orc(orcFile) 15 | 16 | describe("Vectortile Pipelines") { 17 | val nodes = vp.preprocessNodes(df, None) 18 | 19 | val nodeGeoms = nodes 20 | .filter(functions.not(isnull('lat))) 21 | .withColumn("geometry", st_makePoint('lon, 'lat)) 22 | .drop("lat", "lon") 23 | .withColumn("weight", lit(1)) 24 | .cache 25 | 26 | val wayGeoms = vp.reconstructWayGeometries(df, nodes).cache 27 | 28 | it("should generate a single zoom level") { 29 | val pipeline = TestPipeline("geometry", new java.net.URI("file:///tmp/iom-tiles"), 16) 30 | VectorPipe(nodeGeoms, pipeline, VectorPipe.Options.forZoom(8)) 31 | } 32 | 33 | it("should generate multiple zoom levels") { 34 | val pipeline = TestPipeline("geometry", new java.net.URI("file:///tmp/iom-tiles-pyramid"), 16) 35 | VectorPipe(nodeGeoms, pipeline, VectorPipe.Options.forZoomRange(6, 8)) 36 | } 37 | 38 | it("should generate multiple layers") { 39 | val pipeline = LayerTestPipeline("geom", new java.net.URI("file:///tmp/iom-layers")) 40 | VectorPipe(wayGeoms, pipeline, VectorPipe.Options.forZoom(14)) 41 | } 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/vectortile/TestPipeline.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.vectortile 2 | 3 | import 
geotrellis.raster.RasterExtent 4 | import geotrellis.layer._ 5 | import geotrellis.vector._ 6 | import geotrellis.vectortile._ 7 | 8 | import org.apache.spark.sql.{DataFrame, Row} 9 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 10 | import org.apache.spark.sql.functions 11 | import org.apache.spark.sql.functions.{array, col, explode, sum} 12 | 13 | import vectorpipe._ 14 | 15 | case class Bin(x: Int, y: Int) 16 | object Bin { 17 | def apply(tup: (Int, Int)): Bin = Bin(tup._1, tup._2) 18 | } 19 | 20 | case class TestPipeline(geometryColumn: String, baseOutputURI: java.net.URI, gridResolution: Int) extends Pipeline with Pipeline.Output { 21 | val weightedCentroid = new WeightedCentroid 22 | 23 | val layerMultiplicity = SingleLayer("points") 24 | 25 | override def reduce(input: DataFrame, layoutLevel: LayoutLevel, keyColumn: String): DataFrame = { 26 | import input.sparkSession.implicits._ 27 | 28 | val layout = layoutLevel.layout 29 | val binOfTile = functions.udf { (g: Geometry, key: GenericRowWithSchema) => 30 | val pt = g.asInstanceOf[Point] 31 | val k = getSpatialKey(key) 32 | val re = RasterExtent(layout.mapTransform.keyToExtent(k), gridResolution, gridResolution) 33 | val c = pt.getCoordinate 34 | Bin(re.mapToGrid(c.x, c.y)) 35 | } 36 | 37 | val st_geomToPoint = functions.udf { g: Geometry => g.asInstanceOf[Point] } 38 | 39 | input.withColumn(keyColumn, explode(col(keyColumn))) 40 | .withColumn("bin", binOfTile(col(geometryColumn), col(keyColumn))) 41 | .groupBy(col(keyColumn), col("bin")) 42 | .agg(sum('weight) as 'weight, weightedCentroid(st_geomToPoint(col(geometryColumn)), 'weight) as geometryColumn) 43 | .drop('bin) 44 | .withColumn(keyColumn, array(col(keyColumn))) 45 | } 46 | 47 | override def pack(row: Row, zoom: Int): VectorTileFeature[Point] = { 48 | val g = row.getAs[Point](geometryColumn) 49 | val weight = row.getAs[Long]("weight") 50 | 51 | Feature(g, Map( "weight" -> VInt64(weight) )) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/scala/vectorpipe/vectortile/WeightedCentroid.scala: -------------------------------------------------------------------------------- 1 | package vectorpipe.vectortile 2 | 3 | import geotrellis.vector._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.expressions.MutableAggregationBuffer 6 | import org.apache.spark.sql.expressions.UserDefinedAggregateFunction 7 | import org.apache.spark.sql.jts.PointUDT 8 | import org.apache.spark.sql.types._ 9 | import org.locationtech.jts.geom.{Coordinate, GeometryFactory} 10 | 11 | class WeightedCentroid extends UserDefinedAggregateFunction { 12 | 13 | // Define the schema of the input data 14 | override def inputSchema: org.apache.spark.sql.types.StructType = 15 | StructType(StructField("point", PointUDT) :: StructField("weight", DoubleType) :: Nil) 16 | 17 | // Define the types of the intermediate data structure 18 | override def bufferSchema: StructType = StructType( 19 | StructField("x", DoubleType) :: StructField("y", DoubleType) :: StructField("weight", DoubleType) :: Nil 20 | ) 21 | 22 | // Define the return type 23 | override def dataType: DataType = PointUDT 24 | 25 | // Does the function return the same value for the same input? 
26 | override def deterministic: Boolean = true 27 | 28 | // Create a new, empty buffer structure 29 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 30 | buffer(0) = 0.0 31 | buffer(1) = 0.0 32 | buffer(2) = 0.0 33 | } 34 | 35 | // Combine a new input with an existing buffer 36 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 37 | val c = input.getAs[Point](0).getCoordinate 38 | val wt = input.getAs[Double](1) 39 | buffer(0) = buffer.getAs[Double](0) + c.x * wt 40 | buffer(1) = buffer.getAs[Double](1) + c.y * wt 41 | buffer(2) = buffer.getAs[Double](2) + wt 42 | } 43 | 44 | // Merge two intermediate buffers 45 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 46 | buffer1(0) = buffer1.getAs[Double](0) + buffer2.getAs[Double](0) 47 | buffer1(1) = buffer1.getAs[Double](1) + buffer2.getAs[Double](1) 48 | buffer1(2) = buffer1.getAs[Double](2) + buffer2.getAs[Double](2) 49 | } 50 | 51 | // Produce the final output from a Row encoded with the bufferSchema 52 | override def evaluate(buffer: Row): Any = { 53 | val wx = buffer.getDouble(0) 54 | val wy = buffer.getDouble(1) 55 | val wt = buffer.getDouble(2) 56 | (new GeometryFactory).createPoint(new Coordinate(wx/wt, wy/wt)) 57 | } 58 | } 59 | --------------------------------------------------------------------------------
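A minimal sketch (not taken from the repository) of exercising the WeightedCentroid UDAF above on its own, assuming the same SparkSession `ss` and GeoMesa JTS helpers (`withJTS`, `st_makePoint`) that the other test sources in this tree already use; the toy column names `x`, `y`, `weight`, and `geom` are illustrative only. Two points at x = 0 and x = 3 with weights 1 and 2 should reduce to the weighted mean POINT (2 0), since (1·0 + 2·3) / (1 + 2) = 2:

    import org.apache.spark.sql.functions.col
    import org.locationtech.geomesa.spark.jts._
    import vectorpipe.vectortile.WeightedCentroid

    // assumes an existing SparkSession `ss`, as provided by TestEnvironment
    ss.withJTS
    import ss.implicits._

    val weightedCentroid = new WeightedCentroid

    // build a tiny DataFrame of weighted points (weights as Double to match the UDAF's input schema)
    val pts = Seq((0.0, 0.0, 1.0), (3.0, 0.0, 2.0))
      .toDF("x", "y", "weight")
      .withColumn("geom", st_makePoint(col("x"), col("y")))

    // an empty groupBy collapses all rows into a single weighted centroid
    pts.groupBy()
      .agg(weightedCentroid(col("geom"), col("weight")) as "centroid")
      .show()
    // expected: POINT (2 0)

This mirrors how TestPipeline.scala applies the aggregator inside its `reduce` step, just without the tile-key grouping.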