├── .gitignore ├── LICENSE ├── README.md ├── buildspec.yml ├── deployment ├── batch │ ├── makefiles │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── config-aws.mk.template │ │ ├── config-run.mk.template │ │ └── scripts │ │ │ └── configurations.json │ └── terraform │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── README.md │ │ ├── cluster │ │ ├── .gitignore │ │ ├── aws.tf │ │ ├── cluster-configurations.json │ │ ├── emr.tf │ │ ├── outputs.tf │ │ ├── security-group.tf │ │ └── variables.tf │ │ └── tfvars.tpl ├── docker │ ├── Dockerfile.osm_apps │ ├── Dockerfile.osm_refresh │ ├── build-containers.sh │ ├── log4j.properties │ ├── refresh-views.sh │ └── sources.list ├── monitor-checkpoints.sh ├── sql │ ├── 01-countries.sql │ ├── 02-checkpoints.sql │ ├── 03-users.sql │ ├── 04-hashtags.sql │ ├── 05-errors.sql │ ├── 06-changesets.sql │ ├── 07-changesets_countries.sql │ ├── 08-changesets_hashtags.sql │ ├── README.md │ └── materialized_views │ │ ├── country_statistics.sql │ │ ├── hashtag_statistics.sql │ │ ├── hashtag_user_statistics.sql │ │ ├── refreshments.sql │ │ └── user_statistics.sql └── streaming │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── config-deployment.mk.template │ ├── ecs-params.yml │ └── scripts │ ├── batch-generate-edit-histograms.sh │ ├── batch-generate-footprints.sh │ ├── batch-process.sh │ ├── create-log-groups.sh │ ├── define-production-streaming-update-tasks.sh │ ├── define-production-view-refresher.sh │ ├── define-staging-streaming-update-tasks.sh │ ├── define-staging-view-refresher.sh │ ├── define-streaming-augdiff-producer.sh │ ├── define-streaming-vectortile-tasks.sh │ ├── deploy-stats-refresher.sh │ ├── emr-configurations │ └── batch-process.json │ ├── expand.sh │ ├── get-tag.sh │ ├── latest-history-to-orc.sh │ └── stop-streaming-service.sh ├── docker-compose.yml ├── notebooks ├── Footprint_test_messy_vectortiles.json ├── OSM_Ingest.json └── zeppelin │ ├── Counting road length.json │ ├── Debugging long running ingest step.json │ ├── Working with ORC 1.json │ ├── Working with ORC.json │ └── hashtags.json ├── project └── build.properties ├── scripts ├── cibuild └── cipublish └── src ├── .gitignore ├── .sbtopts ├── .scalafmt.conf ├── Dockerfile.apps ├── Dockerfile.refresh ├── analytics ├── .envrc ├── .gitignore ├── bin │ ├── apply.sh │ └── update-tiles ├── build.sbt ├── project │ └── build.properties ├── resources │ └── log4j.properties └── src │ ├── main │ ├── resources │ │ ├── countries.geojson │ │ └── log4j.properties │ └── scala │ │ └── osmesa │ │ └── analytics │ │ ├── Analytics.scala │ │ ├── Countries.scala │ │ ├── EditHistogram.scala │ │ ├── Footprints.scala │ │ ├── Implicits.scala │ │ ├── Resource.scala │ │ ├── S3Utils.scala │ │ ├── VectorGrid.scala │ │ ├── raster │ │ ├── MutableSparseIntTile.scala │ │ ├── SparseIntTile.scala │ │ └── package.scala │ │ ├── stats │ │ ├── ChangesetMetadataForeachWriter.scala │ │ ├── ChangesetStatsForeachWriter.scala │ │ ├── functions │ │ │ └── package.scala │ │ └── package.scala │ │ ├── updater │ │ ├── Implicits.scala │ │ ├── Schema.scala │ │ ├── TileUpdater.scala │ │ ├── package.scala │ │ └── schemas │ │ │ ├── History.scala │ │ │ ├── Snapshot.scala │ │ │ ├── Urchn.scala │ │ │ └── package.scala │ │ └── vectorgrid │ │ └── package.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── osmesa │ └── analytics │ └── CountriesTest.scala ├── apps ├── build.sbt └── src │ └── main │ └── scala │ └── osmesa │ └── apps │ ├── DbUtils.scala │ ├── batch │ ├── ChangesetMetadataCreator.scala │ ├── ChangesetStatsCreator.scala │ ├── 
EditHistogramTileCreator.scala │ ├── FacetedEditHistogramTileCreator.scala │ ├── FootprintCreator.scala │ └── MergeChangesets.scala │ └── streaming │ ├── ChangeStreamProcessor.scala │ ├── ChangesetMetadataUpdater.scala │ ├── ChangesetStatsUpdater.scala │ ├── EditHistogramTileUpdater.scala │ ├── FacetedEditHistogramTileUpdater.scala │ ├── HashtagFootprintUpdater.scala │ ├── MergedChangesetStreamProcessor.scala │ ├── StreamingChangesetMetadataUpdater.scala │ ├── StreamingChangesetStatsUpdater.scala │ ├── StreamingEditHistogramTileUpdater.scala │ ├── StreamingFacetedEditHistogramTileUpdater.scala │ ├── StreamingUserFootprintTileUpdater.scala │ └── UserFootprintUpdater.scala ├── bench └── src │ └── main │ └── scala │ └── osmesa │ ├── Bench.scala │ ├── MetresBench.scala │ └── SAXBench.scala ├── bm ├── build.sbt ├── src │ └── main │ │ └── scala │ │ └── osmesa │ │ └── bm │ │ ├── BuildingMatching.scala │ │ ├── Downsample.scala │ │ ├── GenerateVT.scala │ │ ├── Homography.scala │ │ ├── QuadTreePartitioner.scala │ │ ├── VertexMatching.scala │ │ ├── VertexProjection.scala │ │ └── VolumeMatching.scala └── view │ └── index.html ├── build.sbt ├── docker ├── log4j.properties └── refresh-views.sh ├── project ├── Dependencies.scala ├── Settings.scala ├── Version.scala ├── assembly.sbt ├── build.properties └── plugins.sbt └── sbt /.gitignore: -------------------------------------------------------------------------------- 1 | # GeoMesa Dist (to be copied in) # 2 | services/hbase/geomesa-hbase-dist.tar.gz 3 | services/geoserver/geomesa-hbase-dist.tar.gz 4 | 5 | # Project generated files # 6 | 7 | metastore_db 8 | third_party_sources 9 | derby.log 10 | 11 | # Test Data # 12 | src/test-data 13 | 14 | # AWS # 15 | 16 | *.pem 17 | 18 | # Operating System Files # 19 | 20 | *.DS_Store 21 | Thumbs.db 22 | 23 | # Build Files # 24 | 25 | bin 26 | target 27 | build/ 28 | .gradle 29 | 30 | # Eclipse Project Files # 31 | 32 | .classpath 33 | .project 34 | .settings 35 | 36 | # Vagrant 37 | 38 | .vagrant 39 | 40 | # Terraform 41 | deployment/terraform/.terraform 42 | deployment/terraform/terraform.tfvars 43 | .terraform.tfstate.lock.info 44 | *.tfvars 45 | *.tfstate 46 | *.tfplan 47 | *.tfstate.backup 48 | .terraform 49 | 50 | # Makefile configs 51 | *.mk 52 | 53 | # Ansible 54 | deployment/ansible/roles/azavea.* 55 | 56 | # Node and Webpack 57 | node_modules/ 58 | npm-debug.log 59 | pgw.communitymapping*.js 60 | pgw.communitymapping*.js.map 61 | vendor.bundle.js 62 | dist/ 63 | 64 | 65 | # IntelliJ IDEA Files # 66 | 67 | *.iml 68 | *.ipr 69 | *.iws 70 | *.idea 71 | 72 | # macOS 73 | .DS_Store 74 | 75 | # Emacs # 76 | 77 | .ensime 78 | \#*# 79 | *~ 80 | .#* 81 | 82 | *.orc 83 | *.jar 84 | 85 | # Temporary credentials 86 | emr/terraform/auth.json 87 | 88 | -------------------------------------------------------------------------------- /buildspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | install: 5 | runtime-versions: 6 | docker: 18 7 | java: openjdk11 8 | commands: 9 | - docker -v 10 | - java -version 11 | pre_build: 12 | commands: 13 | - echo "$DOCKER_HUB_ACCESS_TOKEN" | docker login --username "$DOCKER_HUB_USERNAME" --password-stdin 14 | build: 15 | commands: 16 | - mkdir -p /root/.sbt/launchers/1.3.0/ 17 | - curl -L -o /root/.sbt/launchers/1.3.0/sbt-launch.jar https://repo.scala-sbt.org/scalasbt/maven-releases/org/scala-sbt/sbt-launch/1.3.0/sbt-launch.jar 18 | - ls -lh /root/.sbt/launchers/1.3.0 19 | - ./scripts/cibuild 20 | - 
./scripts/cipublish 21 | artifacts: 22 | files: 23 | - osmesa-dist/**/* 24 | -------------------------------------------------------------------------------- /deployment/batch/makefiles/.gitignore: -------------------------------------------------------------------------------- 1 | cluster-id.txt 2 | last-step-id.txt 3 | -------------------------------------------------------------------------------- /deployment/batch/makefiles/config-aws.mk.template: -------------------------------------------------------------------------------- 1 | export AWS_DEFAULT_REGION:=us-east-1 2 | 3 | export PEM_FILE:=[PEM FILE] 4 | export EC2_KEY:=[EC2 KEY] 5 | export SUBNET_ID:=[EMR SUBNET] 6 | -------------------------------------------------------------------------------- /deployment/batch/makefiles/config-run.mk.template: -------------------------------------------------------------------------------- 1 | export S3_BUCKET:=[TARGET BUCKET] 2 | export S3_URI:=s3://${S3_BUCKET} 3 | export S3_CATALOG := ${S3_URI}/[TARGET_CATALOG] 4 | 5 | export PLANET_ORC := s3://osm-pds/planet-history/history-latest.orc 6 | export OSM_HISTORY := [OSH_ORC_FILENAME_URI] 7 | export OUTPUT_LOCATION := [TARGET_BUCKET] 8 | 9 | export ORC_CACHE_LOCATION := ${S3_URI}/cache 10 | export VECTORTILE_CATALOG_LOCATION = ${S3_URI}/vectortiles 11 | 12 | export CHANGESET_CSV := [URI of OSM CSV table dump] 13 | export CHANGESET_COMMENTS_CSV := [URI of OSM CSV table dump] 14 | export CHANGESET_TAGS_CSV := [URI of OSM CSV table dump] 15 | export USER_CSV := [URI of OSM CSV table dump] 16 | export CHANGESET_ORC_DEST := [S3 URI of ORC] 17 | 18 | export PLANET_HISTORY_PBF := [S3 URI of target planet history PBF] 19 | export PLANET_HISTORY_ORC_DIR := [S3 URI of converted planet history ORCs] 20 | -------------------------------------------------------------------------------- /deployment/batch/makefiles/scripts/configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "spark", 4 | "Properties": { 5 | "maximizeResourceAllocation": "false" 6 | } 7 | }, 8 | { 9 | "Classification": "spark-defaults", 10 | "Properties": { 11 | "spark.driver.maxResultSize": "3G", 12 | "spark.dynamicAllocation.enabled": "true", 13 | "spark.shuffle.service.enabled": "true", 14 | "spark.shuffle.compress": "true", 15 | "spark.shuffle.spill.compress": "true", 16 | "spark.rdd.compress": "true", 17 | "spark.executor.memoryOverhead": "1G", 18 | "spark.driver.memoryOverhead": "1G", 19 | "spark.driver.maxResultSize": "3G", 20 | "spark.executor.extraJavaOptions" : "-XX:+UseParallelGC -Dgeotrellis.s3.threads.rdd.write=64" 21 | } 22 | }, 23 | { 24 | "Classification": "hdfs-site", 25 | "Properties": { 26 | "dfs.replication": "1", 27 | "dfs.permissions": "false", 28 | "dfs.datanode.max.xcievers": "16384", 29 | "dfs.datanode.max.transfer.threads": "16384", 30 | "dfs.datanode.balance.max.concurrent.moves": "1000", 31 | "dfs.datanode.balance.bandwidthPerSec": "100000000" 32 | } 33 | }, 34 | { 35 | "Classification": "yarn-site", 36 | "Properties": { 37 | "yarn.resourcemanager.am.max-attempts": "1", 38 | "yarn.nodemanager.vmem-check-enabled": "false", 39 | "yarn.nodemanager.pmem-check-enabled": "false" 40 | } 41 | }, 42 | { 43 | "Classification": "hadoop-env", 44 | "Configurations": [ 45 | { 46 | "Classification": "export", 47 | "Properties": { 48 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 49 | "GDAL_DATA": "/usr/local/share/gdal", 50 | "LD_LIBRARY_PATH": "/usr/local/lib", 51 | "PYSPARK_PYTHON": "python27", 52 | 
"PYSPARK_DRIVER_PYTHON": "python27" 53 | } 54 | } 55 | ] 56 | }, 57 | { 58 | "Classification": "spark-env", 59 | "Configurations": [ 60 | { 61 | "Classification": "export", 62 | "Properties": { 63 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 64 | "GDAL_DATA": "/usr/local/share/gdal", 65 | "LD_LIBRARY_PATH": "/usr/local/lib", 66 | "SPARK_PRINT_LAUNCH_COMMAND": "1", 67 | "PYSPARK_PYTHON": "python27", 68 | "PYSPARK_DRIVER_PYTHON": "python27" 69 | } 70 | } 71 | ] 72 | }, 73 | { 74 | "Classification": "yarn-env", 75 | "Configurations": [ 76 | { 77 | "Classification": "export", 78 | "Properties": { 79 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 80 | "GDAL_DATA": "/usr/local/share/gdal", 81 | "LD_LIBRARY_PATH": "/usr/local/lib", 82 | "PYSPARK_PYTHON": "python27", 83 | "PYSPARK_DRIVER_PYTHON": "python27" 84 | } 85 | } 86 | ] 87 | } 88 | ] 89 | -------------------------------------------------------------------------------- /deployment/batch/terraform/.gitignore: -------------------------------------------------------------------------------- 1 | tfvars 2 | auth.json 3 | -------------------------------------------------------------------------------- /deployment/batch/terraform/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef AWS_PROFILE 3 | $(error AWS_PROFILE is not set) 4 | endif 5 | 6 | CLUSTER_ID ?= $(shell cd cluster && terraform output | grep emr-id | awk '{print $$NF}') 7 | MASTER_IP ?= $(shell cd cluster && terraform output | grep emr-master | awk '{print $$NF}') 8 | KEY_NAME ?= $(shell cd cluster && terraform output | grep key-name | awk '{print $$NF}') 9 | KEY_PATH ?= "~/.ssh/${KEY_NAME}.pem" 10 | 11 | # For circumvention of MFA when necessary 12 | AWS_ENV_VARS ?= AWS_ACCESS_KEY_ID=$(shell cat auth.json | jq -re '.Credentials.AccessKeyId') AWS_SECRET_ACCESS_KEY=$(shell cat auth.json | jq -re '.Credentials.SecretAccessKey') AWS_SESSION_TOKEN=$(shell cat auth.json | jq -re '.Credentials.SessionToken') 13 | 14 | # Get STS token to work around terraform's MFA difficulties 15 | auth.json: 16 | rm -rf auth.json 17 | cd cluster; aws \ 18 | --profile ${AWS_PROFILE} \ 19 | sts assume-role \ 20 | --role-arn="$(shell aws configure get --profile ${AWS_PROFILE} role_arn)" \ 21 | --role-session-name="power-user-session" > ../auth.json 22 | 23 | validate-cluster: auth.json 24 | cd cluster; $(AWS_ENV_VARS) terraform validate \ 25 | --var-file="../tfvars" \ 26 | -var "aws_profile=${AWS_PROFILE}" 27 | 28 | init-cluster: auth.json 29 | cd cluster; $(AWS_ENV_VARS) terraform init \ 30 | -var-file="../tfvars" \ 31 | -var "aws_profile=${AWS_PROFILE}" 32 | 33 | cluster-tfplan: auth.json 34 | cd cluster; $(AWS_ENV_VARS) terraform plan \ 35 | -var-file="../tfvars" \ 36 | -var "aws_profile=${AWS_PROFILE}" \ 37 | -out="cluster-tfplan" 38 | 39 | cluster: cluster-tfplan 40 | cd cluster; $(AWS_ENV_VARS) terraform apply "cluster-tfplan" 41 | 42 | ssh: auth.json 43 | $(AWS_ENV_VARS) aws emr ssh \ 44 | --cluster-id ${CLUSTER_ID} \ 45 | --key-pair-file ${KEY_PATH} 46 | 47 | proxy: 48 | ssh -i ${KEY_PATH} -ND 8157 hadoop@${MASTER_IP} 49 | 50 | destroy-cluster: auth.json 51 | cd cluster; $(AWS_ENV_VARS) terraform destroy \ 52 | -var-file="../tfvars" \ 53 | -var "aws_profile=${AWS_PROFILE}" 54 | 55 | osmesa.jar: 56 | echo TODO 57 | # cd ../src && sbt assembly 58 | # cp ../src/target/scala-2.11/osmesa-assembly-0.1.0.jar osmesa.jar 59 | 60 | upload-jar: osmesa.jar 61 | echo TODO 62 | # aws emr put --cluster-id ${CLUSTER_ID} --key-pair-file ${KEY_PATH} \ 63 | # --src 
osmesa.jar --dest /tmp/osmesa.jar 64 | 65 | 66 | print-vars: 67 | echo aws_profile: ${AWS_PROFILE} 68 | echo cluster_id: ${CLUSTER_ID} 69 | echo key_name: ${KEY_NAME} 70 | echo key_path: ${KEY_PATH} 71 | echo master_ip: ${MASTER_IP} 72 | echo env_vars: ${AWS_ENV_VARS} 73 | 74 | -------------------------------------------------------------------------------- /deployment/batch/terraform/README.md: -------------------------------------------------------------------------------- 1 | # OSMESA EMR 2 | 3 | This directory contains a Makefile to spin up an EMR cluster using [terraform](https://github.com/hashicorp/terraform). 4 | 5 | - [Requirements](#requirements) 6 | - [Makefile](#makefile) 7 | - [Running](#running) 8 | 9 | ## Requirements 10 | 11 | [Terraform 0.11.5](https://github.com/hashicorp/terraform/releases/tag/v0.11.5) 12 | 13 | ## Settings 14 | 15 | [cluster/variables.tf](cluster/variables.tf) contains the full set of variables 16 | which can be specified to modify an EMR deployment. Only those without 17 | provided defaults need to be specified, and these can be found within 18 | [tfvars.tpl](tfvars.tpl) - be sure to make a copy of this template and remove 19 | 'tpl' from the filename. 20 | 21 | 22 | ## Makefile 23 | 24 | | Command | Description | 25 | |-----------------------|------------------------------------------------------------| 26 | |auth.json |Generate temporary session and key/secret | 27 | |validate-cluster |`terraform validate` - Validate terraform | 28 | |init-cluster |`terraform init` - Initialize terraform | 29 | |cluster-tfplan |`terraform plan` - Plan out an 'apply' of this terraform | 30 | |cluster |`terraform apply` - Apply the generated plan to create the EMR cluster | 31 | |ssh |SSH into a running EMR cluster | 32 | |destroy-cluster |Destroy a running EMR cluster | 33 | |print-vars |Print out env vars for diagnostic and debug purposes | 34 | 35 | ## Running 36 | 37 | The Makefile in this directory provides commands to easily set up an EMR 38 | cluster with MFA, but doing so requires a minimal amount of configuration. 39 | You will need to export your desired AWS profile and to have 40 | set up `assume role` permissions and an MFA device for that 41 | exported profile. You'll also need to make a copy of tfvars.tpl for 42 | adding parameters specific to your deployment. 43 | 44 | ```bash 45 | export AWS_PROFILE=my_profile 46 | cp tfvars.tpl tfvars 47 | # update tfvars with values appropriate to the EMR cluster you'd like 48 | make auth.json 49 | make cluster 50 | ``` 51 | 52 | `make auth.json` will prompt you for your MFA key and produce a 53 | session which terraform can use to get around MFA restrictions. 54 | 55 | **Note:** long startup times (10 minutes or more) probably indicate that you have 56 | chosen a spot price that is too low. 57 | 58 | This basic cluster will have a running Zeppelin interface that can be accessed 59 | via SSH tunnel with 60 | [foxyproxy](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-connect-master-node-proxy.html). 61 | 62 | ![Zeppelin Welcome](./images/zeppelin-welcome.png) 63 | 64 | This cluster will not have access to any code until we upload the 65 | appropriate jars and register them within Zeppelin, which happens once 66 | 67 | ```bash 68 | make upload-assembly 69 | ``` 70 | 71 | is issued.
Upon doing so, you must configure Zeppelin to recognize this 72 | resource by going to the interpreters tab: 73 | 74 | ![Zeppelin interpreters](./images/zeppelin-interpreters.png) 75 | 76 | Edit the spark interpreter settings by adding the GeoTrellis jar into the 77 | class path (`make upload-assembly` copies the fat jar into, e.g., 78 | `/tmp/geotrellis-spark-etl-assembly-1.2.0-SNAPSHOT.jar`): 79 | 80 | ![Zeppelin interpreter edit](./images/zeppelin-interpreter-edit.png) 81 | 82 | You may then create a new notebook: 83 | 84 | ![Zeppelin Osmesa Notebook](./images/zeppelin-osmesa-notebook.png) 85 | 86 | wherein GeoTrellis deps can be imported: 87 | 88 | ![Zeppelin Osmesa example](./images/zeppelin-osmesa-example.png) 89 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform* 2 | terraform.tfstate* 3 | tfvars 4 | cluster-tfplan 5 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/aws.tf: -------------------------------------------------------------------------------- 1 | 2 | # Marks AWS as a resource provider. 3 | provider "aws" { 4 | profile = "${var.aws_profile}" 5 | region = "${var.aws_region}" 6 | } 7 | 8 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/cluster-configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "spark", 4 | "Properties": { 5 | "maximizeResourceAllocation": "true" 6 | } 7 | }, 8 | { 9 | "Classification": "spark-defaults", 10 | "Properties": { 11 | "spark.driver.maxResultSize": "3G", 12 | "spark.dynamicAllocation.enabled": "true", 13 | "spark.shuffle.service.enabled": "true", 14 | "spark.shuffle.compress": "true", 15 | "spark.shuffle.spill.compress": "true", 16 | "spark.rdd.compress": "true", 17 | "spark.yarn.executor.memoryOverhead": "1G", 18 | "spark.yarn.driver.memoryOverhead": "1G", 19 | "spark.driver.maxResultSize": "3G", 20 | "spark.executor.extraJavaOptions" : "-XX:+UseParallelGC -Dgeotrellis.s3.threads.rdd.write=64" 21 | } 22 | }, 23 | { 24 | "Classification": "yarn-site", 25 | "Properties": { 26 | "yarn.resourcemanager.am.max-attempts": "1", 27 | "yarn.nodemanager.vmem-check-enabled": "false", 28 | "yarn.nodemanager.pmem-check-enabled": "false" 29 | } 30 | } 31 | ] 32 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/emr.tf: -------------------------------------------------------------------------------- 1 | resource "aws_emr_cluster" "emr-spark-cluster" { 2 | name = "${var.user} - ${var.cluster_name}" 3 | applications = ["Hadoop", "Spark", "Ganglia", "Zeppelin"] 4 | release_label = "emr-5.8.0" 5 | service_role = "${var.emr_service_role}" 6 | 7 | ec2_attributes { 8 | instance_profile = "${var.emr_instance_profile}" 9 | key_name = "${var.key_name}" 10 | 11 | emr_managed_master_security_group = "${aws_security_group.emr-cluster.id}" 12 | emr_managed_slave_security_group = "${aws_security_group.emr-cluster.id}" 13 | } 14 | 15 | instance_group { 16 | instance_count = 1 17 | instance_role = "MASTER" 18 | instance_type = "${var.master_instance_type}" 19 | name = "emr-master" 20 | } 21 | 22 | instance_group { 23 | bid_price = "${var.bid_price}" 24 | instance_count = "${var.worker_count}" 25 | instance_role = "CORE" 
26 | instance_type = "${var.worker_instance_type}" 27 | name = "emr-worker" 28 | } 29 | 30 | configurations = "cluster-configurations.json" 31 | } 32 | 33 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/outputs.tf: -------------------------------------------------------------------------------- 1 | output "emr-id" { 2 | value = "${aws_emr_cluster.emr-spark-cluster.id}" 3 | } 4 | 5 | output "emr-master" { 6 | value = "${aws_emr_cluster.emr-spark-cluster.master_public_dns}" 7 | } 8 | 9 | output "key-name" { 10 | value = "${var.key_name}" 11 | } 12 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/security-group.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "emr-cluster" { 2 | ingress { 3 | from_port = 0 4 | to_port = 0 5 | protocol = "-1" 6 | self = true 7 | } 8 | 9 | ingress { 10 | from_port = "22" 11 | to_port = "22" 12 | protocol = "tcp" 13 | cidr_blocks = ["0.0.0.0/0"] 14 | } 15 | 16 | egress { 17 | from_port = 0 18 | to_port = 0 19 | protocol = "-1" 20 | cidr_blocks = ["0.0.0.0/0"] 21 | } 22 | 23 | lifecycle { 24 | create_before_destroy = true 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/variables.tf: -------------------------------------------------------------------------------- 1 | variable "aws_region" { 2 | type = "string" 3 | description = "AWS Region" 4 | default = "us-east-1" 5 | } 6 | 7 | variable "aws_profile" { 8 | type = "string" 9 | description = "AWS Profile" 10 | } 11 | 12 | variable "key_name" { 13 | type = "string" 14 | description = "The name of the EC2 secret key (primarily for SSH access)" 15 | } 16 | 17 | variable "worker_count" { 18 | type = "string" 19 | description = "The number of worker nodes" 20 | default = "1" 21 | } 22 | 23 | variable "emr_service_role" { 24 | type = "string" 25 | description = "EMR service role" 26 | default = "EMR_DefaultRole" 27 | } 28 | 29 | variable "emr_instance_profile" { 30 | type = "string" 31 | description = "EMR instance profile" 32 | default = "EMR_EC2_DefaultRole" 33 | } 34 | 35 | variable "bid_price" { 36 | type = "string" 37 | description = "Bid Price" 38 | default = "0.07" 39 | } 40 | 41 | variable "user" { 42 | default = "EMR" 43 | } 44 | 45 | variable "cluster_name" { 46 | default = "Testing" 47 | } 48 | 49 | variable "master_instance_type" { 50 | default = "m3.2xlarge" 51 | } 52 | 53 | variable "worker_instance_type" { 54 | default = "m3.xlarge" 55 | } 56 | -------------------------------------------------------------------------------- /deployment/batch/terraform/tfvars.tpl: -------------------------------------------------------------------------------- 1 | aws_region = "" 2 | 3 | key_name = "" 4 | 5 | worker_count = "" 6 | 7 | cluster_name = "" 8 | -------------------------------------------------------------------------------- /deployment/docker/Dockerfile.osm_apps: -------------------------------------------------------------------------------- 1 | FROM bde2020/spark-master:2.4.4-hadoop2.7 2 | 3 | COPY osmesa-apps.jar /opt/osmesa-apps.jar 4 | COPY log4j.properties /spark/conf/ 5 | COPY refresh-views.sh /usr/local/bin/refresh-views.sh 6 | 7 | WORKDIR /opt 8 | -------------------------------------------------------------------------------- /deployment/docker/Dockerfile.osm_refresh: 
-------------------------------------------------------------------------------- 1 | FROM alpine:3.12 2 | 3 | RUN apk update && apk add bash postgresql-client 4 | COPY refresh-views.sh /usr/local/bin/refresh-views.sh 5 | 6 | WORKDIR /opt 7 | -------------------------------------------------------------------------------- /deployment/docker/build-containers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "No version tag has been set. Do not run this script directly; instead, issue" 5 | echo " make build-container" 6 | echo "from the 'streaming' directory." 7 | exit 1 8 | else 9 | echo "Version tag is set to '${VERSION_TAG}'" 10 | fi 11 | 12 | set -xe 13 | SBT_DIR="../../src" 14 | JAR_DIR=${SBT_DIR}/apps/target/scala-2.11/ 15 | DOCKER_DIR=$(pwd) 16 | 17 | cp ${JAR_DIR}/osmesa-apps.jar ${DOCKER_DIR}/osmesa-apps.jar 18 | docker build -f Dockerfile.osm_apps --tag osm_apps:${VERSION_TAG} ${DOCKER_DIR} 19 | docker build -f Dockerfile.osm_refresh --tag osm_refresh:${VERSION_TAG} ${DOCKER_DIR} 20 | rm ${DOCKER_DIR}/osmesa-apps.jar 21 | -------------------------------------------------------------------------------- /deployment/docker/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.out 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | # log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c: %m%n 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 7 | log4j.logger.osmesa=DEBUG 8 | log4j.logger.vectorpipe=DEBUG -------------------------------------------------------------------------------- /deployment/docker/refresh-views.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "$(date -Iseconds): Starting view refreshment in $DATABASE_NAME" 4 | 5 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently user_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 6 | echo "$(date -Iseconds): Refreshing user statistics" 7 | # refresh in the background to return immediately 8 | psql -Aqt \ 9 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics" \ 10 | -c "UPDATE refreshments SET updated_at=now() where mat_view='user_statistics'" \ 11 | $DATABASE_URL & 12 | else 13 | echo "$(date -Iseconds): User stats table already refreshing" 14 | fi 15 | 16 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently hashtag_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 17 | echo "$(date -Iseconds): Refreshing hashtag statistics" 18 | # refresh in the background to return immediately 19 | psql -Aqt \ 20 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY hashtag_statistics" \ 21 | -c "UPDATE refreshments SET updated_at=now() where mat_view='hashtag_statistics'" \ 22 | $DATABASE_URL & 23 | else 24 | echo "$(date -Iseconds): Hashtag stats table already refreshing" 25 | fi 26 | 27 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently country_statistics%' and state='active' and datname='$DATABASE_NAME'" 
$DATABASE_URL 2> /dev/null)" == "0" ]; then 28 | # refresh in the background to return immediately 29 | echo "$(date -Iseconds): Refreshing country statistics" 30 | psql -Aqt \ 31 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY country_statistics" \ 32 | -c "UPDATE refreshments SET updated_at=now() where mat_view='country_statistics'" \ 33 | $DATABASE_URL & 34 | else 35 | echo "$(date -Iseconds): Country stats table already refreshing" 36 | fi 37 | 38 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently hashtag_user_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 39 | # refresh in the background to return immediately 40 | echo "$(date -Iseconds): Refreshing hashtag/user statistics" 41 | psql -Aqt \ 42 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY hashtag_user_statistics" \ 43 | -c "UPDATE refreshments SET updated_at=now() where mat_view='hashtag_user_statistics'" \ 44 | $DATABASE_URL & 45 | else 46 | echo "$(date -Iseconds): Hashtag/user stats table already refreshing" 47 | fi 48 | 49 | wait 50 | echo "$(date -Iseconds): Completed" 51 | -------------------------------------------------------------------------------- /deployment/docker/sources.list: -------------------------------------------------------------------------------- 1 | deb http://mirrors.linode.com/debian/ stretch main 2 | deb-src http://mirrors.linode.com/debian/ stretch main 3 | deb http://mirrors.linode.com/debian-security/ stretch/updates main 4 | deb-src http://mirrors.linode.com/debian-security/ stretch/updates main 5 | deb http://mirrors.linode.com/debian/ stretch-updates main 6 | deb-src http://mirrors.linode.com/debian/ stretch-updates main 7 | -------------------------------------------------------------------------------- /deployment/monitor-checkpoints.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Send email if ChangesetStatsUpdater is at least OFFSET_THRESHOLD minutes behind. 3 | # Requires that `mailx` be installed. 4 | # 5 | # Ensure the following env variables are set: 6 | # - DATABASE_URL: A valid postgres connection string 7 | # - ENVIRONMENT: A unique string describing the environment, usually "staging"|"production" 8 | # - FROM_EMAIL: Email address to send alert from 9 | # - TO_EMAIL: Email address to send alert to 10 | # - SMTP_HOSTNAME: Hostname of SMTP server to send mail to 11 | # Optional: 12 | # - OFFSET_THRESHOLD: Default 10. Offset in minutes to begin alerting at. 
13 | # - SMTP_PORT: Default 25 14 | 15 | set -e 16 | 17 | CHANGESET_CHECKPOINT=$(psql -Aqtc "select sequence from checkpoints where proc_name = 'ChangesetStatsUpdater'" $DATABASE_URL) 18 | EPOCH_NOW=$(date +%s) 19 | ADIFF_SEQUENCE_NOW=$(( (${EPOCH_NOW} - 1347432900) / 60 )) 20 | 21 | OFFSET=$(( ${ADIFF_SEQUENCE_NOW} - ${CHANGESET_CHECKPOINT} )) 22 | 23 | if (( ${OFFSET} >= ${OFFSET_THRESHOLD:-10} )); then 24 | echo "OSMesa ChangesetStatsUpdater in ${ENVIRONMENT} is behind by ${OFFSET}" | \ 25 | mailx \ 26 | -s "ALERT: OSMesa ChangesetStats Slow (${ENVIRONMENT})" \ 27 | -S smtp=smtp://${SMTP_HOSTNAME}:${SMTP_PORT:-25} \ 28 | -S from="${FROM_EMAIL}" \ 29 | "${TO_EMAIL}" 30 | fi 31 | -------------------------------------------------------------------------------- /deployment/sql/02-checkpoints.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE checkpoints ( 2 | proc_name text NOT NULL UNIQUE, 3 | sequence integer NOT NULL, 4 | PRIMARY KEY(proc_name) 5 | ); 6 | 7 | -------------------------------------------------------------------------------- /deployment/sql/03-users.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE users ( 2 | id integer NOT NULL, 3 | name text, 4 | PRIMARY KEY(id) 5 | ); 6 | -------------------------------------------------------------------------------- /deployment/sql/04-hashtags.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE hashtags ( 2 | id serial, 3 | hashtag text NOT NULL UNIQUE, 4 | PRIMARY KEY(id) 5 | ); 6 | 7 | CREATE UNIQUE INDEX ON hashtags (hashtag); 8 | 9 | -- support for LIKE queries on hashtags 10 | CREATE EXTENSION pg_trgm; 11 | CREATE INDEX trgm_idx_hashtags ON hashtags USING gin (hashtag gin_trgm_ops); 12 | -------------------------------------------------------------------------------- /deployment/sql/05-errors.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE errors ( 2 | id bigint NOT NULL, 3 | type smallint, 4 | sequence integer, 5 | tags jsonb, 6 | nds bigint[], 7 | changeset bigint, 8 | uid bigint, 9 | "user" text, 10 | updated timestamp with time zone, 11 | visible boolean, 12 | version integer, 13 | PRIMARY KEY(id) 14 | ); 15 | -------------------------------------------------------------------------------- /deployment/sql/06-changesets.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE changesets ( 2 | id bigint NOT NULL, 3 | measurements jsonb, 4 | counts jsonb, 5 | total_edits integer, 6 | editor text, 7 | user_id integer, 8 | created_at timestamp with time zone, 9 | closed_at timestamp with time zone, 10 | augmented_diffs integer[], 11 | updated_at timestamp with time zone, 12 | PRIMARY KEY(id) 13 | ); 14 | 15 | CREATE INDEX changesets_user_id ON changesets(user_id); 16 | 17 | CREATE INDEX changesets_created_at_index 18 | ON changesets (created_at); 19 | 20 | CREATE INDEX changesets_closed_at_index 21 | ON changesets (closed_at); 22 | 23 | CREATE INDEX changesets_updated_at_index 24 | ON changesets (updated_at); 25 | 26 | -------------------------------------------------------------------------------- /deployment/sql/07-changesets_countries.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE changesets_countries ( 2 | changeset_id integer NOT NULL 3 | CONSTRAINT changesets_countries_changesets_id_fk 4 | REFERENCES changesets, 5 | 
country_id integer NOT NULL 6 | CONSTRAINT changesets_countries_countries_id_fk 7 | REFERENCES countries, 8 | edit_count integer NOT NULL, 9 | augmented_diffs integer[], 10 | PRIMARY KEY(changeset_id, country_id) 11 | ); 12 | 13 | -- support joining on foreign keys (add index in reverse order of the primary key) 14 | CREATE INDEX changesets_countries_country_id_changeset_id_index 15 | ON changesets_countries (country_id, changeset_id); 16 | -------------------------------------------------------------------------------- /deployment/sql/08-changesets_hashtags.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE changesets_hashtags ( 2 | changeset_id integer NOT NULL 3 | CONSTRAINT changesets_hashtags_changesets_id_fk 4 | REFERENCES changesets, 5 | hashtag_id integer NOT NULL 6 | CONSTRAINT changesets_hashtags_hashtags_id_fk 7 | REFERENCES hashtags, 8 | PRIMARY KEY(changeset_id, hashtag_id) 9 | ); 10 | 11 | -- support joining on foreign keys (add index in reverse order of the primary key) 12 | CREATE INDEX changesets_hashtags_hashtag_id_changeset_id_index 13 | ON changesets_hashtags (hashtag_id, changeset_id); 14 | -------------------------------------------------------------------------------- /deployment/sql/README.md: -------------------------------------------------------------------------------- 1 | ## SQL Definitions 2 | 3 | This directory contains files with SQL definitions to set up a fresh OSMesa stats database. The files in this top level are the definitions for the required tables which are constructed by the batch ingest process and subsequently updated by the streaming tasks. The SQL files in the `materialized_views` directory are used to create aggregated summaries of the fundamental stats that are more useful for direct consumption by a user. These materialized views will not, however, automatically update; they will need to be refreshed periodically. 
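For reference, the refresh pattern used by `deployment/docker/refresh-views.sh` boils down to two statements per view; the sketch below shows them for `user_statistics` and assumes the unique index created at the end of that view's definition is in place (required for `CONCURRENTLY`):

```sql
-- Rebuild the view without blocking readers; CONCURRENTLY requires the
-- unique index defined in materialized_views/user_statistics.sql.
REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics;

-- Record the refresh time so consumers can see how fresh the view is.
UPDATE refreshments SET updated_at = now() WHERE mat_view = 'user_statistics';
```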
4 | -------------------------------------------------------------------------------- /deployment/sql/materialized_views/country_statistics.sql: -------------------------------------------------------------------------------- 1 | DROP MATERIALIZED VIEW IF EXISTS country_statistics; 2 | CREATE MATERIALIZED VIEW country_statistics AS 3 | WITH changesets AS ( 4 | SELECT 5 | * 6 | FROM changesets 7 | -- ignore users 0 and 1 8 | WHERE user_id > 1 9 | ), 10 | general AS ( 11 | SELECT 12 | country_id, 13 | max(coalesce(closed_at, created_at)) last_edit, 14 | count(*) changeset_count, 15 | sum(coalesce(edit_count, 0)) edit_count, 16 | max(updated_at) updated_at 17 | FROM changesets 18 | JOIN changesets_countries ON changesets.id = changesets_countries.changeset_id 19 | GROUP BY country_id 20 | ), 21 | processed_changesets AS ( 22 | SELECT 23 | id, 24 | user_id, 25 | country_id, 26 | measurements, 27 | counts, 28 | edit_count 29 | FROM changesets 30 | JOIN changesets_countries ON changesets.id = changesets_countries.changeset_id 31 | ), 32 | hashtag_counts AS ( 33 | SELECT 34 | RANK() OVER (PARTITION BY country_id ORDER BY sum(coalesce(edit_count, 0)) DESC) AS rank, 35 | country_id, 36 | hashtag, 37 | count(*) changesets, 38 | sum(coalesce(edit_count, 0)) edits 39 | FROM processed_changesets 40 | JOIN changesets_hashtags ON processed_changesets.id = changesets_hashtags.changeset_id 41 | JOIN hashtags ON changesets_hashtags.hashtag_id = hashtags.id 42 | GROUP BY country_id, hashtag 43 | ), 44 | hashtags AS ( 45 | SELECT 46 | country_id, 47 | jsonb_object_agg(hashtag, changesets) hashtag_changesets, 48 | jsonb_object_agg(hashtag, edits) hashtag_edits 49 | FROM hashtag_counts 50 | WHERE rank <= 10 51 | GROUP BY country_id 52 | ), 53 | user_counts AS ( 54 | SELECT 55 | RANK() OVER (PARTITION BY country_id ORDER BY sum(coalesce(edit_count, 0)) DESC) AS rank, 56 | country_id, 57 | user_id, 58 | count(*) changesets, 59 | sum(coalesce(edit_count, 0)) edits 60 | FROM processed_changesets 61 | GROUP BY country_id, user_id 62 | ), 63 | users AS ( 64 | SELECT 65 | country_id, 66 | jsonb_object_agg(user_id, changesets) user_changesets, 67 | jsonb_object_agg(user_id, edits) user_edits 68 | FROM user_counts 69 | WHERE rank <= 10 70 | GROUP BY country_id 71 | ), 72 | measurements AS ( 73 | SELECT 74 | id, 75 | country_id, 76 | key, 77 | value 78 | FROM processed_changesets 79 | CROSS JOIN LATERAL jsonb_each(measurements) 80 | ), 81 | aggregated_measurements_kv AS ( 82 | SELECT 83 | country_id, 84 | key, 85 | sum((value->>0)::numeric) AS value 86 | FROM measurements 87 | GROUP BY country_id, key 88 | ), 89 | aggregated_measurements AS ( 90 | SELECT 91 | country_id, 92 | jsonb_object_agg(key, value) measurements 93 | FROM aggregated_measurements_kv 94 | GROUP BY country_id 95 | ), 96 | counts AS ( 97 | SELECT 98 | id, 99 | country_id, 100 | key, 101 | value 102 | FROM processed_changesets 103 | CROSS JOIN LATERAL jsonb_each(counts) 104 | ), 105 | aggregated_counts_kv AS ( 106 | SELECT 107 | country_id, 108 | key, 109 | sum((value->>0)::numeric) AS value 110 | FROM counts 111 | GROUP BY country_id, key 112 | ), 113 | aggregated_counts AS ( 114 | SELECT 115 | country_id, 116 | jsonb_object_agg(key, value) counts 117 | FROM aggregated_counts_kv 118 | GROUP BY country_id 119 | ) 120 | SELECT 121 | general.country_id, 122 | countries.name country_name, 123 | countries.code country_code, 124 | -- NOTE these are per-changeset, not per-country, so stats are double-counted 125 | measurements, 126 | -- NOTE these are 
per-changeset, not per-country, so stats are double-counted 127 | counts, 128 | general.changeset_count, 129 | general.edit_count, 130 | general.last_edit, 131 | general.updated_at, 132 | user_changesets, 133 | user_edits, 134 | hashtag_changesets, 135 | hashtag_edits 136 | FROM general 137 | JOIN countries ON country_id = countries.id 138 | LEFT OUTER JOIN users USING (country_id) 139 | LEFT OUTER JOIN hashtags USING (country_id) 140 | LEFT OUTER JOIN aggregated_measurements USING (country_id) 141 | LEFT OUTER JOIN aggregated_counts USING (country_id); 142 | 143 | CREATE UNIQUE INDEX IF NOT EXISTS country_statistics_id ON country_statistics(country_code); 144 | -------------------------------------------------------------------------------- /deployment/sql/materialized_views/hashtag_statistics.sql: -------------------------------------------------------------------------------- 1 | DROP MATERIALIZED VIEW IF EXISTS hashtag_statistics; 2 | CREATE MATERIALIZED VIEW hashtag_statistics AS 3 | WITH general AS ( 4 | SELECT 5 | hashtag_id, 6 | max(coalesce(closed_at, created_at)) last_edit, 7 | count(*) changeset_count, 8 | sum(coalesce(total_edits, 0)) edit_count, 9 | max(updated_at) updated_at 10 | FROM changesets 11 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 12 | GROUP BY hashtag_id 13 | ), 14 | processed_changesets AS ( 15 | SELECT 16 | id, 17 | user_id, 18 | hashtag_id, 19 | measurements, 20 | counts, 21 | total_edits 22 | FROM changesets 23 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 24 | ), 25 | user_counts AS ( 26 | SELECT 27 | RANK() OVER (PARTITION BY hashtag_id ORDER BY sum(coalesce(total_edits, 0)) DESC) AS rank, 28 | hashtag_id, 29 | user_id, 30 | count(*) changesets, 31 | sum(coalesce(total_edits, 0)) edit_count 32 | FROM processed_changesets 33 | GROUP BY hashtag_id, user_id 34 | ), 35 | users AS ( 36 | SELECT 37 | hashtag_id, 38 | jsonb_object_agg(user_id, changesets) user_changesets, 39 | jsonb_object_agg(user_id, edit_count) user_edits 40 | FROM user_counts 41 | WHERE rank <= 10 42 | GROUP BY hashtag_id 43 | ), 44 | measurements AS ( 45 | SELECT 46 | id, 47 | hashtag_id, 48 | key, 49 | value 50 | FROM processed_changesets 51 | CROSS JOIN LATERAL jsonb_each(measurements) 52 | ), 53 | aggregated_measurements_kv AS ( 54 | SELECT 55 | hashtag_id, 56 | key, 57 | sum((value->>0)::numeric) AS value 58 | FROM measurements 59 | GROUP BY hashtag_id, key 60 | ), 61 | aggregated_measurements AS ( 62 | SELECT 63 | hashtag_id, 64 | jsonb_object_agg(key, value) measurements 65 | FROM aggregated_measurements_kv 66 | GROUP BY hashtag_id 67 | ), 68 | counts AS ( 69 | SELECT 70 | id, 71 | hashtag_id, 72 | key, 73 | value 74 | FROM processed_changesets 75 | CROSS JOIN LATERAL jsonb_each(counts) 76 | ), 77 | aggregated_counts_kv AS ( 78 | SELECT 79 | hashtag_id, 80 | key, 81 | sum((value->>0)::numeric) AS value 82 | FROM counts 83 | GROUP BY hashtag_id, key 84 | ), 85 | aggregated_counts AS ( 86 | SELECT 87 | hashtag_id, 88 | jsonb_object_agg(key, value) counts 89 | FROM aggregated_counts_kv 90 | GROUP BY hashtag_id 91 | ) 92 | SELECT 93 | hashtags.hashtag tag, 94 | general.hashtag_id, 95 | measurements, 96 | counts, 97 | general.changeset_count, 98 | general.edit_count, 99 | general.last_edit, 100 | general.updated_at, 101 | user_changesets, 102 | user_edits 103 | FROM general 104 | JOIN hashtags ON hashtag_id = hashtags.id 105 | LEFT OUTER JOIN users USING (hashtag_id) 106 | LEFT OUTER JOIN aggregated_measurements USING 
(hashtag_id) 107 | LEFT OUTER JOIN aggregated_counts USING (hashtag_id); 108 | 109 | CREATE UNIQUE INDEX IF NOT EXISTS hashtag_statistics_hashtag_id ON hashtag_statistics(hashtag_id); -------------------------------------------------------------------------------- /deployment/sql/materialized_views/hashtag_user_statistics.sql: -------------------------------------------------------------------------------- 1 | DROP MATERIALIZED VIEW IF EXISTS hashtag_user_statistics; 2 | CREATE MATERIALIZED VIEW hashtag_user_statistics AS 3 | WITH general AS ( 4 | SELECT 5 | user_id, 6 | hashtag_id, 7 | array_agg(id) changesets, 8 | max(coalesce(closed_at, created_at)) last_edit, 9 | count(*) changeset_count, 10 | sum(coalesce(total_edits, 0)) edit_count, 11 | max(updated_at) updated_at 12 | FROM changesets 13 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 14 | GROUP BY user_id, hashtag_id 15 | ), 16 | measurements AS ( 17 | SELECT 18 | id, 19 | user_id, 20 | hashtag_id, 21 | key, 22 | value 23 | FROM changesets 24 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 25 | CROSS JOIN LATERAL jsonb_each(measurements) 26 | ), 27 | aggregated_measurements_kv AS ( 28 | SELECT 29 | user_id, 30 | hashtag_id, 31 | key, 32 | sum((value->>0)::numeric) AS value 33 | FROM measurements 34 | GROUP BY user_id, hashtag_id, key 35 | ), 36 | aggregated_measurements AS ( 37 | SELECT 38 | user_id, 39 | hashtag_id, 40 | jsonb_object_agg(key, value) measurements 41 | FROM aggregated_measurements_kv 42 | GROUP BY user_id, hashtag_id 43 | ), 44 | counts AS ( 45 | SELECT 46 | id, 47 | user_id, 48 | hashtag_id, 49 | key, 50 | value 51 | FROM changesets 52 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 53 | CROSS JOIN LATERAL jsonb_each(counts) 54 | ), 55 | aggregated_counts_kv AS ( 56 | SELECT 57 | user_id, 58 | hashtag_id, 59 | key, 60 | sum((value->>0)::numeric) AS value 61 | FROM counts 62 | GROUP BY user_id, hashtag_id, key 63 | ), 64 | aggregated_counts AS ( 65 | SELECT 66 | user_id, 67 | hashtag_id, 68 | jsonb_object_agg(key, value) counts 69 | FROM aggregated_counts_kv 70 | GROUP BY user_id, hashtag_id 71 | ) 72 | SELECT 73 | user_id, 74 | users.name, 75 | general.hashtag_id, 76 | hashtags.hashtag, 77 | measurements, 78 | counts, 79 | last_edit, 80 | changeset_count, 81 | edit_count, 82 | updated_at 83 | FROM general 84 | LEFT OUTER JOIN hashtags ON general.hashtag_id = hashtags.id 85 | LEFT OUTER JOIN aggregated_measurements USING (user_id, hashtag_id) 86 | LEFT OUTER JOIN aggregated_counts USING (user_id, hashtag_id) 87 | JOIN users ON user_id = users.id; 88 | 89 | CREATE UNIQUE INDEX IF NOT EXISTS hashtag_user_statistics_pk ON hashtag_user_statistics(hashtag_id, user_id); -------------------------------------------------------------------------------- /deployment/sql/materialized_views/refreshments.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE refreshments ( 2 | mat_view text NOT NULL, 3 | updated_at timestamp with time zone, 4 | PRIMARY KEY(mat_view) 5 | ); 6 | 7 | INSERT INTO refreshments VALUES ('user_statistics', to_timestamp(0)), ('country_statistics', to_timestamp(0)), ('hashtag_statistics', to_timestamp(0)), ('hashtag_user_statistics', to_timestamp(0)); -------------------------------------------------------------------------------- /deployment/sql/materialized_views/user_statistics.sql: -------------------------------------------------------------------------------- 1 | 
DROP MATERIALIZED VIEW IF EXISTS user_statistics; 2 | CREATE MATERIALIZED VIEW user_statistics AS 3 | WITH general AS ( 4 | SELECT 5 | user_id, 6 | array_agg(id) changesets, 7 | max(coalesce(closed_at, created_at)) last_edit, 8 | count(*) changeset_count, 9 | sum(coalesce(total_edits, 0)) edit_count, 10 | max(updated_at) updated_at 11 | FROM changesets 12 | GROUP BY user_id 13 | ), 14 | country_counts AS ( 15 | SELECT 16 | user_id, 17 | code, 18 | count(*) changesets, 19 | sum(coalesce(total_edits, 0)) edits 20 | FROM changesets 21 | JOIN changesets_countries ON changesets.id = changesets_countries.changeset_id 22 | JOIN countries ON changesets_countries.country_id = countries.id 23 | GROUP BY user_id, code 24 | ), 25 | countries AS ( 26 | SELECT 27 | user_id, 28 | jsonb_object_agg(code, changesets) country_changesets, 29 | jsonb_object_agg(code, edits) country_edits 30 | FROM country_counts 31 | GROUP BY user_id 32 | ), 33 | edit_day_counts AS ( 34 | SELECT 35 | user_id, 36 | date_trunc('day', coalesce(closed_at, created_at))::date AS day, 37 | count(*) changesets, 38 | sum(coalesce(total_edits, 0)) edits 39 | FROM changesets 40 | WHERE coalesce(closed_at, created_at) IS NOT NULL 41 | GROUP BY user_id, day 42 | ), 43 | edit_days AS ( 44 | SELECT 45 | user_id, 46 | jsonb_object_agg(day, changesets) day_changesets, 47 | jsonb_object_agg(day, edits) day_edits 48 | FROM edit_day_counts 49 | GROUP BY user_id 50 | ), 51 | editor_counts AS ( 52 | SELECT 53 | RANK() OVER (PARTITION BY user_id ORDER BY sum(coalesce(total_edits, 0)) DESC) AS rank, 54 | user_id, 55 | editor, 56 | count(*) changesets, 57 | sum(coalesce(total_edits, 0)) edits 58 | FROM changesets 59 | WHERE editor IS NOT NULL 60 | GROUP BY user_id, editor 61 | ), 62 | editors AS ( 63 | SELECT 64 | user_id, 65 | jsonb_object_agg(editor, changesets) editor_changesets, 66 | jsonb_object_agg(editor, edits) editor_edits 67 | FROM editor_counts 68 | WHERE rank <= 10 69 | GROUP BY user_id 70 | ), 71 | hashtag_counts AS ( 72 | SELECT 73 | RANK() OVER (PARTITION BY user_id ORDER BY sum(coalesce(total_edits, 0)) DESC) AS rank, 74 | user_id, 75 | hashtag, 76 | count(*) changesets, 77 | sum(coalesce(total_edits)) edits 78 | FROM changesets 79 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 80 | JOIN hashtags ON changesets_hashtags.hashtag_id = hashtags.id 81 | GROUP BY user_id, hashtag 82 | ), 83 | hashtags AS ( 84 | SELECT 85 | user_id, 86 | jsonb_object_agg(hashtag, changesets) hashtag_changesets, 87 | jsonb_object_agg(hashtag, edits) hashtag_edits 88 | FROM hashtag_counts 89 | WHERE rank <= 50 90 | GROUP BY user_id 91 | ), 92 | measurements AS ( 93 | SELECT 94 | id, 95 | user_id, 96 | key, 97 | value 98 | FROM changesets 99 | CROSS JOIN LATERAL jsonb_each(measurements) 100 | ), 101 | aggregated_measurements_kv AS ( 102 | SELECT 103 | user_id, 104 | key, 105 | sum((value->>0)::numeric) AS value 106 | FROM measurements 107 | GROUP BY user_id, key 108 | ), 109 | aggregated_measurements AS ( 110 | SELECT 111 | user_id, 112 | jsonb_object_agg(key, value) measurements 113 | FROM aggregated_measurements_kv 114 | GROUP BY user_id 115 | ), 116 | counts AS ( 117 | SELECT 118 | id, 119 | user_id, 120 | key, 121 | value 122 | FROM changesets 123 | CROSS JOIN LATERAL jsonb_each(counts) 124 | ), 125 | aggregated_counts_kv AS ( 126 | SELECT 127 | user_id, 128 | key, 129 | sum((value->>0)::numeric) AS value 130 | FROM counts 131 | GROUP BY user_id, key 132 | ), 133 | aggregated_counts AS ( 134 | SELECT 135 | user_id, 136 | 
jsonb_object_agg(key, value) counts 137 | FROM aggregated_counts_kv 138 | GROUP BY user_id 139 | ) 140 | SELECT 141 | user_id AS id, 142 | users.name, 143 | measurements, 144 | counts, 145 | last_edit, 146 | changeset_count, 147 | edit_count, 148 | editor_changesets, 149 | editor_edits, 150 | day_changesets, 151 | day_edits, 152 | country_changesets, 153 | country_edits, 154 | hashtag_changesets, 155 | hashtag_edits, 156 | updated_at 157 | FROM general 158 | LEFT OUTER JOIN countries USING (user_id) 159 | LEFT OUTER JOIN editors USING (user_id) 160 | LEFT OUTER JOIN edit_days USING (user_id) 161 | LEFT OUTER JOIN hashtags USING (user_id) 162 | LEFT OUTER JOIN aggregated_measurements USING (user_id) 163 | LEFT OUTER JOIN aggregated_counts USING (user_id) 164 | JOIN users ON user_id = users.id; 165 | 166 | CREATE UNIQUE INDEX IF NOT EXISTS user_statistics_id ON user_statistics(id); 167 | -------------------------------------------------------------------------------- /deployment/streaming/.gitignore: -------------------------------------------------------------------------------- 1 | config-*.mk 2 | -------------------------------------------------------------------------------- /deployment/streaming/README.md: -------------------------------------------------------------------------------- 1 | # Streaming Stats Deployment via AWS ECS 2 | 3 | Amazon ECS is a system for deploying containers on top of AWS managed 4 | infrastructure. ECS is the deployment strategy we've provided resources 5 | for and would suggest because failures and even the hardware hiccups 6 | (say, the loss of a machine) will be automatically resolved so that 7 | the stream can get back to work. In conjunction with a checkpointing 8 | mechanism which ensures the stream starts near where it left off, these 9 | streams are highly resilient. 10 | 11 | An ECS deployment consists of a few different pieces: 12 | 13 | - The ECS cluster: scales EC2 instances up and down as necessary 14 | - Services: describe long-running programs that should maintain availability 15 | - Tasks: one or more containerized processes being run by the cluster 16 | - Containers: docker images uploaded to AWS ECR to be pulled upon each task creation 17 | 18 | The long-running stream which keeps statistics up to date by 19 | continuously polling Overpass augmented diffs and OSM changesets is 20 | deployed as an ECS cluster. This cluster has a service that tracks 21 | a lone streaming task and reboots the stream from the latest saved 22 | checkpoint (which lives on the table 'checkpoints' in the database being 23 | updated) to ensure that failures aren't fatal to the long-running 24 | process. 25 | 26 | Our ECS deployment process relies on the use of the `ecs-cli` tool, which is 27 | similar in spirit to `docker-compose`, but manages containers on ECS instead 28 | of on a local docker instance. You can install `ecs-cli` by issuing the 29 | command 30 | ```bash 31 | curl -o /usr/local/bin/ecs-cli https://s3.amazonaws.com/amazon-ecs-cli/ecs-cli-linux-amd64-latest 32 | ``` 33 | 34 | ## Deployment Steps 35 | 36 | 1. Copy `config-aws.mk.example` to `config-aws.mk` and 37 | `config-local.mk.example` to `config-local.mk`. These can be configured in a 38 | moment. 39 | 40 | 2. Build the osm_apps container 41 | 42 | ```bash 43 | make build-container 44 | ``` 45 | 46 | 3. Create an IAM role for EC2 instances. This will become `INSTANCE_ROLE`. 47 | The "AmazonEC2ContainerServiceforEC2Role" policy should be attached. 48 | 49 | 4. 
Edit [config-aws.mk.tpl](./config-aws.mk.tpl) with variables appropriate 50 | to your AWS account and desired deployment (choose VPCs, Security Groups, 51 | etc) and save the file as `config-aws.mk` Note that you will need to provide 52 | an ECR repo URI (which you'll have to set up manually via the AWS console) in 53 | order to use your container on AWS. 54 | 55 | 5. Manually create an ECS cluster backed by EC2 instances (not fargate), and 56 | be sure to record the cluster name in `config-aws.mk`. It should now be 57 | possible to configure ECS-CLI to deploy services against your cluster: 58 | 59 | ```bash 60 | make configure-cluster 61 | ``` 62 | 63 | 6. Assuming all's well, you're ready to deploy. Update the docker-compose 64 | which defines your services with the appropriate variables: 65 | 66 | ```bash 67 | make docker-compose.deploy.yml 68 | ``` 69 | 70 | 7. Push your image to ECR: 71 | 72 | ```bash 73 | make push-image 74 | ``` 75 | 76 | 8. Bring the cluster up: 77 | 78 | ```bash 79 | make cluster-up 80 | ``` 81 | 82 | 9. Deploy the service (this will create new task definitions as necessary): 83 | 84 | ```bash 85 | make start-service 86 | ``` 87 | 88 | ### Updating an Existing ECS Service 89 | 90 | If there is already a streaming task running on an ECS cluster that needs to 91 | be updated, then the procedure above can be abbreviated. Please perform steps 92 | 1, 2, 4, 7, and 9. 93 | 94 | ## Local Testing 95 | 96 | From a clean environment, 97 | 98 | 1. In the `deployment/streaming` directory, update the missing values in 99 | `config-local.mk.example` (LOCAL_AUGDIFF_SOURCE, LOCAL_AUGDIFF_START, 100 | LOCAL_CHANGE_START, LOCAL_CHANGESET_START) and save to `config-local.mk`. 101 | 102 | 2. In the same directory, ensure that `config-aws.mk` exists. You may `touch` 103 | the file if it does not. 104 | 105 | 3. Execute `make build-container` followed by `make start-local`. You should 106 | observe a stream of log messages. Any errors should appear in this window. 107 | 108 | 4. If you want to verify that the system is operating up to spec, you may 109 | ```bash 110 | docker exec -it streaming_db_1 bash 111 | psql -U postgres 112 | ``` 113 | (The trailing "1" may need to be incremented. See `docker ps` for the proper 114 | name.) From there, you may issue a `\d` directive and verify that the DB is 115 | populated with the correct tables. 116 | 117 | 5. You may now test the operation of the system in the DB interface by issuing 118 | queries against the available tables and observing the log output. The 119 | content of the tables will update as the system runs. 
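As a rough illustration of that check, a couple of queries against the tables defined under `deployment/sql` (table and column names as created by those scripts) might look like:

```sql
-- Confirm the streaming updaters are recording augmented diff checkpoints.
SELECT proc_name, sequence FROM checkpoints ORDER BY proc_name;

-- Watch rows accumulate as changesets are processed by the stream.
SELECT count(*) AS changeset_rows, max(updated_at) AS most_recent FROM changesets;
```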
120 | -------------------------------------------------------------------------------- /deployment/streaming/config-deployment.mk.template: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # AWS properties 3 | ################################################################################ 4 | export KEYPAIR := 5 | export SUBNET := 6 | export AWS_REGION := us-east-1 7 | export IAM_ACCOUNT := 8 | 9 | ################################################################################ 10 | # Streaming resource definitions 11 | ################################################################################ 12 | export STREAMING_INSTANCE_TYPE := m4.xlarge 13 | export ECR_STATS_IMAGE := 14 | export ECR_REFRESH_IMAGE := 15 | export AWS_LOG_GROUP := streaming-stats-updater 16 | export ECS_SUBNET := ${SUBNET} 17 | export ECS_SECURITY_GROUP := 18 | 19 | export CLUSTER_NAME_DEPLOYMENT := 20 | export CLUSTER_NAME_STAGING := 21 | 22 | export NODE_OPTIONS=--max-old-space-size=7168 23 | export DRIVER_MEMORY=8192m 24 | export ECS_MEMORY_GB=8 25 | export AUGDIFF_ECS_MEMORY_GB=8 26 | 27 | export AUGDIFF_SERVICE_NAME := "azavea-overpass-diff-publisher" 28 | export AUGDIFF_ECR_IMAGE := 29 | export AUGDIFF_SOURCE := 30 | export ONRAMP_URL := 31 | export OVERPASS_URL := 32 | export CHANGESET_SOURCE := 33 | export CHANGE_SOURCE := 34 | 35 | export DB_BASE_URI := 36 | export PRODUCTION_DB := 37 | export STAGING_DB := 38 | 39 | export NETWORK_CONFIGURATION="{\"awsvpcConfiguration\": {\"subnets\": [\"${ECS_SUBNET}\"], \"securityGroups\": [\"${ECS_SECURITY_GROUP}\"], \"assignPublicIp\": \"DISABLED\"}}" 40 | 41 | # Uncomment the following to raise resource allocations to get past a large changeset 42 | #export TURBO_BOOST := yes 43 | 44 | ################################################################################ 45 | # Batch resource definitions 46 | ################################################################################ 47 | export MASTER_SECURITY_GROUP := 48 | export WORKER_SECURITY_GROUP := 49 | export SERVICE_ACCESS_SG := 50 | export SANDBOX_SG := 51 | 52 | export S3_ROOT_URI := 53 | export S3_LOG_URI := ${S3_ROOT_URI}/logs/ 54 | 55 | export BATCH_CORE_INSTANCE_TYPE := m4.xlarge 56 | export BATCH_MASTER_INSTANCE_TYPE := m4.xlarge 57 | export OSMESA_APPS_JAR := s3:///osmesa-apps.jar 58 | 59 | export PLANET_HISTORY_PBF := 60 | export PLANET_HISTORY_ORC_DIR := 61 | export HISTORY_ORC := 62 | 63 | export CHANGESET_CSV := 64 | export CHANGESET_COMMENTS_CSV := 65 | export CHANGESET_TAGS_CSV := 66 | export USERS_CSV := 67 | export CHANGESETS_ORC := 68 | 69 | export FOOTPRINT_VT_LOCATION := 70 | export HISTOGRAM_VT_LOCATION := 71 | 72 | export MAX_PG_CONNECTIONS := 64 73 | 74 | # Uncomment the following line to save the processed stats 75 | # export STATS_SNAPSHOT_ORC := 76 | 77 | # Uncomment the following line to use the above snapshot in lieu of recalculating from history; setting to "no" will not turn feature off 78 | # export USE_SNAPSHOT := yes 79 | -------------------------------------------------------------------------------- /deployment/streaming/ecs-params.yml: -------------------------------------------------------------------------------- 1 | # this file should be in deployment dir (relative to Makefile path) 2 | # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html 3 | # NOTE: comment it out for the test case 4 | version: 1 5 | 
task_definition: 6 | services: 7 | changeset-stream: 8 | mem_reservation: 2048m 9 | augdiff-stream: 10 | mem_reservation: 2048m 11 | user-footprint-updater: 12 | mem_reservation: 4096m 13 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/batch-generate-edit-histograms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws emr create-cluster \ 9 | --applications Name=Ganglia Name=Spark \ 10 | --ebs-root-volume-size 10 \ 11 | --ec2-attributes '{ 12 | "KeyName": "${KEYPAIR}", 13 | "InstanceProfile":"EMR_EC2_DefaultRole", 14 | "ServiceAccessSecurityGroup": "${SERVICE_ACCESS_SECURITY_GROUP}", 15 | "SubnetId": "${SUBNET}", 16 | "EmrManagedSlaveSecurityGroup": "${EMR_SLAVE_SECURITY_GROUP}", 17 | "EmrManagedMasterSecurityGroup": "${EMR_MASTER_SECURITY_GROUP}" 18 | }' \ 19 | --service-role EMR_DefaultRole \ 20 | --release-label emr-5.19.0 \ 21 | --name 'Faceted State of the Data tile generation' \ 22 | --instance-groups '[ 23 | { 24 | "InstanceCount": 1, 25 | "BidPrice": "OnDemandPrice", 26 | "InstanceGroupType": "MASTER", 27 | "InstanceType": "${BATCH_MASTER_INSTANCE_TYPE}", 28 | "Name":"Master" 29 | }, { 30 | "InstanceCount": 20, 31 | "BidPrice": "OnDemandPrice", 32 | "InstanceGroupType": "CORE", 33 | "InstanceType": "${BATCH_CORE_INSTANCE_TYPE}", 34 | "Name":"Workers" 35 | } 36 | ]' \ 37 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 38 | --auto-terminate \ 39 | --region us-east-1 \ 40 | --steps '[ 41 | { 42 | "Args": [ 43 | "spark-submit", 44 | "--deploy-mode", "cluster", 45 | "--class", "osmesa.apps.batch.FacetedEditHistogramTileCreator", 46 | "--conf", "spark.executor.memoryOverhead=2g", 47 | "--conf", "spark.sql.shuffle.partitions=2000", 48 | "--conf", "spark.speculation=true", 49 | "${OSMESA_APPS_JAR}", 50 | "--history", "${HISTORY_ORC}", 51 | "--out", "${HISTOGRAM_VT_LOCATION}" 52 | ], 53 | "Type": "CUSTOM_JAR", 54 | "ActionOnFailure": "TERMINATE_CLUSTER", 55 | "Jar": "command-runner.jar", 56 | "Properties": "", 57 | "Name": "FacetedEditHistogramTileCreator" 58 | } 59 | ]' 60 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/batch-generate-footprints.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | aws emr create-cluster \ 9 | --applications Name=Ganglia Name=Spark \ 10 | --ebs-root-volume-size 10 \ 11 | --ec2-attributes '{ 12 | "KeyName": "${KEYPAIR}", 13 | "InstanceProfile":"EMR_EC2_DefaultRole", 14 | "ServiceAccessSecurityGroup": "${SERVICE_ACCESS_SECURITY_GROUP}", 15 | "SubnetId": "${SUBNET}", 16 | "EmrManagedSlaveSecurityGroup": "${EMR_SLAVE_SECURITY_GROUP}", 17 | "EmrManagedMasterSecurityGroup": "${EMR_MASTER_SECURITY_GROUP}" 18 | }' \ 19 | --service-role EMR_DefaultRole \ 20 | --release-label emr-5.19.0 \ 21 | --name 'User footprint tile generation' \ 22 | --instance-groups '[ 23 | { 24 | "InstanceCount": 1, 25 | "BidPrice": "OnDemandPrice", 26 | "InstanceGroupType": "MASTER", 27 | "InstanceType": "${BATCH_MASTER_INSTANCE_TYPE}", 28 | "Name":"Master" 29 | }, { 30 | "InstanceCount": 20, 31 | "BidPrice": "OnDemandPrice", 32 | "InstanceGroupType": "CORE", 33 | "InstanceType": "${BATCH_CORE_INSTANCE_TYPE}", 34 | "Name":"Workers" 35 | } 36 | ]' \ 37 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 38 | --auto-terminate \ 39 | --region ${AWS_REGION} \ 40 | --steps '[ 41 | { 42 | "Args": [ 43 | "spark-submit", 44 | "--deploy-mode", "cluster", 45 | "--class", "osmesa.apps.batch.FootprintCreator", 46 | "--conf", "spark.executor.memoryOverhead=2g", 47 | "--conf", "spark.sql.shuffle.partitions=2000", 48 | "--conf", "spark.speculation=true", 49 | "${OSMESA_APPS_JAR}", 50 | "--history", "${HISTORY_ORC}", 51 | "--changesets", "${CHANGESETS_ORC}", 52 | "--out", "${FOOTPRINT_VT_LOCATION}", 53 | "--type", "users", 54 | ], 55 | "Type": "CUSTOM_JAR", 56 | "ActionOnFailure": "TERMINATE_CLUSTER", 57 | "Jar": "command-runner.jar", 58 | "Properties": "", 59 | "Name": "FootprintCreator" 60 | } 61 | ]' 62 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/batch-process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | CLUSTER_NAME=$1 9 | NUM_EXECUTORS=$2 10 | 11 | shift 2 12 | 13 | ARGS= 14 | while [ "$#" -gt 1 ] ; do 15 | ARGS="$ARGS 16 | { 17 | \"Args\": $2, 18 | \"Type\": \"CUSTOM_JAR\", 19 | \"ActionOnFailure\": \"CONTINUE\", 20 | \"Jar\": \"command-runner.jar\", 21 | \"Properties\": \"\", 22 | \"Name\": \"$1\" 23 | }" 24 | if [ "$#" -gt 2 ]; then 25 | ARGS="$ARGS," 26 | fi 27 | shift 2 28 | done 29 | 30 | set -x 31 | aws emr create-cluster \ 32 | --applications Name=Ganglia Name=Spark Name=Hive \ 33 | --log-uri ${S3_LOG_URI} \ 34 | --configurations "file://scripts/emr-configurations/batch-process.json" \ 35 | --ebs-root-volume-size 10 \ 36 | --ec2-attributes "{ 37 | \"KeyName\": \"${KEYPAIR}\", 38 | \"InstanceProfile\":\"EMR_EC2_DefaultRole\", 39 | \"SubnetId\": \"${SUBNET}\", 40 | \"EmrManagedMasterSecurityGroup\": \"${MASTER_SECURITY_GROUP}\", 41 | \"EmrManagedSlaveSecurityGroup\": \"${WORKER_SECURITY_GROUP}\", 42 | \"ServiceAccessSecurityGroup\": \"${SERVICE_ACCESS_SG}\", 43 | \"AdditionalMasterSecurityGroups\": [\"${SANDBOX_SG}\"], 44 | \"AdditionalSlaveSecurityGroups\": [\"${SANDBOX_SG}\"] 45 | }" \ 46 | --service-role EMR_DefaultRole \ 47 | --release-label emr-5.29.0 \ 48 | --name "$CLUSTER_NAME" \ 49 | --instance-groups "[ 50 | { 51 | \"InstanceCount\": 1, 52 | \"BidPrice\": \"OnDemandPrice\", 53 | \"InstanceGroupType\": \"MASTER\", 54 | \"InstanceType\": \"${BATCH_MASTER_INSTANCE_TYPE}\", 55 | \"Name\":\"Master\", 56 | \"EbsConfiguration\": { 57 | \"EbsOptimized\": true, 58 | \"EbsBlockDeviceConfigs\": [{ 59 | \"VolumeSpecification\": { 60 | \"VolumeType\": \"gp2\", 61 | \"SizeInGB\": 1024 62 | } 63 | }] 64 | } 65 | }, { 66 | \"InstanceCount\": ${NUM_EXECUTORS}, 67 | \"BidPrice\": \"OnDemandPrice\", 68 | \"InstanceGroupType\": \"CORE\", 69 | \"InstanceType\": \"${BATCH_CORE_INSTANCE_TYPE}\", 70 | \"Name\":\"Workers\", 71 | \"EbsConfiguration\": { 72 | \"EbsOptimized\": true, 73 | \"EbsBlockDeviceConfigs\": [{ 74 | \"VolumeSpecification\": { 75 | \"VolumeType\": \"gp2\", 76 | \"SizeInGB\": 1024 77 | } 78 | }] 79 | } 80 | } 81 | ]" \ 82 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 83 | --auto-terminate \ 84 | --region us-east-1 \ 85 | --steps "[ 86 | $ARGS 87 | ]" 88 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/create-log-groups.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | DEFINED_GROUPS=$(aws logs describe-log-groups | jq '.logGroups[].logGroupName' | sed -e 's/"//g') 9 | 10 | if [[ $DEFINED_GROUPS != *"/ecs/${AWS_LOG_GROUP}"* ]]; then 11 | aws logs create-log-group \ 12 | --log-group-name /ecs/${AWS_LOG_GROUP} 13 | fi 14 | 15 | if [[ $DEFINED_GROUPS != *"/ecs/${AWS_LOG_GROUP}${TASK_SUFFIX}"* ]]; then 16 | aws logs create-log-group \ 17 | --log-group-name /ecs/${AWS_LOG_GROUP}${TASK_SUFFIX} 18 | fi 19 | 20 | if [[ $DEFINED_GROUPS != *"/ecs/streaming-user-footprint-tile-updater"* ]]; then 21 | aws logs create-log-group \ 22 | --log-group-name /ecs/streaming-user-footprint-tile-updater 23 | fi 24 | 25 | if [[ $DEFINED_GROUPS != *"/ecs/streaming-edit-histogram-tile-updater"* ]]; then 26 | aws logs create-log-group \ 27 | --log-group-name /ecs/streaming-edit-histogram-tile-updater 28 | fi 29 | 30 | if [[ $DEFINED_GROUPS != *"/ecs/osmesa-streaming-augdiff-producer"* ]]; then 31 | aws logs create-log-group \ 32 | --log-group-name /ecs/osmesa-streaming-augdiff-producer 33 | fi 34 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-production-streaming-update-tasks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family streaming-stats-updater \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "1 vCPU" \ 15 | --memory "4 GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"/spark/bin/spark-submit\", 28 | \"--driver-memory\", \"2048m\", 29 | \"--class\", \"osmesa.apps.streaming.StreamingChangesetStatsUpdater\", 30 | \"/opt/osmesa-apps.jar\", 31 | \"--augmented-diff-source\", \"${AUGDIFF_SOURCE}\" 32 | ], 33 | \"environment\": [ 34 | { 35 | \"name\": \"DATABASE_URL\", 36 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 37 | } 38 | ], 39 | \"image\": \"${ECR_STATS_IMAGE}:production\", 40 | \"name\": \"streaming-changeset-stats-updater\" 41 | }, 42 | { 43 | \"logConfiguration\": { 44 | \"logDriver\": \"awslogs\", 45 | \"options\": { 46 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}\", 47 | \"awslogs-region\": \"${AWS_REGION}\", 48 | \"awslogs-stream-prefix\": \"ecs\" 49 | } 50 | }, 51 | \"command\": [ 52 | \"/spark/bin/spark-submit\", 53 | \"--driver-memory\", \"2048m\", 54 | \"--class\", \"osmesa.apps.streaming.StreamingChangesetMetadataUpdater\", 55 | \"/opt/osmesa-apps.jar\", 56 | \"--changeset-source\", \"${CHANGESET_SOURCE}\" 57 | ], 58 | \"environment\": [ 59 | { 60 | \"name\": \"DATABASE_URL\", 61 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 62 | } 63 | ], 64 | \"image\": \"${ECR_STATS_IMAGE}:production\", 65 | \"name\": \"streaming-changeset-metadata-updater\" 66 | } 67 | ]" 68 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-production-view-refresher.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family osmesa-stats-view-refresher \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "0.25 vCPU" \ 15 | --memory "0.5 GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"refresh-views.sh\" 28 | ], 29 | \"environment\": [ 30 | { 31 | \"name\": \"DATABASE_URL\", 32 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 33 | }, 34 | { 35 | \"name\": \"DATABASE_NAME\", 36 | \"value\": \"${PRODUCTION_DB}\" 37 | } 38 | ], 39 | \"image\": \"${ECR_REFRESH_IMAGE}:production\", 40 | \"name\": \"stats-view-refresher\" 41 | } 42 | ]" 43 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-staging-streaming-update-tasks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family streaming-stats-updater${TASK_SUFFIX} \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "1 vCPU" \ 15 | --memory "${ECS_MEMORY_GB} GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}${TASK_SUFFIX}\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"/spark/bin/spark-submit\", 28 | \"--driver-memory\", \"${DRIVER_MEMORY}\", 29 | \"--class\", \"osmesa.apps.streaming.StreamingChangesetStatsUpdater\", 30 | \"/opt/osmesa-apps.jar\", 31 | \"--augmented-diff-source\", \"${AUGDIFF_SOURCE}\" 32 | ], 33 | \"environment\": [ 34 | { 35 | \"name\": \"DATABASE_URL\", 36 | \"value\": \"${DB_BASE_URI}/${STAGING_DB}\" 37 | } 38 | ], 39 | \"image\": \"${ECR_STATS_IMAGE}:${CONTAINER_TAG}\", 40 | \"name\": \"streaming-changeset-stats-updater${TASK_SUFFIX}\" 41 | }, 42 | { 43 | \"logConfiguration\": { 44 | \"logDriver\": \"awslogs\", 45 | \"options\": { 46 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}${TASK_SUFFIX}\", 47 | \"awslogs-region\": \"${AWS_REGION}\", 48 | \"awslogs-stream-prefix\": \"ecs\" 49 | } 50 | }, 51 | \"command\": [ 52 | \"/spark/bin/spark-submit\", 53 | \"--driver-memory\", \"${DRIVER_MEMORY}\", 54 | \"--class\", \"osmesa.apps.streaming.StreamingChangesetMetadataUpdater\", 55 | \"/opt/osmesa-apps.jar\", 56 | \"--changeset-source\", \"${CHANGESET_SOURCE}\" 57 | ], 58 | \"environment\": [ 59 | { 60 | \"name\": \"DATABASE_URL\", 61 | \"value\": \"${DB_BASE_URI}/${STAGING_DB}\" 62 | } 63 | ], 64 | \"image\": 
\"${ECR_STATS_IMAGE}:${CONTAINER_TAG}\", 65 | \"name\": \"streaming-changeset-metadata-updater${TASK_SUFFIX}\" 66 | } 67 | ]" 68 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-staging-view-refresher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family osmesa-stats-view-refresher${TASK_SUFFIX} \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "0.25 vCPU" \ 15 | --memory "0.5 GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}${TASK_SUFFIX}\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"refresh-views.sh\" 28 | ], 29 | \"environment\": [ 30 | { 31 | \"name\": \"DATABASE_URL\", 32 | \"value\": \"${DB_BASE_URI}/${STAGING_DB}\" 33 | }, 34 | { 35 | \"name\": \"DATABASE_NAME\", 36 | \"value\": \"${STAGING_DB}\" 37 | } 38 | ], 39 | \"image\": \"${ECR_REFRESH_IMAGE}:${CONTAINER_TAG}\", 40 | \"name\": \"stats-view-refresher${TASK_SUFFIX}\" 41 | } 42 | ]" 43 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-streaming-augdiff-producer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family "${AUGDIFF_SERVICE_NAME}${TASK_SUFFIX}" \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "1 vCPU" \ 15 | --memory "${AUGDIFF_ECS_MEMORY_GB} GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/osmesa-streaming-augdiff-producer\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"-s\", 28 | \"onramp\", 29 | \"${AUGDIFF_SOURCE}\" 30 | ], 31 | \"environment\": [ 32 | { 33 | \"name\": \"OVERPASS_URL\", 34 | \"value\": \"${OVERPASS_URL}\" 35 | }, 36 | { 37 | \"name\": \"ONRAMP_URL\", 38 | \"value\": \"${ONRAMP_URL}\" 39 | }, 40 | { 41 | \"name\": \"NODE_OPTIONS\", 42 | \"value\": \"${NODE_OPTIONS}\" 43 | } 44 | ], 45 | \"image\": \"${AUGDIFF_ECR_IMAGE}\", 46 | \"name\": \"${AUGDIFF_SERVICE_NAME}\" 47 | } 48 | ]" 49 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-streaming-vectortile-tasks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family streaming-edit-histogram-tile-updater \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "4 vCPU" \ 15 | --memory "30 GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/streaming-edit-histogram-tile-updater\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"/spark/bin/spark-submit\", 28 | \"--driver-memory\", \"27G\", 29 | \"--class\", \"osmesa.apps.streaming.StreamingFacetedEditHistogramTileUpdater\", 30 | \"/opt/osmesa-apps.jar\", 31 | \"--augmented-diff-source\", \"${AUGDIFF_SOURCE}\", 32 | \"--tile-source\", \"${HISTOGRAM_VT_LOCATION}\", 33 | \"--batch-size\", \"4\" 34 | ], 35 | \"environment\": [ 36 | { 37 | \"name\": \"DATABASE_URL\", 38 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 39 | } 40 | ], 41 | \"image\": \"${ECR_STATS_IMAGE}:production\", 42 | \"name\": \"streaming-edit-histogram-tile-updater\" 43 | } 44 | ]" 45 | 46 | aws ecs register-task-definition \ 47 | --family streaming-user-footprint-tile-updater \ 48 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 49 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 50 | --network-mode awsvpc \ 51 | --requires-compatibilities EC2 FARGATE \ 52 | --cpu "2 vCPU" \ 53 | --memory "8 GB" \ 54 | --container-definitions "[ 55 | { 56 | \"logConfiguration\": { 57 | \"logDriver\": \"awslogs\", 58 | \"options\": { 59 | \"awslogs-group\": \"/ecs/streaming-user-footprint-tile-updater\", 60 | \"awslogs-region\": \"${AWS_REGION}\", 61 | \"awslogs-stream-prefix\": \"ecs\" 62 | } 63 | }, 64 | \"command\": [ 65 | \"/spark/bin/spark-submit\", 66 | \"--driver-memory\", \"7G\", 67 | \"--class\", \"osmesa.apps.streaming.StreamingUserFootprintTileUpdater\", 68 | \"/opt/osmesa-apps.jar\", 69 | \"--change-source\", \"${CHANGE_SOURCE}\", 70 | \"--tile-source\", \"${FOOTPRINT_VT_LOCATION}\", 71 | \"--batch-size\", \"4\" 72 | ], 73 | \"environment\": [ 74 | { 75 | \"name\": \"DATABASE_URL\", 76 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 77 | } 78 | ], 79 | \"image\": \"${ECR_STATS_IMAGE}:production\", 80 | \"name\": \"streaming-user-footprint-tile-updater\" 81 | } 82 | ]" 83 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/deploy-stats-refresher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | aws events put-rule --schedule-expression "rate(1 minute)" --name osmesa-stats-view-refresher${TASK_SUFFIX} 9 | aws events put-targets \ 10 | --rule "osmesa-stats-view-refresher${TASK_SUFFIX}" \ 11 | --targets "[ 12 | { 13 | \"Id\": \"osmesa-stats-view-refresher${TASK_SUFFIX}\", 14 | \"Arn\": \"arn:aws:ecs:${AWS_REGION}:${IAM_ACCOUNT}:cluster/${ECS_CLUSTER}\", 15 | \"RoleArn\": \"arn:aws:iam::${IAM_ACCOUNT}:role/ecsEventsRole\", 16 | \"EcsParameters\": { 17 | \"TaskDefinitionArn\": \"arn:aws:ecs:${AWS_REGION}:${IAM_ACCOUNT}:task-definition/osmesa-stats-view-refresher${TASK_SUFFIX}\", 18 | \"TaskCount\": 1, 19 | \"LaunchType\": \"FARGATE\", 20 | \"NetworkConfiguration\": { 21 | \"awsvpcConfiguration\": { 22 | \"Subnets\": [\"${ECS_SUBNET}\"], 23 | \"SecurityGroups\": [\"${ECS_SECURITY_GROUP}\"], 24 | \"AssignPublicIp\": \"DISABLED\" 25 | } 26 | } 27 | } 28 | } 29 | ]" 30 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/emr-configurations/batch-process.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "spark", 4 | "Properties": { 5 | "maximizeResourceAllocation": "false" 6 | } 7 | }, 8 | { 9 | "Classification": "spark-defaults", 10 | "Properties": { 11 | "spark.dynamicAllocation.enabled": "true", 12 | "spark.shuffle.service.enabled": "true", 13 | "spark.shuffle.compress": "true", 14 | "spark.shuffle.spill.compress": "true", 15 | "spark.sql.shuffle.partitions": "2000", 16 | "spark.speculation": "true", 17 | "spark.rdd.compress": "true", 18 | "spark.executor.memory": "2G", 19 | "spark.executor.memoryOverhead": "1G", 20 | "spark.driver.cores": "2", 21 | "spark.driver.memory": "10G", 22 | "spark.driver.memoryOverhead": "1G", 23 | "spark.driver.maxResultSize": "3G", 24 | "spark.executor.extraJavaOptions" : "-XX:+UseParallelGC -Dgeotrellis.s3.threads.rdd.write=64" 25 | } 26 | }, 27 | { 28 | "Classification": "hdfs-site", 29 | "Properties": { 30 | "dfs.replication": "1", 31 | "dfs.permissions": "false", 32 | "dfs.datanode.max.xcievers": "16384", 33 | "dfs.datanode.max.transfer.threads": "16384", 34 | "dfs.datanode.balance.max.concurrent.moves": "1000", 35 | "dfs.datanode.balance.bandwidthPerSec": "100000000" 36 | } 37 | }, 38 | { 39 | "Classification": "yarn-site", 40 | "Properties": { 41 | "yarn.resourcemanager.am.max-attempts": "1", 42 | "yarn.nodemanager.vmem-check-enabled": "false", 43 | "yarn.nodemanager.pmem-check-enabled": "false" 44 | } 45 | }, 46 | { 47 | "Classification": "hadoop-env", 48 | "Configurations": [ 49 | { 50 | "Classification": "export", 51 | "Properties": { 52 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 53 | "GDAL_DATA": "/usr/local/share/gdal", 54 | "LD_LIBRARY_PATH": "/usr/local/lib", 55 | "PYSPARK_PYTHON": "python27", 56 | "PYSPARK_DRIVER_PYTHON": "python27" 57 | } 58 | } 59 | ] 60 | }, 61 | { 62 | "Classification": "spark-env", 63 | "Configurations": [ 64 | { 65 | "Classification": "export", 66 | "Properties": { 67 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 68 | "GDAL_DATA": "/usr/local/share/gdal", 69 | "LD_LIBRARY_PATH": "/usr/local/lib", 70 | "SPARK_PRINT_LAUNCH_COMMAND": "1", 71 | "PYSPARK_PYTHON": "python27", 72 | "PYSPARK_DRIVER_PYTHON": "python27" 73 | } 74 | } 75 | ] 76 | }, 77 | { 78 | "Classification": "yarn-env", 79 | "Configurations": [ 80 | { 81 | "Classification": "export", 82 | "Properties": { 83 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 84 | "GDAL_DATA": "/usr/local/share/gdal", 85 | 
"LD_LIBRARY_PATH": "/usr/local/lib", 86 | "PYSPARK_PYTHON": "python27", 87 | "PYSPARK_DRIVER_PYTHON": "python27" 88 | } 89 | } 90 | ] 91 | } 92 | ] 93 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/expand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | PROG=$(basename $0) 6 | 7 | usage() 8 | { 9 | echo "${PROG} " 10 | } 11 | 12 | expand() 13 | { 14 | local template="$(cat $1)" 15 | eval "echo \"${template}\"" 16 | } 17 | 18 | case $# in 19 | 1) expand "$1";; 20 | *) usage; exit 0;; 21 | esac 22 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/get-tag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$(git branch | grep '* master')" = "* master" ]; then 4 | while true; do 5 | >&2 echo "You are on the master branch. Do you wish to publish to the production tag?" 6 | select yn in "Yes" "No"; do 7 | case $yn in 8 | Yes ) VERSION_TAG="production"; break;; 9 | No ) VERSION_TAG="latest"; break;; 10 | esac 11 | done 12 | break 13 | done 14 | else 15 | if [ -z ${OVERRIDE_TAG+x} ]; then 16 | VERSION_TAG="latest" 17 | else 18 | VERSION_TAG=${OVERRIDE_TAG} 19 | fi 20 | fi 21 | 22 | echo -n "${VERSION_TAG}" 23 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/latest-history-to-orc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install/build osm2orc 4 | cd /mnt 5 | sudo yum install -y git 6 | git clone https://github.com/mojodna/osm2orc.git 7 | cd osm2orc 8 | ./gradlew distTar 9 | tar xf build/distributions/osm2orc-*.tar -C /tmp 10 | 11 | # Download latest planet history file 12 | aws s3 cp $PLANET_HISTORY_PBF /mnt 13 | 14 | # Convert to ORC 15 | DATE=$(stat /mnt/planet-history.osm.pbf | sed -n -e 's/-//g;s/Modify: \([0-9\-]*\).*/\1/p') 16 | /tmp/osm2orc-*/bin/osm2orc /mnt/planet-history.osm.pbf /mnt/planet-${DATE}.osh.orc 17 | 18 | # Upload ORC 19 | aws s3 cp /mnt/planet-${DATE}.osh.orc $PLANET_HISTORY_ORC_DIR 20 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/stop-streaming-service.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | SERVICE=$1 9 | echo "Attempting to stop $SERVICE on cluster $ECS_CLUSTER" 10 | 11 | check_status() { 12 | STATUS=$(aws ecs describe-services --services $SERVICE --cluster $ECS_CLUSTER | jq '.services[].status') 13 | } 14 | 15 | check_status 16 | if [[ $STATUS == "\"ACTIVE\"" ]]; then 17 | aws ecs delete-service --service $SERVICE --cluster $ECS_CLUSTER --force 18 | echo "Waiting for shut down" 19 | check_status 20 | while [[ $STATUS != "\"INACTIVE\"" ]]; do 21 | echo " current status: $STATUS, still waiting" 22 | sleep 15s 23 | check_status 24 | done 25 | echo " final status: $STATUS" 26 | else 27 | echo "Status was $STATUS, nothing to stop" 28 | fi 29 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.3' 2 | services: 3 | backend: 4 | image: "osmesa-streaming-stats:${VERSION_TAG}" 5 | build: 6 | context: ./src 7 | dockerfile: Dockerfile.apps 8 | 9 | refresher: 10 | image: "osmesa-stats-refresher:${VERSION_TAG}" 11 | build: 12 | context: ./src 13 | dockerfile: Dockerfile.refresh 14 | 15 | database: 16 | image: quay.io/azavea/postgis:2.4-postgres10.6-slim 17 | environment: 18 | - POSTGRES_USER=osmesa_stats 19 | - POSTGRES_PASSWORD=osmesa_stats 20 | - POSTGRES_DB=osmesa_stats 21 | healthcheck: 22 | test: ["CMD", "pg_isready", "-U", "osmesa_stats"] 23 | interval: 3s 24 | timeout: 3s 25 | retries: 3 26 | start_period: 5s 27 | ports: 28 | - 5433:5432 29 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.2 2 | -------------------------------------------------------------------------------- /scripts/cibuild: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ -n "${OSMESA_DEBUG}" ]]; then 6 | set -x 7 | fi 8 | 9 | if [ -z ${VERSION_TAG+x} ]; then 10 | VERSION_TAG="$(git rev-parse --short HEAD)" 11 | echo "VERSION_TAG was unset; using ${VERSION_TAG}" 12 | fi 13 | 14 | DIR="$(dirname "$0")/../" 15 | 16 | if [ "${BASH_SOURCE[0]}" = "${0}" ]; then 17 | echo "Building Scala assembly JAR" 18 | pushd "${DIR}/src" 19 | ./sbt "apps/assembly" 20 | popd 21 | 22 | VERSION_TAG="${VERSION_TAG}" docker-compose \ 23 | -f docker-compose.yml \ 24 | build backend refresher 25 | fi 26 | -------------------------------------------------------------------------------- /scripts/cipublish: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ -n "${OSMESA_DEBUG}" ]]; then 6 | set -x 7 | fi 8 | 9 | if [ -z ${VERSION_TAG+x} ]; then 10 | VERSION_TAG="$(git rev-parse --short HEAD)" 11 | echo "VERSION_TAG was unset; using ${VERSION_TAG}" 12 | fi 13 | 14 | DIR="$(dirname "$0")/../" 15 | 16 | if [ "${BASH_SOURCE[0]}" = "${0}" ]; then 17 | mkdir osmesa-dist 18 | cp src/apps/target/scala-2.11/osmesa-apps.jar osmesa-dist 19 | cp -r deployment/sql osmesa-dist 20 | 21 | docker tag "osmesa-streaming-stats:${VERSION_TAG}" "${APP_IMAGE_ECR_ENDPOINT}:${VERSION_TAG}" 22 | docker tag "osmesa-stats-refresher:${VERSION_TAG}" "${REFRESHER_IMAGE_ECR_ENDPOINT}:${VERSION_TAG}" 23 | 24 | eval "$(aws ecr get-login --no-include-email)" 25 | docker push "${APP_IMAGE_ECR_ENDPOINT}:${VERSION_TAG}" 26 | docker push "${REFRESHER_IMAGE_ECR_ENDPOINT}:${VERSION_TAG}" 27 | fi 28 | 
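
A minimal sketch of running the two CI scripts above by hand, under the assumption that you have AWS credentials with ECR access in your environment; in CI these variables come from the build environment, and the ECR endpoints below are placeholders rather than real repository URIs:

```bash
# VERSION_TAG is optional -- both scripts fall back to the short git SHA.
export VERSION_TAG="$(git rev-parse --short HEAD)"
# Placeholder ECR repositories; cipublish requires both to be set.
export APP_IMAGE_ECR_ENDPOINT="<account-id>.dkr.ecr.us-east-1.amazonaws.com/osmesa-streaming-stats"
export REFRESHER_IMAGE_ECR_ENDPOINT="<account-id>.dkr.ecr.us-east-1.amazonaws.com/osmesa-stats-refresher"

./scripts/cibuild      # builds osmesa-apps.jar and the backend/refresher images
./scripts/cipublish    # assembles osmesa-dist, then tags and pushes both images to ECR
```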
-------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- /src/.sbtopts: -------------------------------------------------------------------------------- 1 | -J-Xmx2g 2 | -J-XX:+CMSClassUnloadingEnabled 3 | -J-XX:+UseConcMarkSweepGC 4 | -Djava.awt.headless=true 5 | -Dsun.io.serialization.extendedDebugInfo=true 6 | -Dhttps.protocols=TLSv1,TLSv1.1,TLSv1.2 -------------------------------------------------------------------------------- /src/.scalafmt.conf: -------------------------------------------------------------------------------- 1 | maxColumn = 100 2 | -------------------------------------------------------------------------------- /src/Dockerfile.apps: -------------------------------------------------------------------------------- 1 | FROM bde2020/spark-master:2.4.4-hadoop2.7 2 | 3 | COPY apps/target/scala-2.11/osmesa-apps.jar /opt/osmesa-apps.jar 4 | COPY docker/log4j.properties /spark/conf/ 5 | ENV PATH=$PATH:/spark/bin 6 | 7 | WORKDIR /opt 8 | ENTRYPOINT ["spark-submit"] 9 | -------------------------------------------------------------------------------- /src/Dockerfile.refresh: -------------------------------------------------------------------------------- 1 | FROM alpine:3.12 2 | 3 | RUN apk update && apk add bash postgresql-client 4 | COPY docker/refresh-views.sh /usr/local/bin/refresh-views.sh 5 | 6 | WORKDIR /opt 7 | -------------------------------------------------------------------------------- /src/analytics/.envrc: -------------------------------------------------------------------------------- 1 | test -f .env && dotenv 2 | -------------------------------------------------------------------------------- /src/analytics/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | package-lock.json 3 | .env 4 | -------------------------------------------------------------------------------- /src/analytics/bin/apply.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eo pipefail 4 | 5 | OPTIND=1 # Reset in case getopts has been used previously in the shell. 6 | 7 | replication_source="" 8 | tile_source="" 9 | 10 | while getopts "r:s:t:" opt; do 11 | case "$opt" in 12 | r) replication_source=$OPTARG 13 | ;; 14 | s) sequence=$OPTARG 15 | ;; 16 | t) tile_source=$OPTARG 17 | ;; 18 | esac 19 | done 20 | 21 | shift $((OPTIND-1)) 22 | 23 | [ "$1" = "--" ] && shift 24 | 25 | if [[ -z $sequence ]]; then 26 | sequence=$(aws s3 cp ${tile_source}sequence.txt - 2> /dev/null) 27 | else 28 | sequence=$[$sequence - 1] 29 | fi 30 | 31 | if [[ "$sequence" == "-1" || -z $replication_source || -z $tile_source ]]; then 32 | echo "Usage: $0 -r -t -s [initial sequence] -- [update-tiles options]" 33 | exit 1 34 | fi 35 | 36 | echo "Starting at sequence $(echo $[$sequence + 1])" 37 | 38 | while true; do 39 | set +e 40 | aws s3 ls ${replication_source}$((sequence + 1)).json > /dev/null 41 | retcode=$? 42 | set -e 43 | 44 | if [[ $retcode -eq 0 ]]; then 45 | sequence=$[$sequence + 1] 46 | 47 | $(dirname $0)/update-tiles -r $replication_source -t $tile_source -s urchn -l history -v $* $sequence 48 | 49 | echo $sequence | aws s3 cp - ${tile_source}sequence.txt 50 | else 51 | echo Waiting for $((sequence + 1))... 
52 | sleep 15 53 | fi 54 | done 55 | -------------------------------------------------------------------------------- /src/analytics/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "osmesa-analytics" 4 | 5 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.6.7" 6 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.7" 7 | dependencyOverrides += "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.6.7" 8 | 9 | libraryDependencies ++= Seq( 10 | postgresql, 11 | decline, 12 | sparkHive % Provided, 13 | sparkJts, 14 | gtGeotools, 15 | gtS3, 16 | gtSpark, 17 | gtVector, 18 | gtVectorTile, 19 | vectorpipe, 20 | cats, 21 | scalactic, 22 | gtSparkTestKit, 23 | logging, 24 | log4j2, 25 | scalatest 26 | ) 27 | 28 | /* Fixes Spark breakage with `sbt run` as of sbt-1.0.2 */ 29 | fork in run := true 30 | 31 | fork in Test := true 32 | 33 | test in assembly := {} 34 | 35 | javaOptions ++= Seq("-Xmx5G") 36 | 37 | initialCommands in console := 38 | """ 39 | """ 40 | 41 | assemblyJarName in assembly := "osmesa-analytics.jar" 42 | 43 | assemblyShadeRules in assembly := { 44 | // TODO: Do we still need these shade rules? 45 | val shadePackage = "com.azavea.shaded.demo" 46 | Seq( 47 | ShadeRule.rename("com.google.common.**" -> s"$shadePackage.google.common.@1") 48 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-cassandra" % Version.geotrellis).inAll, 49 | ShadeRule.rename("io.netty.**" -> s"$shadePackage.io.netty.@1") 50 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-hbase" % Version.geotrellis).inAll, 51 | ShadeRule.rename("com.fasterxml.jackson.**" -> s"$shadePackage.com.fasterxml.jackson.@1") 52 | .inLibrary("com.networknt" % "json-schema-validator" % "0.1.7").inAll, 53 | ShadeRule.rename("org.apache.avro.**" -> s"$shadePackage.org.apache.avro.@1") 54 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-spark" % Version.geotrellis).inAll 55 | ) 56 | } 57 | 58 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 59 | -------------------------------------------------------------------------------- /src/analytics/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.2.1 2 | -------------------------------------------------------------------------------- /src/analytics/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 9 | # log level for this class is used to overwrite the root logger's log level, so that 10 | # the user can have different defaults for the shell and regular Spark apps. 
11 | log4j.logger.org.apache.spark.repl.Main=WARN 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.spark_project.jetty=WARN 15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 24 | -------------------------------------------------------------------------------- /src/analytics/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.out 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | # log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c: %m%n 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 7 | log4j.logger.osmesa=DEBUG -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/Analytics.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import geotrellis.spark.store.kryo.KryoRegistrator 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.serializer.KryoSerializer 6 | import org.apache.spark.sql._ 7 | import org.locationtech.geomesa.spark.jts._ 8 | 9 | object Analytics { 10 | def sparkSession(appName: String): SparkSession = { 11 | val conf = new SparkConf() 12 | .setIfMissing("spark.master", "local[*]") 13 | .setAppName(s"OSMesa Analytics - ${appName}") 14 | .set("spark.sql.orc.impl", "native") 15 | .set("spark.sql.orc.filterPushdown", "true") 16 | .set("spark.sql.parquet.mergeSchema", "false") 17 | .set("spark.sql.parquet.filterPushdown", "true") 18 | .set("spark.sql.hive.metastorePartitionPruning", "true") 19 | .set("spark.ui.showConsoleProgress", "true") 20 | .set("spark.serializer", classOf[KryoSerializer].getName) 21 | .set("spark.kryo.registrator", classOf[KryoRegistrator].getName) 22 | 23 | SparkSession.builder 24 | .config(conf) 25 | .enableHiveSupport 26 | .getOrCreate 27 | .withJTS 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/Countries.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import org.locationtech.jts.geom.Coordinate 4 | import org.locationtech.jts.geom.prep.{PreparedGeometry, PreparedGeometryFactory} 5 | import geotrellis.vector._ 6 | import geotrellis.vector.io.json._ 7 | import _root_.io.circe._ 8 | import _root_.io.circe.generic.semiauto._ 9 | 10 | 11 | case class CountryId(name: String, code: Short) 12 | object CountryId { 13 | implicit val countryIdDecoder: Decoder[CountryId] = deriveDecoder 14 | implicit val countryIdEncoder: Encoder[CountryId] = deriveEncoder 15 | } 16 | 17 | object Countries { 18 | def all: Vector[MultiPolygonFeature[CountryId]] = { 19 | val collection = 20 | Resource("countries.geojson"). 
21 | parseGeoJson[JsonFeatureCollection] 22 | 23 | val polys = 24 | collection. 25 | getAllPolygonFeatures[CountryId]. 26 | map(_.mapGeom(MultiPolygon(_))) 27 | 28 | val mps = 29 | collection. 30 | getAllMultiPolygonFeatures[CountryId] 31 | 32 | polys ++ mps 33 | } 34 | 35 | def byName: Map[String, MultiPolygonFeature[CountryId]] = 36 | all.map { f => (f.data.name, f) }.toMap 37 | 38 | def indexed: SpatialIndex[MultiPolygonFeature[CountryId]] = 39 | SpatialIndex.fromExtents(all) { mpf => mpf.geom.getEnvelopeInternal } 40 | 41 | } 42 | 43 | class CountryLookup() extends Serializable { 44 | private val index = 45 | SpatialIndex.fromExtents( 46 | Countries.all. 47 | map { mpf => 48 | (PreparedGeometryFactory.prepare(mpf.geom), mpf.data) 49 | } 50 | ) { case (pg, _) => pg.getGeometry.getEnvelopeInternal } 51 | 52 | def lookup(coord: Coordinate): Option[CountryId] = { 53 | val t = 54 | new Traversable[(PreparedGeometry, CountryId)] { 55 | override def foreach[U](f: ((PreparedGeometry, CountryId)) => U): Unit = { 56 | val visitor = new org.locationtech.jts.index.ItemVisitor { 57 | override def visitItem(obj: AnyRef): Unit = f(obj.asInstanceOf[(PreparedGeometry, CountryId)]) 58 | } 59 | index.rtree.query(new org.locationtech.jts.geom.Envelope(coord), visitor) 60 | } 61 | } 62 | 63 | t. 64 | find(_._1.covers(Point(coord.x, coord.y))). 65 | map(_._2) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/Resource.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import java.io._ 4 | 5 | object Resource { 6 | def apply(name: String): String = { 7 | val stream: InputStream = getClass.getResourceAsStream(s"/$name") 8 | try { scala.io.Source.fromInputStream( stream ).getLines.mkString(" ") } finally { stream.close() } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/S3Utils.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import java.net.{URI, URLDecoder} 4 | import java.nio.charset.StandardCharsets 5 | 6 | import geotrellis.store.s3.S3ClientProducer 7 | import software.amazon.awssdk.services.s3.S3Client 8 | import software.amazon.awssdk.services.s3.model.GetObjectRequest 9 | 10 | object S3Utils { 11 | def readText(uri: String): String = { 12 | val s3Client: S3Client = S3ClientProducer.get() 13 | val s3uri = URI.create(uri) 14 | val key = URLDecoder.decode(s3uri.getPath.drop(1), StandardCharsets.UTF_8.toString) 15 | val request = GetObjectRequest.builder() 16 | .bucket(s3uri.getHost) 17 | .key(key) 18 | .build() 19 | s3Client.getObjectAsBytes(request).asString(StandardCharsets.UTF_8) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/VectorGrid.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import java.io.ByteArrayInputStream 4 | import java.net.URI 5 | import java.util.zip.GZIPInputStream 6 | 7 | import geotrellis.proj4.WebMercator 8 | import geotrellis.layer.ZoomedLayoutScheme 9 | import geotrellis.vector.Extent 10 | import geotrellis.vectortile.{Layer, MVTFeature, VInt64, VectorTile} 11 | import org.apache.commons.io.IOUtils 12 | import org.apache.spark.internal.Logging 13 | import 
osmesa.analytics.updater.Implicits._ 14 | import osmesa.analytics.updater._ 15 | 16 | import scala.collection.GenMap 17 | import scala.collection.parallel.TaskSupport 18 | 19 | trait VectorGrid extends Logging { 20 | // Default base zoom (highest resolution tiles produced) 21 | val DefaultBaseZoom: Int = 10 22 | 23 | // Number of cells per side in a gridded tile 24 | implicit val Cells: Int = 128 25 | 26 | // Number of cells in a gridded tile at the base of the pyramid (may be used for over-zooming) 27 | val BaseCells: Int = Cells 28 | 29 | // Default upload concurrency 30 | val DefaultUploadConcurrency: Int = 8 31 | 32 | implicit val LayoutScheme: ZoomedLayoutScheme = ZoomedLayoutScheme(WebMercator) 33 | val SequenceLayerName: String = "__sequences__" 34 | 35 | def getCommittedSequences(tile: VectorTile): Set[Int] = 36 | // NOTE when working with hashtags, this should be the changeset sequence, since changes from a 37 | // single sequence may appear in different batches depending on when changeset metadata arrives 38 | tile.layers 39 | .get(SequenceLayerName) 40 | .map(_.features.flatMap(f => f.data.values.map(valueToLong).map(_.intValue))) 41 | .map(_.toSet) 42 | .getOrElse(Set.empty) 43 | 44 | def makeSequenceLayer(sequences: Set[Int], extent: Extent, tileWidth: Int = 4096): (String, Layer) = { 45 | // create a second layer w/ a feature corresponding to committed sequences (in the absence of 46 | // available tile / layer metadata) 47 | val updatedSequences = 48 | sequences.toSeq.sorted 49 | .takeRight(1000) 50 | .zipWithIndex 51 | .map { 52 | case (seq, idx) => 53 | idx.toString -> VInt64(seq) 54 | } 55 | .toMap 56 | 57 | val sequenceFeature = MVTFeature(extent.center, updatedSequences) 58 | 59 | makeLayer(SequenceLayerName, extent, Seq(sequenceFeature), tileWidth) 60 | } 61 | 62 | def loadMVTs(urls: Map[URI, Extent])( 63 | implicit taskSupport: TaskSupport): GenMap[URI, VectorTile] = { 64 | // convert to a parallel collection to load more tiles concurrently 65 | val parUrls = urls.par 66 | parUrls.tasksupport = taskSupport 67 | 68 | parUrls.map { 69 | case (uri, extent) => 70 | (uri, 71 | read(uri).map( 72 | bytes => 73 | VectorTile.fromBytes( 74 | IOUtils.toByteArray(new GZIPInputStream(new ByteArrayInputStream(bytes))), 75 | extent))) 76 | } filter { 77 | case (_, mvt) => mvt.isDefined 78 | } map { 79 | case (uri, mvt) => uri -> mvt.get 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/raster/MutableSparseIntTile.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.raster 2 | 3 | import geotrellis.raster.{ 4 | ArrayTile, 5 | CellType, 6 | IntCellType, 7 | IntCells, 8 | IntConstantNoDataCellType, 9 | IntTileVisitor, 10 | IntUserDefinedNoDataCellType, 11 | MutableArrayTile, 12 | NoDataHandling, 13 | Tile, 14 | isData 15 | } 16 | 17 | import scala.collection.mutable 18 | 19 | class MutableSparseIntTile(val cols: Int, 20 | val rows: Int, 21 | val values: scala.collection.mutable.LongMap[Int], 22 | val cellType: IntCells with NoDataHandling) 23 | extends MutableArrayTile { 24 | private val noDataValue = cellType match { 25 | case IntConstantNoDataCellType => Int.MinValue 26 | case IntUserDefinedNoDataCellType(nd) => nd 27 | case IntCellType => 0 28 | } 29 | 30 | override def updateDouble(i: Int, z: Double): Unit = update(i, z.toInt) 31 | 32 | override def update(i: Int, z: Int): Unit = { 33 | if (isData(z)) { 34 | 
values(i) = z 35 | } else { 36 | values.remove(i) 37 | } 38 | } 39 | 40 | def interpretAs(newCellType: CellType): Tile = { 41 | newCellType match { 42 | case dt: IntCells with NoDataHandling => 43 | MutableSparseIntTile(cols, rows, values, dt) 44 | case _ => 45 | withNoData(None).convert(newCellType) 46 | } 47 | } 48 | 49 | def withNoData(noDataValue: Option[Double]): Tile = 50 | MutableSparseIntTile(cols, rows, values, cellType.withNoData(noDataValue)) 51 | 52 | override def applyDouble(i: Int): Double = apply(i).toDouble 53 | 54 | override def apply(i: Int): Int = values.getOrElse(i, noDataValue) 55 | 56 | override def copy: ArrayTile = MutableSparseIntTile(cols, rows, values.clone(), cellType) 57 | 58 | // unimplemented because it doesn't make sense in this context (and MutableSparseIntTile can't be instantiated from 59 | // Array[Byte]) 60 | override def toBytes(): Array[Byte] = ??? 61 | 62 | def toMap: Map[Long, Int] = values.toMap 63 | 64 | override def foreachIntVisitor(visitor: IntTileVisitor): Unit = { 65 | values.foreach { 66 | case (k, v) => 67 | val col = k % cols 68 | val row = k / cols 69 | 70 | visitor(col.toInt, row.toInt, v) 71 | } 72 | } 73 | } 74 | 75 | object MutableSparseIntTile { 76 | def apply(cols: Int, 77 | rows: Int, 78 | values: mutable.LongMap[Int] = mutable.LongMap.empty[Int], 79 | cellType: IntCells with NoDataHandling = IntConstantNoDataCellType) = 80 | new MutableSparseIntTile(cols, rows, values, cellType) 81 | } 82 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/raster/SparseIntTile.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.raster 2 | 3 | import geotrellis.raster.{ 4 | ArrayTile, 5 | CellType, 6 | IntCellType, 7 | IntCells, 8 | IntConstantNoDataCellType, 9 | IntTileVisitor, 10 | IntUserDefinedNoDataCellType, 11 | MutableArrayTile, 12 | NoDataHandling, 13 | Tile 14 | } 15 | 16 | class SparseIntTile(val cols: Int, 17 | val rows: Int, 18 | val values: Map[Long, Int], 19 | val cellType: IntCells with NoDataHandling) 20 | extends ArrayTile { 21 | private val noDataValue = cellType match { 22 | case IntConstantNoDataCellType => Int.MinValue 23 | case IntUserDefinedNoDataCellType(nd) => nd 24 | case IntCellType => 0 25 | } 26 | 27 | def interpretAs(newCellType: CellType): Tile = { 28 | newCellType match { 29 | case dt: IntCells with NoDataHandling => 30 | SparseIntTile(cols, rows, values, dt) 31 | case _ => 32 | withNoData(None).convert(newCellType) 33 | } 34 | } 35 | 36 | def withNoData(noDataValue: Option[Double]): Tile = 37 | SparseIntTile(cols, rows, values, cellType.withNoData(noDataValue)) 38 | 39 | override def applyDouble(i: Int): Double = apply(i).toDouble 40 | 41 | override def apply(i: Int): Int = values.getOrElse(i, noDataValue) 42 | 43 | override def copy: ArrayTile = SparseIntTile(cols, rows, Map(values.toSeq: _*), cellType) 44 | 45 | // unimplemented because it doesn't make sense in this context (and SparseIntTile can't be instantiated from 46 | // Array[Byte]) 47 | override def toBytes(): Array[Byte] = ??? 
48 | 49 | def toMap: Map[Long, Int] = values 50 | 51 | override def mutable: MutableArrayTile = 52 | MutableSparseIntTile(cols, rows, scala.collection.mutable.LongMap(values.toSeq: _*), cellType) 53 | 54 | override def foreachIntVisitor(visitor: IntTileVisitor): Unit = { 55 | // NOTE only visits coordinates containing data; this isn't strictly correct for some uses 56 | values.foreach { 57 | case (k, v) => 58 | val col = k % cols 59 | val row = k / cols 60 | 61 | visitor(col.toInt, row.toInt, v) 62 | } 63 | } 64 | } 65 | 66 | object SparseIntTile { 67 | def apply(cols: Int, 68 | rows: Int, 69 | values: Map[Long, Int] = Map.empty[Long, Int], 70 | cellType: IntCells with NoDataHandling = IntConstantNoDataCellType) = 71 | new SparseIntTile(cols, rows, values, cellType) 72 | } 73 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/raster/package.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | import geotrellis.raster.{Raster, Tile, isData} 3 | 4 | package object raster { 5 | implicit class RasterMethods(val raster: Raster[Tile]) { 6 | def toMap: Map[Long, Int] = { 7 | raster.tile match { 8 | case tile: SparseIntTile => tile.toMap 9 | case tile: MutableSparseIntTile => tile.toMap 10 | case tile => 11 | tile 12 | .toArray() 13 | .zipWithIndex 14 | .filter(x => isData(x._1)) 15 | .map(x => (x._2.toLong, x._1)) 16 | .toMap 17 | } 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/stats/functions/package.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.stats 2 | 3 | import org.apache.spark.sql.expressions.UserDefinedFunction 4 | import org.apache.spark.sql.functions._ 5 | import vectorpipe.util._ 6 | 7 | package object functions { 8 | // A brief note about style 9 | // Spark functions are typically defined using snake_case, therefore so are the UDFs 10 | // internal helper functions use standard Scala naming conventions 11 | 12 | lazy val merge_measurements: UserDefinedFunction = udf(_mergeDoubleCounts) 13 | 14 | lazy val sum_measurements: UserDefinedFunction = udf { counts: Iterable[Map[String, Double]] => 15 | Option(counts.reduce(_mergeDoubleCounts)).filter(_.nonEmpty).orNull 16 | } 17 | 18 | lazy val sum_count_values: UserDefinedFunction = udf { counts: Map[String, Int] => 19 | counts.values.sum 20 | } 21 | 22 | lazy val simplify_measurements: UserDefinedFunction = udf { counts: Map[String, Double] => 23 | counts.filter(_._2 != 0) 24 | } 25 | 26 | lazy val simplify_counts: UserDefinedFunction = udf { counts: Map[String, Int] => 27 | counts.filter(_._2 != 0) 28 | } 29 | 30 | private val _mergeIntCounts = (a: Map[String, Int], b: Map[String, Int]) => 31 | mergeMaps(Option(a).getOrElse(Map.empty), 32 | Option(b).getOrElse(Map.empty))(_ + _) 33 | 34 | private val _mergeDoubleCounts = (a: Map[String, Double], b: Map[String, Double]) => 35 | mergeMaps(Option(a).getOrElse(Map.empty), 36 | Option(b).getOrElse(Map.empty))(_ + _) 37 | } 38 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/Implicits.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater 2 | 3 | import geotrellis.vectortile.{VInt64, VString, Value} 4 | 5 | object 
Implicits { 6 | implicit def valueToLong(x: Value): Long = (x: @unchecked) match { 7 | case y: VInt64 => y.value 8 | case y: VString => y.value.toLong 9 | } 10 | 11 | implicit def valueToString(x: Value): String = (x: @unchecked) match { 12 | case y: VInt64 => y.value.toString 13 | case y: VString => y.value 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/Schema.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater 2 | 3 | import java.sql.Timestamp 4 | import java.time.Instant 5 | 6 | import geotrellis.vector.Geometry 7 | import geotrellis.vectortile.{Layer, MVTFeature} 8 | import org.apache.log4j.Logger 9 | import osmesa.analytics.updater.Implicits._ 10 | 11 | trait Schema { 12 | val layer: Layer 13 | val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)] 14 | 15 | val newFeatures: Seq[MVTFeature[Geometry]] 16 | lazy val replacementFeatures: Seq[MVTFeature[Geometry]] = Seq.empty[MVTFeature[Geometry]] 17 | lazy val retainedFeatures: Seq[MVTFeature[Geometry]] = Seq.empty[MVTFeature[Geometry]] 18 | 19 | protected lazy val logger: Logger = Logger.getLogger(getClass) 20 | 21 | protected lazy val touchedFeatures: Map[String, Seq[MVTFeature[Geometry]]] = 22 | Map.empty[String, Seq[MVTFeature[Geometry]]] 23 | 24 | protected lazy val versionInfo: Map[String, (Int, Int, Timestamp)] = 25 | touchedFeatures 26 | .mapValues(_.last) 27 | .mapValues( 28 | f => 29 | ( 30 | f.data("__version").toInt, 31 | f.data("__minorVersion").toInt, 32 | Timestamp.from(Instant.ofEpochMilli(f.data("__updated"))) 33 | )) 34 | 35 | protected lazy val minorVersions: Map[String, Int] = 36 | features 37 | .mapValues { 38 | case (_, curr) => curr.data 39 | } 40 | .map { 41 | case (id, f) => 42 | versionInfo.get(id) match { 43 | case Some((prevVersion, _, _)) if prevVersion < f.version => (id, 0) 44 | case Some((prevVersion, prevMinorVersion, _)) if prevVersion == f.version => 45 | (id, prevMinorVersion + 1) 46 | case _ => (id, 0) 47 | } 48 | } 49 | } 50 | 51 | trait SchemaBuilder { 52 | val layerName: String 53 | 54 | def apply(layer: Layer, 55 | features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]): Schema 56 | } 57 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/TileUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater 2 | 3 | import java.io.File 4 | import java.net.URI 5 | import java.nio.file.Path 6 | 7 | import cats.implicits._ 8 | import com.monovore.decline._ 9 | import org.apache.log4j.Logger 10 | import osmesa.analytics.updater.schemas._ 11 | 12 | class TileUpdater 13 | 14 | object TileUpdater extends CommandApp( 15 | name = "update-tiles", 16 | header = "Update vector tiles with changes from an augmented diff", 17 | main = { 18 | val rootURI = new File("").toURI 19 | 20 | val replicationSourceOpt = 21 | Opts.option[URI]( 22 | "replication-source", 23 | short = "r", 24 | metavar = "uri", 25 | help = "URI prefix for replication files" 26 | ).withDefault(rootURI) 27 | val tileSourceOpt = Opts 28 | .option[URI]( 29 | "tile-source", 30 | short = "t", 31 | metavar = "uri", 32 | help = "URI prefix for vector tiles to update" 33 | ).withDefault(rootURI) 34 | val minZoomOpt = 35 | Opts.option[Int]( 36 | "min-zoom", 37 | short = "z", 38 | 
metavar = "zoom", 39 | help = "Minimum zoom to consider" 40 | ) 41 | val maxZoomOpt = 42 | Opts.option[Int]( 43 | "max-zoom", 44 | short = "Z", 45 | metavar = "zoom", 46 | help = "Maximum zoom to consider" 47 | ) 48 | val schemaOpt = 49 | Opts.option[String]( 50 | "schema", 51 | short = "s", 52 | metavar = "schema", 53 | help = "Schema" 54 | ).withDefault("snapshot") 55 | .validate("Must be a registered schema") { Schemas.keySet.contains(_) } 56 | .map { Schemas(_) } 57 | val listingOpt = 58 | Opts.option[Path]( 59 | "tiles", 60 | short = "T", 61 | metavar = "tile list", 62 | help = "List of tiles available for updating" 63 | ).orNone 64 | val dryRunOpt = 65 | Opts.flag( 66 | "dry-run", 67 | short = "n", 68 | help = "Dry run" 69 | ).orFalse 70 | val verboseOpt = 71 | Opts.flag( 72 | "verbose", 73 | short = "v", 74 | help = "Be verbose" 75 | ).orFalse 76 | val sequenceOpt = Opts.argument[Int]("sequence") 77 | 78 | val logger = Logger.getLogger(classOf[TileUpdater]) 79 | 80 | (replicationSourceOpt, 81 | tileSourceOpt, 82 | minZoomOpt, 83 | maxZoomOpt, 84 | schemaOpt, 85 | listingOpt, 86 | dryRunOpt, 87 | verboseOpt, 88 | sequenceOpt).mapN { 89 | (replicationSource, 90 | tileSource, 91 | minZoom, 92 | maxZoom, 93 | schema, 94 | listing, 95 | dryRun, 96 | verbose, 97 | sequence) => 98 | val replicationUri = replicationSource.resolve(s"$sequence.json") 99 | 100 | if (verbose) { 101 | println(s"Applying $replicationUri to $tileSource from zoom $minZoom to $maxZoom...") 102 | } 103 | 104 | readFeatures(replicationUri) match { 105 | case Some(features) => 106 | for (zoom <- minZoom to maxZoom) { 107 | updateTiles( 108 | tileSource = tileSource, 109 | zoom = zoom, 110 | schemaType = schema, 111 | features = features, 112 | listing = listing, 113 | process = (sk, tile) => { 114 | val filename = s"$zoom/${sk.col}/${sk.row}.mvt" 115 | val uri = tileSource.resolve(filename) 116 | 117 | if (dryRun) { 118 | println( 119 | s"Would write ${tile.toBytes.length.formatted("%,d")} bytes to $uri") 120 | } else { 121 | logger.info( 122 | s"Writing ${tile.toBytes.length.formatted("%,d")} bytes to $uri") 123 | // TODO gzip compress 124 | write(uri, tile.toBytes) 125 | } 126 | 127 | if (verbose) { 128 | println(filename) 129 | } 130 | } 131 | ) 132 | } 133 | 134 | case None => 135 | println(s"No features available for $sequence") 136 | System.exit(1) 137 | } 138 | } 139 | } 140 | ) 141 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/schemas/History.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater.schemas 2 | 3 | import java.sql.Timestamp 4 | import java.time.Instant 5 | 6 | import geotrellis.vector.Geometry 7 | import geotrellis.vectortile._ 8 | import osmesa.analytics.updater.Implicits._ 9 | import osmesa.analytics.updater._ 10 | 11 | class History( 12 | override val layer: Layer, 13 | override val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) 14 | extends Schema { 15 | override protected lazy val touchedFeatures: Map[String, Seq[MVTFeature[Geometry]]] = { 16 | val featureIds = features.keySet 17 | 18 | layer.features 19 | .filter(f => featureIds.contains(f.data("__id"))) 20 | .groupBy(f => f.data("__id"): String) 21 | .mapValues( 22 | fs => 23 | fs.sortWith(_.data("__minorVersion") < _.data("__minorVersion")) 24 | .sortWith(_.data("__version") < _.data("__version"))) 25 | } 26 | 27 | lazy val newFeatures: 
Seq[MVTFeature[Geometry]] = 28 | features 29 | .filter { 30 | case (id, (_, curr)) => 31 | versionInfo.get(id) match { 32 | case Some((_, _, prevTimestamp)) if curr.data.timestamp.after(prevTimestamp) => true 33 | case None => true 34 | case _ => false 35 | } 36 | } 37 | .filter { 38 | // filter out null geometries 39 | case (_, (_, curr)) => Option(curr.geom).isDefined && curr.isValid 40 | } 41 | .map { 42 | case (id, (_, curr)) => (id, makeFeature(curr, minorVersions.get(id))) 43 | } 44 | .values 45 | .filter(_.isDefined) 46 | .map(_.get) 47 | .toSeq 48 | 49 | override lazy val replacementFeatures: Seq[MVTFeature[Geometry]] = { 50 | val activeFeatures = touchedFeatures 51 | .filter { 52 | case (id, fs) => 53 | features(id)._2.data.timestamp 54 | .after(Timestamp.from(Instant.ofEpochMilli(fs.last.data("__updated")))) 55 | } 56 | 57 | val featuresToReplace = activeFeatures 58 | .mapValues(fs => fs.filter(_.data("__validUntil").toLong == 0)) 59 | .values 60 | .flatten 61 | .toSeq 62 | 63 | val replacedFeatures = featuresToReplace 64 | .map(f => updateFeature(f, features(f.data("__id"))._2.data.timestamp)) 65 | 66 | logger.info(s"Rewriting ${replacedFeatures.length.formatted("%,d")} features") 67 | 68 | replacedFeatures 69 | } 70 | 71 | override lazy val retainedFeatures: Seq[MVTFeature[Geometry]] = { 72 | val activeFeatures = touchedFeatures 73 | .filter { 74 | case (id, fs) => 75 | features(id)._2.data.timestamp 76 | .after(Timestamp.from(Instant.ofEpochMilli(fs.last.data("__updated")))) 77 | } 78 | 79 | activeFeatures 80 | .mapValues(fs => fs.filterNot(_.data("__validUntil").toLong == 0)) 81 | .values 82 | .flatten 83 | .toSeq 84 | } 85 | 86 | private def makeFeature(feature: AugmentedDiffFeature, 87 | minorVersion: Option[Int], 88 | validUntil: Option[Long] = None): Option[MVTFeature[Geometry]] = { 89 | val id = feature.data.id 90 | 91 | val elementId = feature.data.`type` match { 92 | case "node" => s"n$id" 93 | case "way" => s"w$id" 94 | case "relation" => s"r$id" 95 | case _ => id.toString 96 | } 97 | 98 | feature match { 99 | case _ if feature.geom.isValid => 100 | Some( 101 | MVTFeature( 102 | feature.geom, // when features are deleted, this will be the last geometry that was visible 103 | feature.data.tags.map { 104 | case (k, v) => (k, VString(v)) 105 | } ++ Map( 106 | "__id" -> VString(elementId), 107 | "__changeset" -> VInt64(feature.data.changeset), 108 | "__updated" -> VInt64(feature.data.timestamp.getTime), 109 | "__validUntil" -> VInt64(validUntil.getOrElse(0L)), 110 | "__version" -> VInt64(feature.data.version), 111 | "__uid" -> VInt64(feature.data.uid), 112 | "__user" -> VString(feature.data.user), 113 | "__visible" -> VBool(feature.data.visible.getOrElse(true)) 114 | ) ++ minorVersion 115 | .map(v => Map("__minorVersion" -> VInt64(v))) 116 | .getOrElse(Map.empty[String, Value]) 117 | ) 118 | ) 119 | case _ => None 120 | } 121 | } 122 | 123 | private def updateFeature(feature: MVTFeature[Geometry], validUntil: Timestamp): MVTFeature[Geometry] = { 124 | MVTFeature( 125 | feature.geom, 126 | feature.data.updated("__validUntil", VInt64(validUntil.getTime)) 127 | ) 128 | } 129 | } 130 | 131 | object History extends SchemaBuilder { 132 | override val layerName: String = "all" 133 | 134 | def apply(layer: Layer, 135 | features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) = 136 | new History(layer, features) 137 | } 138 | -------------------------------------------------------------------------------- 
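The History schema above leans on the version bookkeeping defined in Schema.scala: a feature's minor version resets to zero when a newer OSM version arrives, and increments when the same version is touched again within the tile. A minimal sketch of that rule, with illustrative names only (MinorVersionSketch and nextMinorVersion are not part of this repository):

// Illustrative only: mirrors the match in Schema.minorVersions; not project code.
object MinorVersionSketch {
  /** `prev` is the (version, minorVersion) pair already stored in the tile, if any. */
  def nextMinorVersion(prev: Option[(Int, Int)], incomingVersion: Int): Int =
    prev match {
      case Some((prevVersion, _)) if prevVersion < incomingVersion => 0            // new major version starts over
      case Some((prevVersion, prevMinor)) if prevVersion == incomingVersion => prevMinor + 1 // same version edited again
      case _ => 0                                                                  // feature not previously in the tile
    }

  def main(args: Array[String]): Unit = {
    println(nextMinorVersion(None, 1))         // unseen feature          -> 0
    println(nextMinorVersion(Some((2, 0)), 2)) // same version re-edited  -> 1
    println(nextMinorVersion(Some((2, 3)), 3)) // newer version arrived   -> 0
  }
}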
/src/analytics/src/main/scala/osmesa/analytics/updater/schemas/Snapshot.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater.schemas 2 | 3 | import geotrellis.vector.Geometry 4 | import geotrellis.vectortile.{Layer, MVTFeature, VInt64, VString} 5 | import osmesa.analytics.updater._ 6 | 7 | class Snapshot( 8 | override val layer: Layer, 9 | override val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) 10 | extends Schema { 11 | lazy val newFeatures: Seq[MVTFeature[Geometry]] = 12 | features.values 13 | .map(_._2) 14 | .filter(_.data.visible.getOrElse(true)) 15 | .map(makeFeature) 16 | .filter(_.isDefined) 17 | .map(_.get) 18 | .toSeq 19 | 20 | private def makeFeature(feature: AugmentedDiffFeature): Option[MVTFeature[Geometry]] = { 21 | val id = feature.data.id 22 | 23 | val elementId = feature.data.`type` match { 24 | case "node" => s"n$id" 25 | case "way" => s"w$id" 26 | case "relation" => s"r$id" 27 | case _ => id.toString 28 | } 29 | 30 | feature match { 31 | case _ if feature.geom.isValid => 32 | Some( 33 | MVTFeature( 34 | feature.geom, 35 | feature.data.tags.map { 36 | case (k, v) => (k, VString(v)) 37 | } ++ Map( 38 | "__id" -> VString(elementId), 39 | "__changeset" -> VInt64(feature.data.changeset), 40 | "__updated" -> VInt64(feature.data.timestamp.getTime), 41 | "__version" -> VInt64(feature.data.version), 42 | "__uid" -> VInt64(feature.data.uid), 43 | "__user" -> VString(feature.data.user) 44 | ) 45 | ) 46 | ) 47 | case _ => None 48 | } 49 | } 50 | } 51 | 52 | object Snapshot extends SchemaBuilder { 53 | override val layerName: String = "data" 54 | 55 | def apply(layer: Layer, 56 | features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) = 57 | new Snapshot(layer, features) 58 | } 59 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/schemas/Urchn.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater.schemas 2 | 3 | import geotrellis.vector.Geometry 4 | import geotrellis.vectortile._ 5 | import osmesa.analytics.updater.Implicits._ 6 | import osmesa.analytics.updater._ 7 | 8 | class Urchn( 9 | override val layer: Layer, 10 | override val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) 11 | extends Schema { 12 | override protected lazy val touchedFeatures: Map[String, Seq[MVTFeature[Geometry]]] = { 13 | val featureIds = features.keySet 14 | 15 | layer.features 16 | .filter(f => featureIds.contains(f.data("__id"))) 17 | .groupBy(f => f.data("__id"): String) 18 | .mapValues( 19 | fs => 20 | fs.sortWith(_.data("__minorVersion") < _.data("__minorVersion")) 21 | .sortWith(_.data("__version") < _.data("__version"))) 22 | } 23 | 24 | private lazy val authors: Map[String, Set[String]] = 25 | touchedFeatures 26 | .mapValues(_.last) 27 | .mapValues(_.data("__authors").split(",").toSet) 28 | 29 | private lazy val creation: Map[String, Long] = 30 | touchedFeatures 31 | .mapValues(_.head) 32 | .mapValues(_.data("__creation")) 33 | 34 | lazy val newFeatures: Seq[MVTFeature[Geometry]] = 35 | features 36 | .filter { 37 | case (id, (_, curr)) => 38 | versionInfo.get(id) match { 39 | case Some((_, _, prevTimestamp)) if curr.data.timestamp.after(prevTimestamp) => true 40 | case None => true 41 | case _ => false 42 | } 43 | } 44 | .values 45 | .filter { 46 | // filter out null geometries 47 | case (_, 
curr) => Option(curr.geom).isDefined && curr.isValid 48 | } 49 | .map { 50 | case (_, curr) => 51 | // NOTE: if this feature appears in the current tile for the first time, creation, authors, and minorVersions 52 | // will be incomplete (and therefore wrong) 53 | makeFeature( 54 | curr, 55 | creation 56 | .getOrElse(curr.data.elementId, curr.data.timestamp.getTime), 57 | authors 58 | .get(curr.data.elementId) 59 | .map(_ + curr.data.user) 60 | .getOrElse(Set(curr.data.user)), 61 | minorVersions.get(curr.data.elementId) 62 | ) 63 | } 64 | .filter(_.isDefined) 65 | .map(_.get) 66 | .toSeq 67 | 68 | private def makeFeature(feature: AugmentedDiffFeature, 69 | creation: Long, 70 | authors: Set[String], 71 | minorVersion: Option[Int]): Option[MVTFeature[Geometry]] = { 72 | val id = feature.data.id 73 | 74 | val elementId = feature.data.`type` match { 75 | case "node" => s"n$id" 76 | case "way" => s"w$id" 77 | case "relation" => s"r$id" 78 | case _ => id.toString 79 | } 80 | 81 | feature match { 82 | case _ if Option(feature.geom).isDefined && feature.geom.isValid => 83 | Some( 84 | MVTFeature( 85 | feature.geom, // when features are deleted, this will be the last geometry that was visible 86 | feature.data.tags.map { 87 | case (k, v) => (k, VString(v)) 88 | } ++ Map( 89 | "__id" -> VString(elementId), 90 | "__changeset" -> VInt64(feature.data.changeset), 91 | "__updated" -> VInt64(feature.data.timestamp.getTime), 92 | "__version" -> VInt64(feature.data.version), 93 | "__vtileGen" -> VInt64(System.currentTimeMillis), 94 | "__creation" -> VInt64(creation), 95 | "__authors" -> VString(authors.mkString(",")), 96 | "__lastAuthor" -> VString(feature.data.user) 97 | ) ++ minorVersion 98 | .map(v => Map("__minorVersion" -> VInt64(v))) 99 | .getOrElse(Map.empty[String, Value]) 100 | ) 101 | ) 102 | case _ => None 103 | } 104 | } 105 | } 106 | 107 | object Urchn extends SchemaBuilder { 108 | override val layerName: String = "history" 109 | 110 | def apply(layer: Layer, 111 | features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) = 112 | new Urchn(layer, features) 113 | } 114 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/schemas/package.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater 2 | 3 | package object schemas { 4 | val Schemas: Map[String, SchemaBuilder] = Map( 5 | "history" -> History, 6 | "snapshot" -> Snapshot, 7 | "urchn" -> Urchn 8 | ) 9 | } 10 | -------------------------------------------------------------------------------- /src/analytics/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.out 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss} %c{1}: %m%n 7 | 8 | log4j.logger.osmesa.analytics=INFO 9 | 10 | # Settings to quiet third party logs that are too verbose 11 | log4j.logger.org.eclipse.jetty=WARN 12 | log4j.logger.org.apache.spark=WARN 13 | log4j.logger.org.apache.hadoop=WARN 14 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN 15 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN 16 | 17 | log4j.logger.org.spark-project.jetty=WARN 18 | 
org.spark-project.jetty.LEVEL=WARN -------------------------------------------------------------------------------- /src/analytics/src/test/scala/osmesa/analytics/CountriesTest.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import org.locationtech.jts.geom.Coordinate 4 | import geotrellis.vector._ 5 | import geotrellis.vector.io._ 6 | import org.scalatest._ 7 | import spray.json._ 8 | 9 | import geotrellis.spark.util._ 10 | 11 | class CountriesTest extends FunSuite with Matchers { 12 | def time[T](msg: String)(f: => T) = { 13 | val start = System.currentTimeMillis 14 | val v = f 15 | val end = System.currentTimeMillis 16 | println(s"[TIMING] ${msg}: ${java.text.NumberFormat.getIntegerInstance.format(end - start)} ms") 17 | v 18 | } 19 | 20 | def write(path: String, txt: String): Unit = { 21 | import java.nio.file.{Paths, Files} 22 | import java.nio.charset.StandardCharsets 23 | 24 | Files.write(Paths.get(path), txt.getBytes(StandardCharsets.UTF_8)) 25 | } 26 | 27 | test("Generate some random points and see if they make sense") { 28 | val countries = Countries.all 29 | val rand = new scala.util.Random 30 | val points = 31 | countries.flatMap { mpf => 32 | val env = mpf.geom.envelope 33 | 34 | for(i <- 0 until 10) yield { 35 | val x = env.xmin + (rand.nextDouble * env.width) 36 | val y = env.ymin + (rand.nextDouble * env.height) 37 | new Coordinate(x, y) 38 | } 39 | } 40 | 41 | val l = { 42 | // Ensure that we can serialize the Lookup. 43 | val x = 44 | time("Creating CountryLookup") { new CountryLookup() } 45 | val s = KryoSerializer.serialize(x) 46 | KryoSerializer.deserialize[CountryLookup](s) 47 | } 48 | 49 | val pcs = 50 | Countries.all.map { mpf => 51 | (mpf.geom.prepare, mpf.data) 52 | } 53 | 54 | // Brute force lookup, without spatial index 55 | def bfLookup(coord: Coordinate): Option[CountryId] = 56 | pcs.find { case (pg, _) => pg.contains(Point(coord.x, coord.y)) }. 57 | map { case (_, data) => data } 58 | 59 | val actual = 60 | time("LOOKUP") { 61 | points. 62 | map { p => l.lookup(p).map { cid => PointFeature(Point(p.x, p.y), cid) } } 63 | } 64 | 65 | val expected = 66 | time("BRUTE FORCE LOOKUP") { 67 | points. 68 | map { p => 69 | bfLookup(p).map { cid => PointFeature(Point(p.x, p.y), cid) } 70 | } 71 | } 72 | 73 | val nodeIndex = 74 | time("Creating nodeIndex") { 75 | SpatialIndex(points) { p => (p.x, p.y) } 76 | } 77 | 78 | val nodeIndexed = 79 | time("NODE INDEX LOOKUP") { 80 | // Another way to do the spatial index, indexing the nodes instead of the countries. 81 | // This turns out to be slower than the lookup for large point sets. 82 | val result: Vector[Option[PointFeature[CountryId]]] = 83 | Countries.all. 84 | flatMap { mpf => 85 | val pg = mpf.geom.prepare 86 | nodeIndex.traversePointsInExtent(mpf.geom.envelope). 
87 | map { p => 88 | if(pg.covers(p)) { Some(PointFeature(Point(p.x, p.y), mpf.data)) } 89 | else { None } 90 | } 91 | } 92 | result 93 | } 94 | 95 | actual.flatten.length should be (expected.flatten.length) 96 | actual.flatten.length should be (nodeIndexed.flatten.length) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/apps/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "osmesa-apps" 4 | 5 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.6.7" 6 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.7" 7 | dependencyOverrides += "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.6.7" 8 | 9 | libraryDependencies ++= Seq( 10 | postgresql, 11 | decline, 12 | sparkHive % Provided, 13 | sparkJts, 14 | gtGeotools, 15 | gtS3, 16 | gtSpark, 17 | gtVector, 18 | gtVectorTile, 19 | vectorpipe, 20 | cats, 21 | scalactic, 22 | gtSparkTestKit, 23 | logging, 24 | scalatest, 25 | apacheCommonsEmail, 26 | ) 27 | 28 | /* Fixes Spark breakage with `sbt run` as of sbt-1.0.2 */ 29 | fork in run := true 30 | 31 | fork in Test := true 32 | 33 | test in assembly := {} 34 | 35 | javaOptions ++= Seq("-Xmx5G") 36 | 37 | initialCommands in console := 38 | """ 39 | """ 40 | 41 | assemblyJarName in assembly := "osmesa-apps.jar" 42 | 43 | assemblyShadeRules in assembly := { 44 | val shadePackage = "com.azavea.shaded.demo" 45 | Seq( 46 | ShadeRule.rename("com.google.common.**" -> s"$shadePackage.google.common.@1") 47 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-cassandra" % Version.geotrellis).inAll, 48 | ShadeRule.rename("io.netty.**" -> s"$shadePackage.io.netty.@1") 49 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-hbase" % Version.geotrellis).inAll, 50 | ShadeRule.rename("com.fasterxml.jackson.**" -> s"$shadePackage.com.fasterxml.jackson.@1") 51 | .inLibrary("com.networknt" % "json-schema-validator" % "0.1.7").inAll, 52 | ShadeRule.rename("org.apache.avro.**" -> s"$shadePackage.org.apache.avro.@1") 53 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-spark" % Version.geotrellis).inAll 54 | ) 55 | } 56 | 57 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 58 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/DbUtils.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps 2 | 3 | import java.net.URI 4 | import java.sql.Connection 5 | 6 | import vectorpipe.util.DBUtils 7 | 8 | object DbUtils { 9 | /** 10 | * Upsert a diff sequence number to a database, tied to a unique procName String 11 | * 12 | * Must be a PostgreSQL database 13 | * PostgreSQL database must contain table schema: 14 | * `checkpoints`: 15 | * - proc_name: String 16 | * - sequence: Int 17 | * 18 | * @param procName 19 | * @param sequence 20 | * @param databaseURI 21 | * @return 22 | */ 23 | def saveLocations(procName: String, sequence: Int, databaseURI: URI) = { 24 | var connection: Connection = null 25 | try { 26 | connection = DBUtils.getJdbcConnection(databaseURI) 27 | val upsertSequence = 28 | connection.prepareStatement( 29 | """ 30 | |INSERT INTO checkpoints (proc_name, sequence) 31 | |VALUES (?, ?) 32 | |ON CONFLICT (proc_name) 33 | |DO UPDATE SET sequence = ? 
34 | """.stripMargin 35 | ) 36 | upsertSequence.setString(1, procName) 37 | upsertSequence.setInt(2, sequence) 38 | upsertSequence.setInt(3, sequence) 39 | upsertSequence.execute() 40 | } finally { 41 | if (connection != null) connection.close() 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/batch/EditHistogramTileCreator.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.batch 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.functions._ 9 | import org.locationtech.geomesa.spark.jts._ 10 | import osmesa.analytics.{Analytics, EditHistogram} 11 | import vectorpipe.functions.asDouble 12 | 13 | object EditHistogramTileCreator 14 | extends CommandApp( 15 | name = "edit-histogram", 16 | header = "Create vector tiles containing histograms of editing activity", 17 | main = { 18 | 19 | val historyOpt = Opts 20 | .option[URI]("history", help = "URI of the history ORC file to process.") 21 | 22 | val outputOpt = Opts.option[URI]("out", help = "Base URI for output.") 23 | 24 | val concurrentUploadsOpt = Opts 25 | .option[Int]("concurrent-uploads", 26 | short = "c", 27 | metavar = "concurrent uploads", 28 | help = "Set the number of concurrent uploads.") 29 | .orNone 30 | 31 | val baseZoomOpt = Opts 32 | .option[Int]("base-zoom", 33 | short = "z", 34 | metavar = "Base zoom", 35 | help = "Most detailed zoom level") 36 | .orNone 37 | 38 | ( 39 | historyOpt, 40 | outputOpt, 41 | concurrentUploadsOpt, 42 | baseZoomOpt 43 | ).mapN { 44 | (historyURI, outputURI, _concurrentUploads, baseZoom) => 45 | implicit val spark: SparkSession = 46 | Analytics.sparkSession("State of the Data tile generation") 47 | import spark.implicits._ 48 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 49 | spark.withJTS 50 | 51 | val history = spark.read.orc(historyURI.toString) 52 | 53 | val nodes = history 54 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 55 | .withColumn("lat", asDouble('lat)) 56 | .withColumn("lon", asDouble('lon)) 57 | .where('uid > 1) 58 | .select(st_makePoint('lon, 'lat) as 'geom, 59 | year('timestamp) * 1000 + dayofyear('timestamp) as 'key) 60 | 61 | val stats = EditHistogram.create(nodes, 62 | outputURI, 63 | baseZoom.getOrElse(EditHistogram.DefaultBaseZoom)) 64 | println(s"${stats.count} tiles created.") 65 | 66 | spark.stop() 67 | } 68 | } 69 | ) 70 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/batch/MergeChangesets.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.batch 2 | 3 | import java.net.URI 4 | import java.sql._ 5 | import java.time.Instant 6 | 7 | import cats.data.{Validated, ValidatedNel} 8 | import cats.implicits._ 9 | import com.monovore.decline._ 10 | import io.circe._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.functions._ 13 | import org.joda.time.DateTime 14 | import org.joda.time.format.DateTimeFormat 15 | import osmesa.analytics.Analytics 16 | import vectorpipe.sources.{ChangesetSource, Source} 17 | import vectorpipe.util.DBUtils 18 | 19 | /* 20 | * Usage example: 21 | * 22 | * sbt "project apps" assembly 23 | * 24 | * spark-submit \ 25 | * --class osmesa.apps.batch.MergeChangesets \ 26 | * 
ingest/target/scala-2.11/osmesa-apps.jar \ 27 | * --changesets http://location/of/changeset/replications \ 28 | * --end-time 1970-01-01T13:00:00Z 29 | * s3://path/to/history.orc 30 | * s3://path/to/output.orc 31 | */ 32 | object MergeChangesets 33 | extends CommandApp( 34 | name = "osmesa-merge-changesets", 35 | header = "Bring existing changeset ORC file up to date using changeset stream", 36 | main = { 37 | 38 | import ChangesetSource._ 39 | import MergeChangesetsImplicits._ 40 | 41 | val changesetSourceOpt = 42 | Opts 43 | .option[URI]( 44 | "changesets", 45 | short = "c", 46 | metavar = "uri", 47 | help = "Location of replication changesets" 48 | ) 49 | .validate("Changeset source must have trailing '/'") { _.getPath.endsWith("/") } 50 | 51 | val endTimeOpt = 52 | Opts 53 | .option[Instant]("end-time", 54 | short = "e", 55 | metavar = "timestamp", 56 | help = "Timestamp of stream end (of the form 2016-02-29T13:45:00Z); if absent, the time now will be used") 57 | .orNone 58 | 59 | val orcArg = Opts 60 | .argument[URI]("source ORC") 61 | .validate("URI to ORC must have an s3 or file scheme") { _.getScheme != null } 62 | .validate("orc must be an S3 or file Uri") { uri => 63 | uri.getScheme.startsWith("s3") || uri.getScheme.startsWith("file") 64 | } 65 | .validate("orc must be an .orc file") { _.getPath.endsWith(".orc") } 66 | 67 | val outputArg = Opts.argument[URI]("destination ORC") 68 | .validate("Output URI must have a scheme") { _.getScheme != null } 69 | .validate("Output URI must have an S3 or file scheme") { uri => 70 | uri.getScheme.startsWith("s3") || uri.getScheme.startsWith("file") 71 | } 72 | .validate("orc must be an .orc file") { _.getPath.endsWith(".orc") } 73 | 74 | (changesetSourceOpt, 75 | endTimeOpt, 76 | orcArg, 77 | outputArg).mapN { 78 | (changesetSource, endTime, orcUri, outputURI) => 79 | implicit val spark: SparkSession = Analytics.sparkSession("MergeChangesets") 80 | 81 | import spark.implicits._ 82 | 83 | val df = spark.read.orc(orcUri.toString) 84 | val lastModified = df.select(max(coalesce('closedAt, 'createdAt))).first.getAs[Timestamp](0) 85 | 86 | val startSequence = findSequenceFor(lastModified.toInstant, changesetSource) 87 | val endSequence = endTime.map(findSequenceFor(_, changesetSource)).getOrElse(getCurrentSequence(changesetSource).get.sequence) 88 | 89 | val options = Map( 90 | Source.BaseURI -> changesetSource.toString, 91 | Source.StartSequence -> startSequence.toString, 92 | Source.EndSequence -> (endSequence + 1).toString // sequence range is (]; end sequence is exclusive 93 | ) 94 | 95 | val changesets = spark.read.format(Source.Changesets).options(options).load 96 | 97 | // TODO: Clean up the following by providing and using a function in VP to coerce the 98 | // column names into camel case (see https://github.com/geotrellis/vectorpipe/issues/113) 99 | changesets 100 | .drop("comments", "sequence") 101 | .union(df.select( 102 | 'id, 103 | 'tags, 104 | 'createdAt, 105 | 'open, 106 | 'closedAt, 107 | 'commentsCount, 108 | 'minLat, 109 | 'maxLat, 110 | 'minLon, 111 | 'maxLon, 112 | 'numChanges, 113 | 'uid, 114 | 'user) 115 | ) 116 | .repartition(1) 117 | .write 118 | .orc(outputURI.toString) 119 | 120 | spark.stop() 121 | } 122 | } 123 | ) 124 | object MergeChangesetsImplicits { 125 | implicit val readInstant: Argument[Instant] = new Argument[Instant] { 126 | override def read(string: String): ValidatedNel[String, Instant] = { 127 | try { Validated.valid(Instant.parse(string)) } 128 | catch { case e: Exception => Validated.invalidNel(s"Invalid 
time: $string (${ e.getMessage })") } 129 | } 130 | 131 | override def defaultMetavar: String = "time" 132 | } 133 | 134 | private val formatter = DateTimeFormat.forPattern("y-M-d H:m:s.SSSSSSSSS Z") 135 | 136 | private implicit val dateTimeDecoder: Decoder[DateTime] = 137 | Decoder.instance(a => a.as[String].map(DateTime.parse(_, formatter))) 138 | } 139 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/ChangeStreamProcessor.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import osmesa.analytics.Analytics 9 | import vectorpipe.sources.Source 10 | import vectorpipe.{internal => ProcessOSM} 11 | 12 | /* 13 | * Usage example: 14 | * 15 | * sbt "project apps" assembly 16 | * 17 | * # Running an infinite stream from the beginning of time 18 | * spark-submit \ 19 | * --class osmesa.apps.streaming.ChangeStreamProcessor \ 20 | * ./analytics/target/scala-2.11/osmesa-apps.jar \ 21 | * --start-sequence 1 22 | * 23 | * This class prints the change stream out to console for debugging 24 | */ 25 | object ChangeStreamProcessor 26 | extends CommandApp( 27 | name = "osmesa-diff-stream-processor", 28 | header = "display diffs from a change stream", 29 | main = { 30 | val changeSourceOpt = 31 | Opts 32 | .option[URI]( 33 | "change-source", 34 | short = "c", 35 | metavar = "uri", 36 | help = "Location of changes to process" 37 | ) 38 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 39 | 40 | val startSequenceOpt = 41 | Opts 42 | .option[Int]( 43 | "start-sequence", 44 | short = "s", 45 | metavar = "sequence", 46 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 47 | ) 48 | .orNone 49 | 50 | val endSequenceOpt = 51 | Opts 52 | .option[Int]( 53 | "end-sequence", 54 | short = "e", 55 | metavar = "sequence", 56 | help = "Ending sequence. If absent, this will be an infinite stream." 
57 | ) 58 | .orNone 59 | 60 | val databaseUriOpt = 61 | Opts 62 | .option[URI]( 63 | "database-url", 64 | short = "d", 65 | metavar = "database URL", 66 | help = "Database URL (default: $DATABASE_URL environment variable)" 67 | ) 68 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 69 | .orNone 70 | 71 | (changeSourceOpt, startSequenceOpt, endSequenceOpt, databaseUriOpt).mapN { 72 | (changeSource, startSequence, endSequence, databaseUri) => 73 | implicit val ss: SparkSession = 74 | Analytics.sparkSession("ChangeStreamProcessor") 75 | 76 | import ss.implicits._ 77 | 78 | val options = Map( 79 | Source.BaseURI -> changeSource.toString, 80 | Source.ProcessName -> "ChangeStream" 81 | ) ++ 82 | databaseUri 83 | .map(x => Map(Source.DatabaseURI -> x.toString)) 84 | .getOrElse(Map.empty[String, String]) ++ 85 | startSequence 86 | .map(s => Map(Source.StartSequence -> s.toString)) 87 | .getOrElse(Map.empty[String, String]) ++ 88 | endSequence 89 | .map(s => Map(Source.EndSequence -> s.toString)) 90 | .getOrElse(Map.empty[String, String]) 91 | 92 | val changes = 93 | ss.readStream 94 | .format(Source.Changes) 95 | .options(options) 96 | .load 97 | 98 | val changeProcessor = changes 99 | .select('id, 'version, 'lat, 'lon, 'visible) 100 | .where('_type === ProcessOSM.NodeType and !'visible) 101 | .writeStream 102 | .queryName("display change data") 103 | .format("console") 104 | .start 105 | 106 | changeProcessor.awaitTermination() 107 | 108 | ss.stop() 109 | } 110 | } 111 | ) 112 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/ChangesetMetadataUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.TaskContext 8 | import org.apache.spark.sql._ 9 | import osmesa.analytics.Analytics 10 | import osmesa.analytics.stats.ChangesetMetadataForeachWriter 11 | import vectorpipe.functions._ 12 | import vectorpipe.functions.osm._ 13 | import vectorpipe.sources.Source 14 | 15 | /* 16 | * Usage example: 17 | * 18 | * sbt "project apps" assembly 19 | * 20 | * spark-submit \ 21 | * --class osmesa.apps.streaming.ChangesetMetadataUpdater \ 22 | * ingest/target/scala-2.11/osmesa-apps.jar \ 23 | * --database-url $DATABASE_URL 24 | */ 25 | object ChangesetMetadataUpdater 26 | extends CommandApp( 27 | name = "osmesa-changeset-metadata-updater", 28 | header = "Update changeset metadata from changeset replication", 29 | main = { 30 | val changesetSourceOpt = 31 | Opts 32 | .option[URI]("changeset-source", 33 | short = "c", 34 | metavar = "uri", 35 | help = "Location of changesets to process") 36 | .withDefault(new URI("https://planet.osm.org/replication/changesets/")) 37 | 38 | val databaseUrlOpt = 39 | Opts 40 | .option[URI]( 41 | "database-url", 42 | short = "d", 43 | metavar = "database URL", 44 | help = "Database URL (default: $DATABASE_URL environment variable)" 45 | ) 46 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 47 | 48 | val startSequenceOpt = 49 | Opts 50 | .option[Int]( 51 | "start-sequence", 52 | short = "s", 53 | metavar = "sequence", 54 | help = "Starting sequence. If absent, the current (remote) sequence will be used."
55 | ) 56 | .orNone 57 | 58 | val endSequenceOpt = 59 | Opts 60 | .option[Int]( 61 | "end-sequence", 62 | short = "e", 63 | metavar = "sequence", 64 | help = "Ending sequence. If absent, this will be an infinite stream." 65 | ) 66 | .orNone 67 | 68 | val partitionCountOpt = Opts 69 | .option[Int]("partition-count", 70 | short = "p", 71 | metavar = "partition count", 72 | help = "Change partition count.") 73 | .orNone 74 | 75 | (changesetSourceOpt, databaseUrlOpt, startSequenceOpt, endSequenceOpt, partitionCountOpt) 76 | .mapN { 77 | (changesetSource, databaseUrl, startSequence, endSequence, partitionCount) => 78 | implicit val ss: SparkSession = Analytics.sparkSession("ChangesetMetadataUpdater") 79 | 80 | import ss.implicits._ 81 | 82 | val options = Map( 83 | Source.BaseURI -> changesetSource.toString, 84 | Source.ProcessName -> "ChangesetMetadataUpdater" 85 | ) ++ 86 | startSequence 87 | .map(s => Map(Source.StartSequence -> s.toString)) 88 | .getOrElse(Map.empty) ++ 89 | endSequence 90 | .map(s => Map(Source.EndSequence -> s.toString)) 91 | .getOrElse(Map.empty) ++ 92 | partitionCount 93 | .map(x => Map(Source.PartitionCount -> x.toString)) 94 | .getOrElse(Map.empty) 95 | 96 | val changesets = 97 | ss.read 98 | .format(Source.Changesets) 99 | .options(options) 100 | .load 101 | 102 | changesets 103 | .select( 104 | 'id, 105 | 'createdAt, 106 | 'closedAt, 107 | 'user, 108 | 'uid, 109 | 'tags.getField("created_by") as 'editor, 110 | merge_sets(hashtags('tags.getField("comment")), 111 | hashtags('tags.getField("hashtags"))) as 'hashtags 112 | ) 113 | .foreachPartition(rows => { 114 | val writer = 115 | new ChangesetMetadataForeachWriter(databaseUrl, shouldUpdateUsernames = true) 116 | 117 | if (writer.open(TaskContext.getPartitionId(), 0)) { 118 | try { 119 | rows.foreach(writer.process) 120 | 121 | writer.close(null) 122 | } catch { 123 | case e: Throwable => writer.close(e) 124 | } 125 | } 126 | }) 127 | 128 | ss.stop() 129 | } 130 | } 131 | ) 132 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/ChangesetStatsUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import geotrellis.vector.{Feature, Geometry} 8 | import org.apache.spark.TaskContext 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.functions._ 11 | import osmesa.analytics.Analytics 12 | import osmesa.analytics.stats._ 13 | import osmesa.analytics.stats.functions._ 14 | import vectorpipe.functions.{flatten => _, _} 15 | import vectorpipe.functions.osm.isTagged 16 | import vectorpipe.model.ElementWithSequence 17 | import vectorpipe.sources.Source 18 | import vectorpipe.util.Geocode 19 | 20 | /* 21 | * Usage example: 22 | * 23 | * sbt "project apps" assembly 24 | * 25 | * spark-submit \ 26 | * --class osmesa.apps.streaming.ChangesetStatsUpdater \ 27 | * ingest/target/scala-2.11/osmesa-apps.jar \ 28 | * --augmented-diff-source s3://somewhere/diffs/ \ 29 | * --database-url $DATABASE_URL 30 | */ 31 | object ChangesetStatsUpdater 32 | extends CommandApp( 33 | name = "osmesa-changeset-stats-updater", 34 | header = "Update statistics from augmented diffs", 35 | main = { 36 | type AugmentedDiffFeature = Feature[Geometry, ElementWithSequence] 37 | 38 | val augmentedDiffSourceOpt = 39 | Opts 40 | .option[URI]( 41 | "augmented-diff-source", 42 | short = "a", 43 | metavar 
= "uri", 44 | help = "Location of augmented diffs to process" 45 | ) 46 | 47 | val databaseUrlOpt = 48 | Opts 49 | .option[URI]( 50 | "database-url", 51 | short = "d", 52 | metavar = "database URL", 53 | help = "Database URL (default: $DATABASE_URL environment variable)" 54 | ) 55 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 56 | 57 | val startSequenceOpt = 58 | Opts 59 | .option[Int]( 60 | "start-sequence", 61 | short = "s", 62 | metavar = "sequence", 63 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 64 | ) 65 | .orNone 66 | 67 | val endSequenceOpt = 68 | Opts 69 | .option[Int]("end-sequence", 70 | short = "e", 71 | metavar = "sequence", 72 | help = "Ending sequence. If absent, this will be an infinite stream.") 73 | .orNone 74 | 75 | val partitionCountOpt = Opts 76 | .option[Int]("partition-count", 77 | short = "p", 78 | metavar = "partition count", 79 | help = "Change partition count.") 80 | .orNone 81 | 82 | (augmentedDiffSourceOpt, 83 | startSequenceOpt, 84 | endSequenceOpt, 85 | databaseUrlOpt, 86 | partitionCountOpt).mapN { 87 | (augmentedDiffSource, startSequence, endSequence, databaseUrl, partitionCount) => 88 | implicit val ss: SparkSession = Analytics.sparkSession("ChangesetStatsUpdater") 89 | 90 | import ss.implicits._ 91 | 92 | val options = Map( 93 | Source.BaseURI -> augmentedDiffSource.toString, 94 | Source.ProcessName -> "ChangesetStatsUpdater" 95 | ) ++ 96 | startSequence 97 | .map(s => Map(Source.StartSequence -> s.toString)) 98 | .getOrElse(Map.empty) ++ 99 | endSequence 100 | .map(s => Map(Source.EndSequence -> s.toString)) 101 | .getOrElse(Map.empty) ++ 102 | partitionCount 103 | .map(x => Map(Source.PartitionCount -> x.toString)) 104 | .getOrElse(Map.empty) 105 | 106 | val geoms = ss.read.format(Source.AugmentedDiffs).options(options).load 107 | 108 | Geocode(geoms.where(isTagged('tags))) 109 | .withLinearDelta 110 | .withAreaDelta 111 | .select( 112 | 'sequence, 113 | 'changeset, 114 | 'uid, 115 | 'user, 116 | 'countries, 117 | DefaultMeasurements, 118 | DefaultCounts 119 | ) 120 | .groupBy('sequence, 'changeset, 'uid, 'user) 121 | .agg( 122 | sum_measurements(collect_list('measurements)) as 'measurements, 123 | sum_counts(collect_list('counts)) as 'counts, 124 | count_values(flatten(collect_list('countries))) as 'countries 125 | ) 126 | .withColumn("totalEdits", sum_count_values('counts)) 127 | .foreachPartition(rows => { 128 | val writer = 129 | new ChangesetStatsForeachWriter(databaseUrl, shouldUpdateUsernames = true) 130 | 131 | if (writer.open(TaskContext.getPartitionId(), 0)) { 132 | try { 133 | rows.foreach(writer.process) 134 | 135 | writer.close(null) 136 | } catch { 137 | case e: Throwable => writer.close(e) 138 | } 139 | } 140 | }) 141 | 142 | ss.stop() 143 | } 144 | } 145 | ) 146 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/EditHistogramTileUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.io._ 4 | import java.net.URI 5 | 6 | import cats.implicits._ 7 | import com.monovore.decline._ 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.functions._ 10 | import org.locationtech.geomesa.spark.jts._ 11 | import osmesa.analytics.{Analytics, EditHistogram} 12 | import vectorpipe.sources.Source 13 | 14 | /* 15 | * Usage example: 16 | * 17 | * sbt "project apps" assembly 18 | * 19 | * spark-submit \ 20 
| * --class osmesa.apps.streaming.EditHistogramTileUpdater \ 21 | * ingest/target/scala-2.11/osmesa-apps.jar 22 | */ 23 | object EditHistogramTileUpdater 24 | extends CommandApp( 25 | name = "osmesa-edit-histogram-tile-updater", 26 | header = "Consume minutely diffs to update edit histogram MVTs", 27 | main = { 28 | val changeSourceOpt = Opts 29 | .option[URI]("source", 30 | short = "d", 31 | metavar = "uri", 32 | help = "Location of minutely diffs to process") 33 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 34 | 35 | val startSequenceOpt = Opts 36 | .option[Int]( 37 | "start-sequence", 38 | short = "s", 39 | metavar = "sequence", 40 | help = 41 | "Minutely diff starting sequence. If absent, the current (remote) sequence will be used.") 42 | .orNone 43 | 44 | val endSequenceOpt = Opts 45 | .option[Int]( 46 | "end-sequence", 47 | short = "e", 48 | metavar = "sequence", 49 | help = 50 | "Minutely diff ending sequence. If absent, the current (remote) sequence will be used.") 51 | .orNone 52 | 53 | val partitionCountOpt = Opts 54 | .option[Int]("partition-count", 55 | short = "p", 56 | metavar = "partition count", 57 | help = "Change partition count.") 58 | .orNone 59 | 60 | val tileSourceOpt = Opts 61 | .option[URI]( 62 | "tile-source", 63 | short = "t", 64 | metavar = "uri", 65 | help = "URI prefix of MVTs to update" 66 | ) 67 | .withDefault(new File("").toURI) 68 | 69 | val concurrentUploadsOpt = Opts 70 | .option[Int]("concurrent-uploads", 71 | short = "c", 72 | metavar = "concurrent uploads", 73 | help = "Set the number of concurrent uploads.") 74 | .orNone 75 | 76 | val baseZoomOpt = Opts 77 | .option[Int]("base-zoom", 78 | short = "z", 79 | metavar = "Base zoom", 80 | help = "Most detailed zoom level") 81 | .orNone 82 | 83 | (changeSourceOpt, 84 | startSequenceOpt, 85 | endSequenceOpt, 86 | partitionCountOpt, 87 | tileSourceOpt, 88 | concurrentUploadsOpt, 89 | baseZoomOpt).mapN { 90 | (changeSource, 91 | startSequence, 92 | endSequence, 93 | partitionCount, 94 | tileSource, 95 | _concurrentUploads, 96 | baseZoom) => 97 | val AppName = "EditHistogramTileUpdater" 98 | 99 | val spark: SparkSession = Analytics.sparkSession(AppName) 100 | import spark.implicits._ 101 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 102 | spark.withJTS 103 | 104 | val changeOptions = Map(Source.BaseURI -> changeSource.toString) ++ 105 | startSequence 106 | .map(x => Map(Source.StartSequence -> x.toString)) 107 | .getOrElse(Map.empty) ++ 108 | endSequence 109 | .map(x => Map(Source.EndSequence -> x.toString)) 110 | .getOrElse(Map.empty) ++ 111 | partitionCount 112 | .map(x => Map(Source.PartitionCount -> x.toString)) 113 | .getOrElse(Map.empty) 114 | 115 | val changes = spark.read 116 | .format(Source.Changes) 117 | .options(changeOptions) 118 | .load 119 | 120 | val changedNodes = changes 121 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 122 | .select('sequence, 123 | st_makePoint('lon, 'lat) as 'geom, 124 | year('timestamp) * 1000 + dayofyear('timestamp) as 'key) 125 | 126 | val tiledNodes = EditHistogram.update(changedNodes, 127 | tileSource, 128 | baseZoom.getOrElse(EditHistogram.DefaultBaseZoom)) 129 | 130 | val lastSequence = 131 | changedNodes.select(max('sequence) as 'sequence).first.getAs[Int]("sequence") 132 | 133 | println(s"${tiledNodes.count} tiles updated to ${lastSequence}.") 134 | } 135 | } 136 | ) 137 | -------------------------------------------------------------------------------- 
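The batch updater above and the streaming updaters that follow all assemble their vectorpipe reader options the same way: a base map of required settings, with optional CLI values folded in only when present. A minimal sketch of that pattern, assuming stand-in key names (the real constants come from vectorpipe.sources.Source; SourceOptionsSketch is illustrative, not part of this repository):

// Illustrative only: stand-in keys; not project code.
object SourceOptionsSketch {
  val BaseURI = "base_uri"
  val StartSequence = "start_sequence"
  val EndSequence = "end_sequence"

  /** Optional CLI values are added to the options map only when they were supplied. */
  def options(baseUri: String, start: Option[Int], end: Option[Int]): Map[String, String] =
    Map(BaseURI -> baseUri) ++
      start.map(s => Map(StartSequence -> s.toString)).getOrElse(Map.empty[String, String]) ++
      end.map(e => Map(EndSequence -> e.toString)).getOrElse(Map.empty[String, String])

  def main(args: Array[String]): Unit =
    // prints: Map(base_uri -> https://planet.osm.org/replication/minute/, start_sequence -> 1)
    println(options("https://planet.osm.org/replication/minute/", Some(1), None))
}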
/src/apps/src/main/scala/osmesa/apps/streaming/StreamingChangesetMetadataUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import osmesa.analytics.Analytics 9 | import osmesa.analytics.stats.ChangesetMetadataForeachWriter 10 | import vectorpipe.functions._ 11 | import vectorpipe.functions.osm._ 12 | import vectorpipe.sources.Source 13 | 14 | /* 15 | * Usage example: 16 | * 17 | * sbt "project apps" assembly 18 | * 19 | * spark-submit \ 20 | * --class osmesa.apps.streaming.StreamingChangesetMetadataUpdater \ 21 | * ingest/target/scala-2.11/osmesa-apps.jar \ 22 | * --database-url $DATABASE_URL 23 | */ 24 | object StreamingChangesetMetadataUpdater 25 | extends CommandApp( 26 | name = "osmesa-changeset-stream-processor", 27 | header = "Update statistics from changeset replication stream", 28 | main = { 29 | val changesetSourceOpt = 30 | Opts 31 | .option[URI]("changeset-source", 32 | short = "c", 33 | metavar = "uri", 34 | help = "Location of changesets to process") 35 | .withDefault(new URI("https://planet.osm.org/replication/changesets/")) 36 | 37 | val databaseUrlOpt = 38 | Opts 39 | .option[URI]( 40 | "database-url", 41 | short = "d", 42 | metavar = "database URL", 43 | help = "Database URL (default: $DATABASE_URL environment variable)" 44 | ) 45 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 46 | 47 | val startSequenceOpt = 48 | Opts 49 | .option[Int]( 50 | "start-sequence", 51 | short = "s", 52 | metavar = "sequence", 53 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 54 | ) 55 | .orNone 56 | 57 | val endSequenceOpt = 58 | Opts 59 | .option[Int]( 60 | "end-sequence", 61 | short = "e", 62 | metavar = "sequence", 63 | help = "Ending sequence. If absent, this will be an infinite stream." 
64 | ) 65 | .orNone 66 | 67 | val batchSizeOpt = Opts 68 | .option[Int]("batch-size", 69 | short = "b", 70 | metavar = "batch size", 71 | help = "Change batch size.") 72 | .orNone 73 | 74 | (changesetSourceOpt, databaseUrlOpt, startSequenceOpt, endSequenceOpt, batchSizeOpt).mapN { 75 | (changesetSource, databaseUrl, startSequence, endSequence, batchSize) => 76 | implicit val ss: SparkSession = 77 | Analytics.sparkSession("StreamingChangesetMetadataUpdater") 78 | 79 | import ss.implicits._ 80 | 81 | val options = Map( 82 | Source.BaseURI -> changesetSource.toString, 83 | Source.DatabaseURI -> databaseUrl.toString, 84 | Source.ProcessName -> "ChangesetMetadataUpdater" 85 | ) ++ 86 | startSequence 87 | .map(s => Map(Source.StartSequence -> s.toString)) 88 | .getOrElse(Map.empty) ++ 89 | endSequence 90 | .map(s => Map(Source.EndSequence -> s.toString)) 91 | .getOrElse(Map.empty) ++ 92 | batchSize 93 | .map(x => Map(Source.BatchSize -> x.toString)) 94 | .getOrElse(Map.empty) 95 | 96 | val changesets = 97 | ss.readStream 98 | .format(Source.Changesets) 99 | .options(options) 100 | .load 101 | 102 | val changesetProcessor = changesets 103 | .select( 104 | 'id, 105 | 'createdAt, 106 | 'closedAt, 107 | 'user, 108 | 'uid, 109 | 'tags.getField("created_by") as 'editor, 110 | merge_sets(hashtags('tags.getField("comment")), 111 | hashtags('tags.getField("hashtags"))) as 'hashtags 112 | ) 113 | .writeStream 114 | .queryName("update changeset metadata") 115 | .foreach(new ChangesetMetadataForeachWriter(databaseUrl, 116 | shouldUpdateUsernames = true)) 117 | .start 118 | 119 | changesetProcessor.awaitTermination() 120 | 121 | ss.stop() 122 | } 123 | } 124 | ) 125 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/StreamingEditHistogramTileUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.io._ 4 | import java.net.URI 5 | 6 | import cats.implicits._ 7 | import com.monovore.decline._ 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.functions._ 10 | import org.locationtech.geomesa.spark.jts._ 11 | import osmesa.analytics.{Analytics, EditHistogram} 12 | import vectorpipe.sources.Source 13 | 14 | /* 15 | * Usage example: 16 | * 17 | * sbt "project apps" assembly 18 | * 19 | * spark-submit \ 20 | * --class osmesa.apps.streaming.StreamingEditHistogramTileUpdater \ 21 | * ingest/target/scala-2.11/osmesa-apps.jar 22 | */ 23 | object StreamingEditHistogramTileUpdater 24 | extends CommandApp( 25 | name = "osmesa-edit-histogram-tile-updater", 26 | header = "Consume minutely diffs to update edit histogram MVTs", 27 | main = { 28 | val changeSourceOpt = Opts 29 | .option[URI]("source", 30 | short = "d", 31 | metavar = "uri", 32 | help = "Location of minutely diffs to process") 33 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 34 | 35 | val startSequenceOpt = Opts 36 | .option[Int]( 37 | "start-sequence", 38 | short = "s", 39 | metavar = "sequence", 40 | help = 41 | "Minutely diff starting sequence. 
If absent, the current (remote) sequence will be used.") 42 | .orNone 43 | 44 | val batchSizeOpt = Opts 45 | .option[Int]("batch-size", 46 | short = "b", 47 | metavar = "batch size", 48 | help = "Change batch size.") 49 | .orNone 50 | 51 | val tileSourceOpt = Opts 52 | .option[URI]( 53 | "tile-source", 54 | short = "t", 55 | metavar = "uri", 56 | help = "URI prefix of MVTs to update" 57 | ) 58 | .withDefault(new File("").toURI) 59 | 60 | val concurrentUploadsOpt = Opts 61 | .option[Int]("concurrent-uploads", 62 | short = "c", 63 | metavar = "concurrent uploads", 64 | help = "Set the number of concurrent uploads.") 65 | .orNone 66 | 67 | val databaseUrlOpt = 68 | Opts 69 | .option[URI]( 70 | "database-url", 71 | short = "d", 72 | metavar = "database URL", 73 | help = "Database URL (default: DATABASE_URL environment variable)" 74 | ) 75 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 76 | .orNone 77 | 78 | val baseZoomOpt = Opts 79 | .option[Int]("base-zoom", 80 | short = "z", 81 | metavar = "Base zoom", 82 | help = "Most detailed zoom level") 83 | .orNone 84 | 85 | (changeSourceOpt, 86 | startSequenceOpt, 87 | batchSizeOpt, 88 | tileSourceOpt, 89 | concurrentUploadsOpt, 90 | databaseUrlOpt, 91 | baseZoomOpt).mapN { 92 | (changeSource, 93 | startSequence, 94 | batchSize, 95 | tileSource, 96 | _concurrentUploads, 97 | databaseUrl, 98 | baseZoom) => 99 | val AppName = "EditHistogramTileUpdater" 100 | 101 | val spark: SparkSession = Analytics.sparkSession(AppName) 102 | import spark.implicits._ 103 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 104 | spark.withJTS 105 | 106 | val changeOptions = Map(Source.BaseURI -> changeSource.toString, 107 | Source.ProcessName -> AppName) ++ 108 | databaseUrl 109 | .map(x => Map(Source.DatabaseURI -> x.toString)) 110 | .getOrElse(Map.empty) ++ 111 | startSequence 112 | .map(x => Map(Source.StartSequence -> x.toString)) 113 | .getOrElse(Map.empty) ++ 114 | batchSize 115 | .map(x => Map(Source.BatchSize -> x.toString)) 116 | .getOrElse(Map.empty) 117 | 118 | val changes = spark.readStream 119 | .format(Source.Changes) 120 | .options(changeOptions) 121 | .load 122 | 123 | val changedNodes = changes 124 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 125 | .select('sequence, 126 | year('timestamp) * 1000 + dayofyear('timestamp) as 'key, 127 | st_makePoint('lon, 'lat) as 'geom) 128 | 129 | val tiledNodes = EditHistogram.update(changedNodes, 130 | tileSource, 131 | baseZoom.getOrElse(EditHistogram.DefaultBaseZoom)) 132 | 133 | val query = tiledNodes.writeStream 134 | .queryName("edit histogram tiles") 135 | .format("console") 136 | .start 137 | 138 | query.awaitTermination() 139 | 140 | spark.stop() 141 | } 142 | } 143 | ) 144 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/StreamingUserFootprintTileUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.io._ 4 | import java.net.URI 5 | 6 | import cats.implicits._ 7 | import com.monovore.decline._ 8 | import org.apache.spark.sql._ 9 | import org.locationtech.geomesa.spark.jts._ 10 | import osmesa.analytics.{Analytics, Footprints} 11 | import vectorpipe.sources.Source 12 | 13 | /* 14 | * Usage example: 15 | * 16 | * sbt "project apps" assembly 17 | * 18 | * spark-submit \ 19 | * --class osmesa.apps.streaming.StreamingUserFootprintUpdater \ 20 | * 
ingest/target/scala-2.11/osmesa-apps.jar 21 | */ 22 | object StreamingUserFootprintTileUpdater 23 | extends CommandApp( 24 | name = "osmesa-user-footprint-updater", 25 | header = "Consume minutely diffs to update user footprint MVTs", 26 | main = { 27 | val changeSourceOpt = Opts 28 | .option[URI]("change-source", 29 | short = "d", 30 | metavar = "uri", 31 | help = "Location of minutely diffs to process") 32 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 33 | 34 | val startSequenceOpt = Opts 35 | .option[Int]( 36 | "start-sequence", 37 | short = "s", 38 | metavar = "sequence", 39 | help = 40 | "Minutely diff starting sequence. If absent, the current (remote) sequence will be used.") 41 | .orNone 42 | 43 | val batchSizeOpt = Opts 44 | .option[Int]("batch-size", 45 | short = "b", 46 | metavar = "batch size", 47 | help = "Change batch size.") 48 | .orNone 49 | 50 | val tileSourceOpt = Opts 51 | .option[URI]( 52 | "tile-source", 53 | short = "t", 54 | metavar = "uri", 55 | help = "URI prefix for vector tiles to update" 56 | ) 57 | .withDefault(new File("").toURI) 58 | 59 | val concurrentUploadsOpt = Opts 60 | .option[Int]("concurrent-uploads", 61 | short = "c", 62 | metavar = "concurrent uploads", 63 | help = "Set the number of concurrent uploads.") 64 | .orNone 65 | 66 | val databaseUrlOpt = 67 | Opts 68 | .option[URI]( 69 | "database-url", 70 | short = "d", 71 | metavar = "database URL", 72 | help = "Database URL (default: DATABASE_URL environment variable)" 73 | ) 74 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 75 | .orNone 76 | 77 | (changeSourceOpt, 78 | startSequenceOpt, 79 | batchSizeOpt, 80 | tileSourceOpt, 81 | concurrentUploadsOpt, 82 | databaseUrlOpt).mapN { 83 | (changeSource, startSequence, batchSize, tileSource, _concurrentUploads, databaseUrl) => 84 | val AppName = "UserFootprintUpdater" 85 | 86 | val spark: SparkSession = Analytics.sparkSession(AppName) 87 | import spark.implicits._ 88 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 89 | spark.withJTS 90 | 91 | val changeOptions = Map(Source.BaseURI -> changeSource.toString, 92 | Source.ProcessName -> AppName) ++ 93 | databaseUrl 94 | .map(x => Map(Source.DatabaseURI -> x.toString)) 95 | .getOrElse(Map.empty) ++ 96 | startSequence 97 | .map(x => Map(Source.StartSequence -> x.toString)) 98 | .getOrElse(Map.empty) ++ 99 | batchSize 100 | .map(x => Map(Source.BatchSize -> x.toString)) 101 | .getOrElse(Map.empty) 102 | 103 | val changes = spark.readStream 104 | .format(Source.Changes) 105 | .options(changeOptions) 106 | .load 107 | 108 | val changedNodes = changes 109 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 110 | .select('sequence, 'uid as 'key, st_makePoint('lon, 'lat) as 'geom) 111 | 112 | val tiledNodes = 113 | Footprints.update(changedNodes, tileSource) 114 | 115 | val query = tiledNodes.writeStream 116 | .queryName("tiled user footprints") 117 | .format("console") 118 | .start 119 | 120 | query.awaitTermination() 121 | 122 | spark.stop() 123 | } 124 | } 125 | ) 126 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/UserFootprintUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.io._ 4 | import java.net.URI 5 | 6 | import cats.implicits._ 7 | import com.monovore.decline._ 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.functions._ 10 | 
import org.locationtech.geomesa.spark.jts._ 11 | import osmesa.analytics.{Analytics, Footprints} 12 | import vectorpipe.sources.Source 13 | 14 | /* 15 | * Usage example: 16 | * 17 | * sbt "project apps" assembly 18 | * 19 | * spark-submit \ 20 | * --class osmesa.apps.streaming.UserFootprintUpdater \ 21 | * ingest/target/scala-2.11/osmesa-apps.jar 22 | */ 23 | object UserFootprintUpdater 24 | extends CommandApp( 25 | name = "osmesa-user-footprint-updater", 26 | header = "Consume minutely diffs to update user footprint MVTs", 27 | main = { 28 | val changeSourceOpt = Opts 29 | .option[URI]("change-source", 30 | short = "d", 31 | metavar = "uri", 32 | help = "Location of minutely diffs to process") 33 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 34 | 35 | val startSequenceOpt = Opts 36 | .option[Int]( 37 | "start-sequence", 38 | short = "s", 39 | metavar = "sequence", 40 | help = 41 | "Minutely diff starting sequence. If absent, the current (remote) sequence will be used.") 42 | .orNone 43 | 44 | val endSequenceOpt = Opts 45 | .option[Int]( 46 | "end-sequence", 47 | short = "e", 48 | metavar = "sequence", 49 | help = 50 | "Minutely diff ending sequence. If absent, the current (remote) sequence will be used.") 51 | .orNone 52 | 53 | val partitionCountOpt = Opts 54 | .option[Int]("partition-count", 55 | short = "p", 56 | metavar = "partition count", 57 | help = "Change partition count.") 58 | .orNone 59 | 60 | val tileSourceOpt = Opts 61 | .option[URI]( 62 | "tile-source", 63 | short = "t", 64 | metavar = "uri", 65 | help = "URI prefix for vector tiles to update" 66 | ) 67 | .withDefault(new File("").toURI) 68 | 69 | val concurrentUploadsOpt = Opts 70 | .option[Int]("concurrent-uploads", 71 | short = "c", 72 | metavar = "concurrent uploads", 73 | help = "Set the number of concurrent uploads.") 74 | .orNone 75 | 76 | (changeSourceOpt, 77 | startSequenceOpt, 78 | endSequenceOpt, 79 | partitionCountOpt, 80 | tileSourceOpt, 81 | concurrentUploadsOpt).mapN { 82 | (changeSource, 83 | startSequence, 84 | endSequence, 85 | partitionCount, 86 | tileSource, 87 | _concurrentUploads) => 88 | val AppName = "UserFootprintUpdater" 89 | 90 | val spark: SparkSession = Analytics.sparkSession(AppName) 91 | import spark.implicits._ 92 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 93 | spark.withJTS 94 | 95 | val changeOptions = Map(Source.BaseURI -> changeSource.toString) ++ 96 | startSequence 97 | .map(x => Map(Source.StartSequence -> x.toString)) 98 | .getOrElse(Map.empty) ++ 99 | endSequence 100 | .map(x => Map(Source.EndSequence -> x.toString)) 101 | .getOrElse(Map.empty) ++ 102 | partitionCount 103 | .map(x => Map(Source.PartitionCount -> x.toString)) 104 | .getOrElse(Map.empty) 105 | 106 | val changes = spark.read 107 | .format(Source.Changes) 108 | .options(changeOptions) 109 | .load 110 | 111 | val changedNodes = changes 112 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 113 | .select('sequence, 'uid as 'key, st_makePoint('lon, 'lat) as 'geom) 114 | 115 | val tiledNodes = 116 | Footprints.update(changedNodes, tileSource) 117 | 118 | val lastSequence = 119 | changedNodes.select(max('sequence) as 'sequence).first.getAs[Int]("sequence") 120 | 121 | println(s"${tiledNodes.count} tiles updated to ${lastSequence}.") 122 | } 123 | } 124 | ) 125 | -------------------------------------------------------------------------------- /src/bench/src/main/scala/osmesa/Bench.scala: -------------------------------------------------------------------------------- 
1 | // package osmesa 2 | 3 | // import java.util.concurrent.TimeUnit 4 | 5 | // import scala.util.Try 6 | 7 | // import cats.implicits._ 8 | // import org.apache.log4j 9 | // import org.apache.spark._ 10 | // import org.apache.spark.sql._ 11 | // import org.openjdk.jmh.annotations._ 12 | // import osmesa.analytics.oneoffs.Analysis 13 | 14 | // // --- // 15 | 16 | // @BenchmarkMode(Array(Mode.AverageTime)) 17 | // @OutputTimeUnit(TimeUnit.SECONDS) 18 | // @State(Scope.Thread) 19 | // class Bench { 20 | 21 | // var conf: SparkConf = _ 22 | // implicit var ss: SparkSession = _ 23 | 24 | // @Setup 25 | // def setup: Unit = { 26 | // conf = new SparkConf() 27 | // .setIfMissing("spark.master", "local[*]") 28 | // .setAppName("road-changes") 29 | // .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 30 | // .set("spark.kryo.registrator", classOf[geotrellis.spark.io.kryo.KryoRegistrator].getName) 31 | 32 | // ss = SparkSession.builder.config(conf).enableHiveSupport.getOrCreate 33 | 34 | // /* Silence the damn INFO logger */ 35 | // log4j.Logger.getRootLogger().setLevel(log4j.Level.ERROR) 36 | // } 37 | 38 | // @TearDown 39 | // def close: Unit = ss.stop() 40 | 41 | // @Benchmark 42 | // def roads: Try[Double] = { 43 | // val path: String = "/home/colin/code/azavea/vectorpipe/data/isle-of-man.orc" 44 | 45 | // (Try(ss.read.orc(path)) >>= Analysis.newRoadsByUser).map(_.aggregate(0d)({ _ + _._2 }, { _ + _ })) 46 | // } 47 | 48 | // } 49 | -------------------------------------------------------------------------------- /src/bench/src/main/scala/osmesa/MetresBench.scala: -------------------------------------------------------------------------------- 1 | package osmesa 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import geotrellis.vector.{Point, Line} 6 | import geotrellis.util.Haversine 7 | import org.openjdk.jmh.annotations._ 8 | 9 | // --- // 10 | 11 | @BenchmarkMode(Array(Mode.AverageTime)) 12 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 13 | @State(Scope.Thread) 14 | class MetresBench { 15 | 16 | var line0: Line = _ 17 | var line1: Line = _ 18 | 19 | @Setup 20 | def setup: Unit = { 21 | line0 = Line((0 to 9).map(n => Point(n, n))) 22 | line1 = Line((0 to 90).map(n => Point(n, n))) 23 | } 24 | 25 | def iterator(line: Line): Double = { 26 | val ps: List[Point] = line.points.toList 27 | val pairs: Iterator[(Point, Point)] = ps.iterator.zip(ps.tail.iterator) 28 | 29 | pairs.foldLeft(0d) { case (acc, (p,c)) => acc + Haversine(p.x, p.y, c.x, c.y) } 30 | } 31 | 32 | def manual(line: Line): Double = { 33 | val geom = line.jtsGeom 34 | 35 | (0 until (geom.getNumPoints - 1)).map { i => 36 | val p = geom.getPointN(i) 37 | val c = geom.getPointN(i + 1) 38 | 39 | Haversine(p.getX, p.getY, c.getX, c.getY) 40 | } reduce (_ + _) 41 | } 42 | 43 | def whiley(line: Line): Double = { 44 | val geom = line.jtsGeom 45 | var i: Int = 0 46 | var r: Double = 0 47 | 48 | while (i < geom.getNumPoints - 1) { 49 | val p = geom.getPointN(i) 50 | val c = geom.getPointN(i + 1) 51 | 52 | r += Haversine(p.getX, p.getY, c.getX, c.getY) 53 | i += 1 54 | } 55 | 56 | r 57 | } 58 | 59 | def sliding(line: Line): Double = { 60 | val geom = line.jtsGeom 61 | 62 | line.points.sliding(2) 63 | .map(pair => Haversine(pair.head.x, pair.head.y, pair.last.x, pair.last.y)) 64 | .foldLeft(0d) { _ + _ } 65 | } 66 | 67 | @Benchmark 68 | def iterator10: Double = iterator(line0) 69 | @Benchmark 70 | def iterator100: Double = iterator(line1) 71 | 72 | @Benchmark 73 | def manual10: Double = manual(line0) 74 | @Benchmark 75 | def 
manual100: Double = manual(line1) 76 | 77 | @Benchmark 78 | def while10: Double = whiley(line0) 79 | @Benchmark 80 | def while100: Double = whiley(line1) 81 | 82 | @Benchmark 83 | def sliding10: Double = sliding(line0) 84 | @Benchmark 85 | def sliding100: Double = sliding(line1) 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/bench/src/main/scala/osmesa/SAXBench.scala: -------------------------------------------------------------------------------- 1 | package osmesa 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import org.apache.commons.io.IOUtils 6 | import org.openjdk.jmh.annotations._ 7 | import vectorpipe.model.{Actions, Change} 8 | 9 | import java.util.zip.GZIPInputStream 10 | import javax.xml.parsers.{SAXParser, SAXParserFactory} 11 | import scala.xml.XML 12 | 13 | // --- // 14 | 15 | @BenchmarkMode(Array(Mode.AverageTime)) 16 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 17 | @State(Scope.Thread) 18 | class SAXBench { 19 | 20 | val sequence = 0 21 | 22 | @Setup 23 | def setup: Unit = { 24 | } 25 | 26 | def gzipInputStream(): GZIPInputStream = { 27 | // requires the addition of a gzipped OSC file in bench/src/main/resources 28 | val stream = getClass.getResourceAsStream("/942.osc.gz") 29 | new GZIPInputStream(stream) 30 | } 31 | 32 | def withScalaXML(): Int = { 33 | // requires Change.fromXML (see commit 1b04a1e81f1a88f374a086c98d58677ec537b1bf) 34 | val data = XML.loadString(IOUtils.toString(gzipInputStream)) 35 | 36 | val changes = (data \ "_").flatMap { node => 37 | (node \ "_").map(Change.fromXML(_, Actions.fromString(node.label), sequence)) 38 | } 39 | 40 | changes.length 41 | } 42 | 43 | def withSAXParser(): Int = { 44 | val factory = SAXParserFactory.newInstance 45 | val parser = factory.newSAXParser 46 | val handler = new Change.ChangeHandler(sequence) 47 | parser.parse(gzipInputStream(), handler) 48 | handler.changeSeq.length 49 | } 50 | 51 | @Benchmark 52 | def useScala: Double = withScalaXML() 53 | @Benchmark 54 | def getSAXyGirl: Double = withSAXParser() 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/bm/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "bm-standalone" 4 | 5 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.6.7" 6 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.7" 7 | dependencyOverrides += "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.6.7" 8 | 9 | def excludeVP(module: ModuleID): ModuleID = 10 | module.excludeAll(ExclusionRule("com.azavea", "vectorpipe")) 11 | 12 | libraryDependencies ~= (_.map(excludeVP)) 13 | 14 | libraryDependencies ++= Seq( 15 | decline, 16 | sparkHive % "provided", 17 | "com.google.protobuf" % "protobuf-java" % "2.5.0", 18 | cats, 19 | gtS3, 20 | gtSparkTestKit, 21 | logging, 22 | scalatest, 23 | "com.azavea" %% "vectorpipe" % "0.2.2", 24 | "org.jblas" % "jblas" % "1.2.4" 25 | ) 26 | 27 | /* Fixes Spark breakage with `sbt run` as of sbt-1.0.2 */ 28 | fork in run := true 29 | 30 | fork in Test := true 31 | 32 | test in assembly := {} 33 | 34 | javaOptions ++= Seq("-Xmx5G") 35 | 36 | initialCommands in console := 37 | """ 38 | """ 39 | 40 | assemblyJarName in assembly := "bm-standalone.jar" 41 | 42 | assemblyShadeRules in assembly := { 43 | val shadePackage = "com.azavea.shaded.demo" 44 | Seq( 45 | ShadeRule.rename("com.google.common.**" -> 
s"$shadePackage.google.common.@1") 46 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-cassandra" % Version.geotrellis).inAll, 47 | ShadeRule.rename("io.netty.**" -> s"$shadePackage.io.netty.@1") 48 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-hbase" % Version.geotrellis).inAll, 49 | ShadeRule.rename("com.fasterxml.jackson.**" -> s"$shadePackage.com.fasterxml.jackson.@1") 50 | .inLibrary("com.networknt" % "json-schema-validator" % "0.1.7").inAll, 51 | ShadeRule.rename("org.apache.avro.**" -> s"$shadePackage.org.apache.avro.@1") 52 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-spark" % Version.geotrellis).inAll 53 | ) 54 | } 55 | 56 | assemblyMergeStrategy in assembly := { 57 | case s if s.startsWith("META-INF/services") => MergeStrategy.concat 58 | case "reference.conf" | "application.conf" => MergeStrategy.concat 59 | case "META-INF/MANIFEST.MF" | "META-INF\\MANIFEST.MF" => MergeStrategy.discard 60 | case "META-INF/ECLIPSEF.RSA" | "META-INF/ECLIPSEF.SF" => MergeStrategy.discard 61 | case _ => MergeStrategy.first 62 | } 63 | 64 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 65 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/Downsample.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | 5 | import org.apache.spark.rdd.RDD 6 | 7 | import vectorpipe.osm._ 8 | 9 | import monocle.macros.GenLens 10 | import com.vividsolutions.jts.algorithm.Centroid 11 | 12 | 13 | object Downsample { 14 | 15 | val tags = GenLens[vectorpipe.osm.ElementMeta](_.tags) 16 | 17 | def transmute(rdd: RDD[OSMFeature]) = { 18 | rdd.map({ f => 19 | val geom = { 20 | val _geom = Centroid.getCentroid(f.geom.jtsGeom) 21 | Point(_geom.x, _geom.y) 22 | } 23 | val data = tags.set(f.data.tags + ("multiplicity" -> 1.toString))(f.data) 24 | new OSMFeature(geom, data) 25 | }) 26 | } 27 | 28 | private def getAddress(f: OSMFeature, zoom: Int): (Double, Double) = { 29 | f.geom match { 30 | case p: Point => 31 | val u: Double = (p.x + 180.0)/360.0 32 | val v: Double = (p.y + 90.0)/180.0 33 | val x: Long = java.lang.Double.doubleToRawLongBits(u) >> (48-zoom) 34 | val y: Long = java.lang.Double.doubleToRawLongBits(v) >> (48-zoom) 35 | (x, y) 36 | case _ => throw new Exception 37 | } 38 | } 39 | 40 | def apply(rdd: RDD[OSMFeature], zoom: Int) = { 41 | rdd 42 | .map({ f => (getAddress(f, zoom), f) }) 43 | .reduceByKey({ case (f1: OSMFeature, f2: OSMFeature) => 44 | val mult1 = f1.data.tags.getOrElse("multiplicity", throw new Exception).toInt 45 | val mult2 = f2.data.tags.getOrElse("multiplicity", throw new Exception).toInt 46 | val geom = f1.geom 47 | val data = tags.set(f1.data.tags + ("multiplicity" -> (mult1+mult2).toString))(f1.data) 48 | new OSMFeature(geom, data) 49 | }) 50 | .values 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/Homography.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | 5 | import vectorpipe.osm._ 6 | 7 | import org.jblas.{DoubleMatrix, Eigen, Singular} 8 | 9 | 10 | object Homography { 11 | 12 | private def pairToRows( 13 | a: Point, b: Point, 14 | xbar: Double, ybar: Double, 15 | maxabsx: Double, maxabsy: Double 16 | ) = { 17 | val x: Double = (a.x - xbar) / maxabsx 18 | val y: Double = (a.y - ybar) / 
maxabsy 19 | val u: Double = (b.x - xbar) / maxabsx 20 | val v: Double = (b.y - ybar) / maxabsy 21 | 22 | Array( 23 | (new DoubleMatrix(Array(-x, -y, -1.0, 0.0, 0.0, 0.0, u*x, u*y, u))).transpose, 24 | (new DoubleMatrix(Array(0.0, 0.0, 0.0, -x, -y, -1.0, v*x, v*y, v))).transpose 25 | ) 26 | } 27 | 28 | def dlt(pairs: Seq[(Point, Point)], cx: Double, cy: Double): DoubleMatrix = { 29 | val m = new DoubleMatrix(pairs.length * 2, 9) 30 | 31 | pairs 32 | .flatMap({ case (a: Point, b: Point) => pairToRows(a, b, cx, cy, 1e-5, 1e-5) }) 33 | .zipWithIndex 34 | .foreach({ case (c: DoubleMatrix, i: Int) => m.putRow(i, c) }) 35 | 36 | val svd = Singular.fullSVD(m) 37 | val h = svd(2).getColumn(8).reshape(3,3).transpose() 38 | val h33 = h.get(2,2) 39 | 40 | h.div(h33) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/QuadTreePartitioner.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | 5 | import vectorpipe.osm._ 6 | 7 | import org.apache.spark.{Partitioner, HashPartitioner } 8 | import org.apache.spark.rdd.RDD 9 | 10 | 11 | class QuadTreePartitioner(divisionSet: Set[Int], partitions: Int) extends Partitioner { 12 | 13 | val maxDivisions = divisionSet.reduce(math.max) 14 | 15 | require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.") 16 | 17 | val hashPartitioner = new HashPartitioner(partitions) 18 | 19 | private def step( 20 | bits: Int, 21 | _xmin: Double, _ymin: Double, 22 | _xmax: Double, _ymax:Double 23 | ): (Double, Double, Double, Double) = { 24 | var xmin = _xmin 25 | var ymin = _ymin 26 | var xmax = _xmax 27 | var ymax = _ymax 28 | 29 | bits match { 30 | case 0 => 31 | xmin = 2*xmin 32 | ymin = 2*ymin 33 | xmax = 2*xmax 34 | ymax = 2*ymax 35 | case 1 => 36 | xmin = 2*(xmin - 0.5) 37 | ymin = 2*ymin 38 | xmax = 2*(xmax - 0.5) 39 | ymax = 2*ymax 40 | case 2 => 41 | xmin = 2*xmin 42 | ymin = 2*(ymin - 0.5) 43 | xmax = 2*xmax 44 | ymax = 2*(ymax - 0.5) 45 | case 3 => 46 | xmin = 2*(xmin - 0.5) 47 | ymin = 2*(ymin - 0.5) 48 | xmax = 2*(xmax - 0.5) 49 | ymax = 2*(ymax - 0.5) 50 | } 51 | 52 | (xmin, ymin, xmax, ymax) 53 | } 54 | 55 | private def getBits(xmin: Double, ymin: Double, xmax: Double, ymax:Double): Option[Int] = { 56 | val minBits = ((xmin > 0.5),(ymin > 0.5)) match { 57 | case (false, false) => 0 58 | case (true, false) => 1 59 | case (false, true) => 2 60 | case (true, true) => 3 61 | } 62 | val maxBits = ((xmax > 0.5),(ymax > 0.5)) match { 63 | case (false, false) => 0 64 | case (true, false) => 1 65 | case (false, true) => 2 66 | case (true, true) => 3 67 | } 68 | 69 | if (minBits == maxBits) Some(minBits); else None 70 | } 71 | 72 | private def getBox(g: Geometry): (Double, Double, Double, Double) = { 73 | val e: Extent = g.envelope 74 | ((e.xmin+180)/360, (e.ymin+90)/180, (e.xmax+180)/360, (e.ymax+90)/180) 75 | } 76 | 77 | def getAddress(g: Geometry): Long = { 78 | var box: (Double, Double, Double, Double) = getBox(g) 79 | var address: Long = 0 80 | var bits: Option[Int] = getBits(box._1, box._2, box._3, box._4) 81 | var division = 0 82 | 83 | while (bits != None && division <= maxDivisions) { 84 | if (divisionSet.contains(division)) 85 | address = (address << 2) | bits.get 86 | box = step(bits.get, box._1, box._2, box._3, box._4) 87 | bits = getBits(box._1, box._2, box._3, box._4) 88 | division = division + 1 89 | } 90 | 91 | address 92 | } 93 | 94 | def numPartitions: Int 
= partitions 95 | 96 | def getPartition(key: Any): Int = { 97 | key match { 98 | case f: Feature[Geometry, Any] => 99 | (getAddress(f.geom) % partitions).toInt 100 | case g: Geometry => 101 | (getAddress(g) % partitions).toInt 102 | case _ => 103 | throw new Exception 104 | } 105 | } 106 | 107 | override def equals(other: Any): Boolean = false 108 | 109 | override def hashCode: Int = numPartitions 110 | } 111 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/VertexMatching.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | import geotrellis.vector.io._ 5 | 6 | import com.vividsolutions.jts.algorithm.{Centroid, CGAlgorithms} 7 | import com.vividsolutions.jts.geom.Coordinate 8 | 9 | 10 | object VertexMatching { 11 | 12 | private def matcher( 13 | points1: Array[Point], points2: Array[Point], 14 | offsetx: Double, offsety: Double, 15 | list: List[(Point, Point)] = List.empty[(Point, Point)] 16 | ): List[(Point, Point)] = { 17 | if (points1.isEmpty || points2.isEmpty) list 18 | else { 19 | val (_, i) = argmin(points1.head, points2, offsetx, offsety) 20 | matcher( 21 | points1.drop(1), points2.drop(i+1), 22 | offsetx, offsety, 23 | list ++ List((points1.head, points2(i))) 24 | ) 25 | } 26 | } 27 | 28 | private def argmin( 29 | p: Point, ps: Array[Point], 30 | offsetx: Double, offsety: Double 31 | ): (Double, Int) = { 32 | ps 33 | .map({ p2 => 34 | val temp = Point(p2.x - offsetx, p2.y - offsety) 35 | temp.distance(p) 36 | }) 37 | .zipWithIndex 38 | .reduce({ (pair1, pair2) => 39 | if (pair1._1 <= pair2._1) pair1 40 | else pair2 41 | }) 42 | } 43 | 44 | private def polygonToPolygon(_p1: Polygon, _p2: Polygon, relative: Boolean) = { 45 | val (p1, p2) = 46 | if (_p1.vertices.length < _p2.vertices.length) (_p1, _p2) 47 | else (_p2, _p1) 48 | 49 | val (centroidx, centroidy) = { 50 | val centroid = Centroid.getCentroid(p1.jtsGeom) 51 | (centroid.x, centroid.y) 52 | } 53 | 54 | val (offsetx: Double, offsety: Double) = 55 | if (relative) { 56 | val centroid = Centroid.getCentroid(p2.jtsGeom) 57 | (centroid.x - centroidx, centroid.y - centroidy) 58 | } 59 | else (0.0, 0.0) 60 | 61 | val points1 = { 62 | val pts = p1.jtsGeom.getCoordinates 63 | if (CGAlgorithms.isCCW(pts)) pts 64 | else pts.reverse 65 | }.drop(1).map({ p => Point(p.x, p.y) }) 66 | 67 | val points2 = { 68 | val points = { 69 | val pts = p2.jtsGeom.getCoordinates 70 | if (CGAlgorithms.isCCW(pts)) pts 71 | else pts.reverse 72 | }.drop(1).map({ p => Point(p.x, p.y) }) 73 | val (_, i) = argmin(points1.head, points, offsetx, offsety) 74 | points.drop(i) ++ points.take(i) 75 | } 76 | 77 | val pairs = matcher(points1, points2, offsetx, offsety) 78 | 79 | Homography.dlt( 80 | if (pairs.length >= 4) pairs; else points1.zip(points2).take(4).toList, 81 | centroidx, centroidy 82 | ) 83 | } 84 | 85 | def score(p1: Polygon, p2: Polygon): Double = { 86 | val h1 = polygonToPolygon(p1, p2, false).toArray 87 | val Δ1 = math.abs(h1(0)-1.0) + math.abs(h1(1)) + math.abs(h1(2)) + math.abs(h1(3)) + math.abs(h1(4)-1.0) + math.abs(h1(5)) 88 | 89 | val h2 = polygonToPolygon(p1, p2, true).toArray 90 | val Δ2 = math.abs(h2(0)-1.0) + math.abs(h2(1)) + math.abs(h2(2)) + math.abs(h2(3)) + math.abs(h2(4)-1.0) + math.abs(h2(5)) 91 | 92 | math.min(Δ1, Δ2) 93 | } 94 | 95 | def main(args: Array[String]): Unit = { 96 | val polygon1 = 97 | if (args(0).endsWith(".geojson")) 98 | 
scala.io.Source.fromFile(args(0)).mkString.parseGeoJson[Polygon] 99 | else 100 | args(0).parseGeoJson[Polygon] 101 | 102 | val polygon2 = 103 | if (args(1).endsWith(".geojson")) 104 | scala.io.Source.fromFile(args(1)).mkString.parseGeoJson[Polygon] 105 | else 106 | args(1).parseGeoJson[Polygon] 107 | 108 | println(polygon1.distance(polygon2)) 109 | println(Centroid.getCentroid(polygon1.jtsGeom).distance(Centroid.getCentroid(polygon2.jtsGeom))) 110 | println(polygonToPolygon(polygon1, polygon2, false)) 111 | println(polygonToPolygon(polygon1, polygon2, true)) 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/VertexProjection.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | import geotrellis.vector.io._ 5 | import geotrellis.vector.io.json.JsonFeatureCollection 6 | 7 | import com.vividsolutions.jts.algorithm.Centroid 8 | 9 | 10 | object VertexProjection { 11 | 12 | private def pointToPolygon(p: Point, offsetx: Double, offsety: Double, right: Polygon): Point = { 13 | 14 | val point = 15 | right.vertices 16 | .map({ p => Point(p.x - offsetx, p.y - offsety) }) 17 | .sliding(2) 18 | .map({ case Array(a, b) => 19 | val px: Double = p.x - a.x 20 | val py: Double = p.y - a.y 21 | val vx: Double = b.x - a.x 22 | val vy: Double = b.y - a.y 23 | val absv: Double = math.sqrt(vx*vx + vy*vy) 24 | val t = px*vx/absv + py*vy/absv 25 | 26 | val c = 27 | if (t <= 0.0) a 28 | else if (t >= 1.0) b 29 | else Point(a.x*t + b.x*(1.0-t), a.y*t + b.y*(1.0-t)) 30 | 31 | (p.distance(c), c) 32 | }) 33 | .reduce({ (t1: (Double, Point), t2: (Double, Point)) => 34 | if (t1._1 <= t2._1) t1 35 | else t2 36 | })._2 37 | 38 | Point(point.x + offsetx, point.y + offsety) 39 | } 40 | 41 | private def polygonToPolygon(left: Polygon, right: Polygon, relative: Boolean) = { 42 | val (centroidx, centroidy) = { 43 | val centroid = Centroid.getCentroid(left.jtsGeom) 44 | (centroid.x, centroid.y) 45 | } 46 | 47 | val (offsetx: Double, offsety: Double) = { 48 | if (relative) { 49 | val centroid = Centroid.getCentroid(right.jtsGeom) 50 | (centroid.x - centroidx, centroid.y - centroidy) 51 | } 52 | else (0.0, 0.0) 53 | } 54 | 55 | val xs1 = left.vertices 56 | val xs2 = xs1.map({ point => pointToPolygon(point, offsetx, offsety, right) }) 57 | 58 | Homography.dlt(xs1.zip(xs2), centroidx, centroidy) 59 | } 60 | 61 | private def geometryToGeometry(left: Geometry, right: Geometry, relative: Boolean) = { 62 | val polygon1 = left match { 63 | case p: Polygon => p 64 | case mp: MultiPolygon => 65 | mp.polygons.reduce({ (p1, p2) => if (p1.vertices.length > p2.vertices.length) p1; else p2 }) 66 | } 67 | val polygon2 = right match { 68 | case p: Polygon => p 69 | case mp: MultiPolygon => 70 | mp.polygons.reduce({ (p1, p2) => if (p1.vertices.length > p2.vertices.length) p1; else p2 }) 71 | } 72 | 73 | polygonToPolygon(polygon1, polygon2, relative) 74 | } 75 | 76 | def score(p1: Polygon, p2: Polygon): Double = { 77 | val h1 = polygonToPolygon(p1, p2, false).toArray 78 | val Δ1 = math.abs(h1(0)-1.0) + math.abs(h1(1)) + math.abs(h1(2)) + math.abs(h1(3)) + math.abs(h1(4)-1.0) + math.abs(h1(5)) 79 | 80 | val h2 = polygonToPolygon(p2, p1, false).toArray 81 | val Δ2 = math.abs(h2(0)-1.0) + math.abs(h2(1)) + math.abs(h2(2)) + math.abs(h2(3)) + math.abs(h2(4)-1.0) + math.abs(h2(5)) 82 | 83 | val h3 = polygonToPolygon(p1, p2, true).toArray 84 | val Δ3 = 
math.abs(h3(0)-1.0) + math.abs(h3(1)) + math.abs(h3(2)) + math.abs(h3(3)) + math.abs(h3(4)-1.0) + math.abs(h3(5)) 85 | 86 | val h4 = polygonToPolygon(p2, p1, true).toArray 87 | val Δ4 = math.abs(h4(0)-1.0) + math.abs(h4(1)) + math.abs(h4(2)) + math.abs(h4(3)) + math.abs(h4(4)-1.0) + math.abs(h4(5)) 88 | 89 | math.min(Δ1, math.min(Δ2, math.min(Δ3, Δ4))) 90 | } 91 | 92 | def main(args: Array[String]): Unit = { 93 | val polygon1 = 94 | if (args(0).endsWith(".geojson")) 95 | scala.io.Source.fromFile(args(0)).mkString.parseGeoJson[Geometry] 96 | else 97 | args(0).parseGeoJson[Geometry] 98 | 99 | val polygon2 = 100 | if (args(1).endsWith(".geojson")) 101 | scala.io.Source.fromFile(args(1)).mkString.parseGeoJson[Geometry] 102 | else 103 | args(1).parseGeoJson[Geometry] 104 | 105 | println(geometryToGeometry(polygon1, polygon2, false)) 106 | println(geometryToGeometry(polygon2, polygon1, false)) 107 | println(geometryToGeometry(polygon1, polygon2, true)) 108 | println(geometryToGeometry(polygon2, polygon1, true)) 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/VolumeMatching.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | import geotrellis.vector.io._ 5 | 6 | 7 | object VolumeMatching { 8 | 9 | def data(p1: Polygon, p2: Polygon): (Double, Double) = { 10 | val a1 = p1.jtsGeom.getArea 11 | val a2 = p2.jtsGeom.getArea 12 | val a3 = p1.jtsGeom.intersection(p2.jtsGeom).getArea 13 | (a3/a1, a3/a2) 14 | } 15 | 16 | def min(p1: Polygon, p2: Polygon): Double = { 17 | val (a1, a2) = data(p1, p2) 18 | math.min(a1, a2) 19 | } 20 | 21 | def max(p1: Polygon, p2: Polygon): Double = { 22 | val (a1, a2) = data(p1, p2) 23 | math.max(a1, a2) 24 | } 25 | 26 | def main(args: Array[String]): Unit = { 27 | val polygon1 = 28 | if (args(0).endsWith(".geojson")) 29 | scala.io.Source.fromFile(args(0)).mkString.parseGeoJson[Polygon] 30 | else 31 | args(0).parseGeoJson[Polygon] 32 | 33 | val polygon2 = 34 | if (args(1).endsWith(".geojson")) 35 | scala.io.Source.fromFile(args(1)).mkString.parseGeoJson[Polygon] 36 | else 37 | args(1).parseGeoJson[Polygon] 38 | 39 | println(polygon1 == polygon2) 40 | println(data(polygon1, polygon2)) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/bm/view/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | BM 6 | 7 | 8 | 9 | 13 | 14 | 15 | 16 |
17 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /src/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | lazy val commonSettings = Seq( 4 | organization := "com.azavea", 5 | version := Version.osmesa, 6 | cancelable in Global := true, 7 | scalaVersion in ThisBuild := Version.scala, 8 | scalacOptions := Seq( 9 | "-deprecation", 10 | "-unchecked", 11 | "-feature", 12 | "-language:implicitConversions", 13 | "-language:reflectiveCalls", 14 | "-language:higherKinds", 15 | "-language:postfixOps", 16 | "-language:existentials", 17 | "-language:experimental.macros", 18 | "-feature", 19 | "-Ypartial-unification", 20 | "-Ypatmat-exhaust-depth", "100" 21 | ), 22 | 23 | // resolvers ++= Seq( 24 | // "locationtech-releases" at "https://repo.locationtech.org/content/repositories/releases/", 25 | // "locationtech-snapshots" at "https://repo.locationtech.org/content/repositories/snapshots/", 26 | // "geosolutions" at "http://maven.geo-solutions.it/", 27 | // "osgeo-releases" at "https://repo.osgeo.org/repository/release/", 28 | // "apache.commons.io" at "https://mvnrepository.com/artifact/commons-io/commons-io" 29 | // ), 30 | externalResolvers := Settings.Repositories.all, 31 | 32 | updateOptions := updateOptions.value.withGigahorse(false), 33 | shellPrompt := { s => Project.extract(s).currentProject.id + " > " }, 34 | assemblyMergeStrategy in assembly := { 35 | case "reference.conf" | "application.conf" => MergeStrategy.concat 36 | case PathList("META-INF", xs@_*) => 37 | xs match { 38 | case ("MANIFEST.MF" :: Nil) => MergeStrategy.discard 39 | // Concatenate everything in the services directory to keep GeoTools happy. 40 | case ("services" :: _ :: Nil) => 41 | MergeStrategy.concat 42 | // Concatenate these to keep JAI happy. 43 | case ("javax.media.jai.registryFile.jai" :: Nil) | ("registryFile.jai" :: Nil) | ("registryFile.jaiext" :: Nil) => 44 | MergeStrategy.concat 45 | case (name :: Nil) => { 46 | // Must exclude META-INF/*.([RD]SA|SF) to avoid "Invalid signature file digest for Manifest main attributes" exception. 
47 | if (name.endsWith(".RSA") || name.endsWith(".DSA") || name.endsWith(".SF")) 48 | MergeStrategy.discard 49 | else 50 | MergeStrategy.first 51 | } 52 | case _ => MergeStrategy.first 53 | } 54 | case _ => MergeStrategy.first 55 | } 56 | ) 57 | 58 | /* Allow `run` to be used with Spark code, while assembling fat JARs w/o Spark bundled */ 59 | // run in Compile := Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)).evaluated 60 | // runMain in Compile := Defaults.runMainTask(fullClasspath in Compile, runner in(Compile, run)).evaluated 61 | 62 | lazy val root = Project("osmesa", file(".")) 63 | .aggregate( 64 | analytics, 65 | apps, 66 | bm 67 | ).settings(commonSettings: _*) 68 | 69 | lazy val analytics = 70 | project 71 | .settings(commonSettings: _*) 72 | 73 | lazy val apps = 74 | project 75 | .dependsOn(analytics) 76 | .settings(commonSettings: _*) 77 | 78 | lazy val bm = 79 | project 80 | .settings(commonSettings: _*) 81 | 82 | /* Run with 83 | jmh:run -t 1 -f 1 -wi 5 -i 5 .*Bench.* 84 | */ 85 | // lazy val bench = 86 | // project.in(file("bench")) 87 | // .settings(commonSettings) 88 | // .dependsOn(analytics) 89 | // .enablePlugins(JmhPlugin) 90 | -------------------------------------------------------------------------------- /src/docker/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.out 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | # log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c: %m%n 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 7 | log4j.logger.osmesa=DEBUG 8 | log4j.logger.vectorpipe=DEBUG -------------------------------------------------------------------------------- /src/docker/refresh-views.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "$(date -Iseconds): Starting view refreshment in $DATABASE_NAME" 4 | 5 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently user_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 6 | echo "$(date -Iseconds): Refreshing user statistics" 7 | # refresh in the background to return immediately 8 | psql -Aqt \ 9 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics" \ 10 | -c "UPDATE refreshments SET updated_at=now() where mat_view='user_statistics'" \ 11 | $DATABASE_URL & 12 | else 13 | echo "$(date -Iseconds): User stats table already refreshing" 14 | fi 15 | 16 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently hashtag_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 17 | echo "$(date -Iseconds): Refreshing hashtag statistics" 18 | # refresh in the background to return immediately 19 | psql -Aqt \ 20 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY hashtag_statistics" \ 21 | -c "UPDATE refreshments SET updated_at=now() where mat_view='hashtag_statistics'" \ 22 | $DATABASE_URL & 23 | else 24 | echo "$(date -Iseconds): Hashtag stats table already refreshing" 25 | fi 26 | 27 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently 
country_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 28 | # refresh in the background to return immediately 29 | echo "$(date -Iseconds): Refreshing country statistics" 30 | psql -Aqt \ 31 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY country_statistics" \ 32 | -c "UPDATE refreshments SET updated_at=now() where mat_view='country_statistics'" \ 33 | $DATABASE_URL & 34 | else 35 | echo "$(date -Iseconds): Country stats table already refreshing" 36 | fi 37 | 38 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently hashtag_user_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 39 | # refresh in the background to return immediately 40 | echo "$(date -Iseconds): Refreshing hashtag/user statistics" 41 | psql -Aqt \ 42 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY hashtag_user_statistics" \ 43 | -c "UPDATE refreshments SET updated_at=now() where mat_view='hashtag_user_statistics'" \ 44 | $DATABASE_URL & 45 | else 46 | echo "$(date -Iseconds): Hashtag/user stats table already refreshing" 47 | fi 48 | 49 | wait 50 | echo "$(date -Iseconds): Completed" 51 | -------------------------------------------------------------------------------- /src/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | val decline = "com.monovore" %% "decline" % Version.decline 5 | val sparkHive = "org.apache.spark" %% "spark-hive" % Version.spark 6 | val sparkStreaming = "org.apache.spark" %% "spark-streaming" % Version.spark 7 | val sparkJts = "org.locationtech.geomesa" %% "geomesa-spark-jts" % Version.geomesa 8 | val gtGeomesa = "org.locationtech.geotrellis" %% "geotrellis-geomesa" % Version.geotrellis 9 | val gtGeotools = "org.locationtech.geotrellis" %% "geotrellis-geotools" % Version.geotrellis 10 | val gtS3 = "org.locationtech.geotrellis" %% "geotrellis-s3" % Version.geotrellis 11 | val gtSpark = "org.locationtech.geotrellis" %% "geotrellis-spark" % Version.geotrellis 12 | val gtSparkTestKit = "org.locationtech.geotrellis" %% "geotrellis-spark-testkit" % Version.geotrellis % "test" 13 | val gtVector = "org.locationtech.geotrellis" %% "geotrellis-vector" % Version.geotrellis 14 | val gtShapefile = "org.locationtech.geotrellis" %% "geotrellis-shapefile" % Version.geotrellis 15 | val gtVectorTile = "org.locationtech.geotrellis" %% "geotrellis-vectortile" % Version.geotrellis 16 | val vectorpipe = "com.azavea.geotrellis" %% "vectorpipe" % Version.vectorpipe 17 | val cats = "org.typelevel" %% "cats-core" % Version.cats 18 | val scalactic = "org.scalactic" %% "scalactic" % Version.scalactic 19 | val scalatest = "org.scalatest" %% "scalatest" % Version.scalatest % "test" 20 | //val jaiCore = "javax.media" % "jai_core" % Version.jai % "test" from s"http://download.osgeo.org/webdav/geotools/javax/media/jai_core/${Version.jai}/jai_core-${Version.jai}.jar" 21 | val apacheCommonsEmail = "org.apache.commons" % "commons-email" % Version.apacheCommonsEmail 22 | val hbaseCommon = "org.apache.hbase" % "hbase-common" % Version.hbase 23 | val hbaseClient = "org.apache.hbase" % "hbase-client" % Version.hbase 24 | val hbaseServer = "org.apache.hbase" % "hbase-server" % Version.hbase 25 | val geomesaHbaseDatastore = "org.locationtech.geomesa" % "geomesa-hbase-datastore_2.11" % Version.geomesa 26 | val kryo = "com.esotericsoftware" % "kryo-shaded" % 
Version.kryo 27 | val snakeyaml = "org.yaml" % "snakeyaml" % Version.snakeyaml 28 | val circeCore = "io.circe" %% "circe-core" % Version.circe 29 | val circeGeneric = "io.circe" %% "circe-generic" % Version.circe 30 | val circeExtras = "io.circe" %% "circe-generic-extras" % Version.circe 31 | val circeParser = "io.circe" %% "circe-parser" % Version.circe 32 | val circeOptics = "io.circe" %% "circe-optics" % Version.circe 33 | val circeJava8 = "io.circe" %% "circe-java8" % Version.circe 34 | val circeYaml = "io.circe" %% "circe-yaml" % Version.circeYaml 35 | val logging = "com.typesafe.scala-logging" %% "scala-logging" % Version.scalaLogging 36 | val log4j2 = "org.apache.logging.log4j" % "log4j-1.2-api" % "2.17.1" 37 | val commonsIO = "commons-io" % "commons-io" % Version.commonsIO 38 | val postgresql = "org.postgresql" % "postgresql" % Version.postgresql 39 | } 40 | -------------------------------------------------------------------------------- /src/project/Settings.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Settings { 4 | object Repositories { 5 | val apacheCommons = "apache.commons.io" at "https://mvnrepository.com/artifact/commons-io/commons-io" 6 | val eclipseReleases = "eclipse-releases" at "https://repo.eclipse.org/content/groups/releases" 7 | val osgeoReleases = "osgeo-releases" at "https://repo.osgeo.org/repository/release/" 8 | val geosolutions = "geosolutions" at "https://maven.geo-solutions.it/" 9 | val ltReleases = "locationtech-releases" at "https://repo.locationtech.org/content/repositories/releases/" 10 | val ltSnapshots = "locationtech-snapshots" at "https://repo.locationtech.org/content/repositories/snapshots/" 11 | val ivy2Local = Resolver.file("local", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns) 12 | val mavenLocal = Resolver.mavenLocal 13 | val maven = DefaultMavenRepository 14 | val local = Seq(ivy2Local, mavenLocal) 15 | val external = Seq(osgeoReleases, maven, apacheCommons, eclipseReleases, geosolutions, ltReleases, ltSnapshots) 16 | val all = external ++ local 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/project/Version.scala: -------------------------------------------------------------------------------- 1 | object Version { 2 | val scala = "2.11.12" 3 | val osmesa = "0.2.0" 4 | val geotrellis = "3.5.1" 5 | val geomesa = "2.3.2" 6 | val vectorpipe = "2.2.0" 7 | val decline = "1.0.0" 8 | val cats = "1.6.1" 9 | val scalactic = "3.0.3" 10 | val scalatest = "3.0.3" 11 | val spark = "2.4.4" 12 | val kryo = "4.0.0" 13 | val snakeyaml = "1.25" 14 | val circe = "0.11.1" 15 | val circeYaml = "0.10.1" // not in sync with circe core 16 | val scalaLogging = "3.5.0" 17 | val commonsIO = "2.5" 18 | val osmosis = "0.46" 19 | val apacheCommonsEmail = "1.5" 20 | val hbase = "2.2.0" 21 | val jai = "1.1.3" 22 | val postgresql = "42.2.9" 23 | } 24 | -------------------------------------------------------------------------------- /src/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 2 | -------------------------------------------------------------------------------- /src/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.3.0 2 | -------------------------------------------------------------------------------- 
/src/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 2 | 3 | addCompilerPlugin("org.scalamacros" % "paradise" % "2.1.0" cross CrossVersion.full) 4 | 5 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.2.27") 6 | --------------------------------------------------------------------------------
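A minimal usage sketch for the osmesa.apps.streaming.UserFootprintUpdater entry point shown earlier in this dump, assuming the apps assembly has been built with `sbt "project apps" assembly`. The class name, option names, and the change-source default are taken from the decline options in UserFootprintUpdater.scala; the jar path, S3 prefix, sequence numbers, and tuning values below are placeholder assumptions rather than values from the repository.

    # All values except the class name and option names are placeholders (assumed for illustration)
    spark-submit \
      --class osmesa.apps.streaming.UserFootprintUpdater \
      apps/target/scala-2.11/osmesa-apps.jar \
      --change-source https://planet.osm.org/replication/minute/ \
      --start-sequence 4000000 \
      --end-sequence 4000010 \
      --partition-count 32 \
      --tile-source s3://example-bucket/footprints/users/ \
      --concurrent-uploads 8

Per the option defaults in the source, omitting --start-sequence and --end-sequence falls back to the current remote sequence, and omitting --tile-source writes vector tiles relative to the working directory.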