├── .gitignore ├── LICENSE ├── README.md ├── buildspec.yml ├── deployment ├── batch │ ├── makefiles │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── config-aws.mk.template │ │ ├── config-run.mk.template │ │ └── scripts │ │ │ └── configurations.json │ └── terraform │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── README.md │ │ ├── cluster │ │ ├── .gitignore │ │ ├── aws.tf │ │ ├── cluster-configurations.json │ │ ├── emr.tf │ │ ├── outputs.tf │ │ ├── security-group.tf │ │ └── variables.tf │ │ └── tfvars.tpl ├── docker │ ├── Dockerfile.osm_apps │ ├── Dockerfile.osm_refresh │ ├── build-containers.sh │ ├── log4j.properties │ ├── refresh-views.sh │ └── sources.list ├── monitor-checkpoints.sh ├── sql │ ├── 01-countries.sql │ ├── 02-checkpoints.sql │ ├── 03-users.sql │ ├── 04-hashtags.sql │ ├── 05-errors.sql │ ├── 06-changesets.sql │ ├── 07-changesets_countries.sql │ ├── 08-changesets_hashtags.sql │ ├── README.md │ └── materialized_views │ │ ├── country_statistics.sql │ │ ├── hashtag_statistics.sql │ │ ├── hashtag_user_statistics.sql │ │ ├── refreshments.sql │ │ └── user_statistics.sql └── streaming │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── config-deployment.mk.template │ ├── ecs-params.yml │ └── scripts │ ├── batch-generate-edit-histograms.sh │ ├── batch-generate-footprints.sh │ ├── batch-process.sh │ ├── create-log-groups.sh │ ├── define-production-streaming-update-tasks.sh │ ├── define-production-view-refresher.sh │ ├── define-staging-streaming-update-tasks.sh │ ├── define-staging-view-refresher.sh │ ├── define-streaming-augdiff-producer.sh │ ├── define-streaming-vectortile-tasks.sh │ ├── deploy-stats-refresher.sh │ ├── emr-configurations │ └── batch-process.json │ ├── expand.sh │ ├── get-tag.sh │ ├── latest-history-to-orc.sh │ └── stop-streaming-service.sh ├── docker-compose.yml ├── notebooks ├── Footprint_test_messy_vectortiles.json ├── OSM_Ingest.json └── zeppelin │ ├── Counting road length.json │ ├── Debugging long running ingest step.json │ ├── Working with ORC 1.json │ ├── Working with ORC.json │ └── hashtags.json ├── project └── build.properties ├── scripts ├── cibuild └── cipublish └── src ├── .gitignore ├── .sbtopts ├── .scalafmt.conf ├── Dockerfile.apps ├── Dockerfile.refresh ├── analytics ├── .envrc ├── .gitignore ├── bin │ ├── apply.sh │ └── update-tiles ├── build.sbt ├── project │ └── build.properties ├── resources │ └── log4j.properties └── src │ ├── main │ ├── resources │ │ ├── countries.geojson │ │ └── log4j.properties │ └── scala │ │ └── osmesa │ │ └── analytics │ │ ├── Analytics.scala │ │ ├── Countries.scala │ │ ├── EditHistogram.scala │ │ ├── Footprints.scala │ │ ├── Implicits.scala │ │ ├── Resource.scala │ │ ├── S3Utils.scala │ │ ├── VectorGrid.scala │ │ ├── raster │ │ ├── MutableSparseIntTile.scala │ │ ├── SparseIntTile.scala │ │ └── package.scala │ │ ├── stats │ │ ├── ChangesetMetadataForeachWriter.scala │ │ ├── ChangesetStatsForeachWriter.scala │ │ ├── functions │ │ │ └── package.scala │ │ └── package.scala │ │ ├── updater │ │ ├── Implicits.scala │ │ ├── Schema.scala │ │ ├── TileUpdater.scala │ │ ├── package.scala │ │ └── schemas │ │ │ ├── History.scala │ │ │ ├── Snapshot.scala │ │ │ ├── Urchn.scala │ │ │ └── package.scala │ │ └── vectorgrid │ │ └── package.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── osmesa │ └── analytics │ └── CountriesTest.scala ├── apps ├── build.sbt └── src │ └── main │ └── scala │ └── osmesa │ └── apps │ ├── DbUtils.scala │ ├── batch │ ├── ChangesetMetadataCreator.scala │ ├── ChangesetStatsCreator.scala │ ├── 
EditHistogramTileCreator.scala │ ├── FacetedEditHistogramTileCreator.scala │ ├── FootprintCreator.scala │ └── MergeChangesets.scala │ └── streaming │ ├── ChangeStreamProcessor.scala │ ├── ChangesetMetadataUpdater.scala │ ├── ChangesetStatsUpdater.scala │ ├── EditHistogramTileUpdater.scala │ ├── FacetedEditHistogramTileUpdater.scala │ ├── HashtagFootprintUpdater.scala │ ├── MergedChangesetStreamProcessor.scala │ ├── StreamingChangesetMetadataUpdater.scala │ ├── StreamingChangesetStatsUpdater.scala │ ├── StreamingEditHistogramTileUpdater.scala │ ├── StreamingFacetedEditHistogramTileUpdater.scala │ ├── StreamingUserFootprintTileUpdater.scala │ └── UserFootprintUpdater.scala ├── bench └── src │ └── main │ └── scala │ └── osmesa │ ├── Bench.scala │ ├── MetresBench.scala │ └── SAXBench.scala ├── bm ├── build.sbt ├── src │ └── main │ │ └── scala │ │ └── osmesa │ │ └── bm │ │ ├── BuildingMatching.scala │ │ ├── Downsample.scala │ │ ├── GenerateVT.scala │ │ ├── Homography.scala │ │ ├── QuadTreePartitioner.scala │ │ ├── VertexMatching.scala │ │ ├── VertexProjection.scala │ │ └── VolumeMatching.scala └── view │ └── index.html ├── build.sbt ├── docker ├── log4j.properties └── refresh-views.sh ├── project ├── Dependencies.scala ├── Settings.scala ├── Version.scala ├── assembly.sbt ├── build.properties └── plugins.sbt └── sbt /.gitignore: -------------------------------------------------------------------------------- 1 | # GeoMesa Dist (to be copied in) # 2 | services/hbase/geomesa-hbase-dist.tar.gz 3 | services/geoserver/geomesa-hbase-dist.tar.gz 4 | 5 | # Project generated files # 6 | 7 | metastore_db 8 | third_party_sources 9 | derby.log 10 | 11 | # Test Data # 12 | src/test-data 13 | 14 | # AWS # 15 | 16 | *.pem 17 | 18 | # Operating System Files # 19 | 20 | *.DS_Store 21 | Thumbs.db 22 | 23 | # Build Files # 24 | 25 | bin 26 | target 27 | build/ 28 | .gradle 29 | 30 | # Eclipse Project Files # 31 | 32 | .classpath 33 | .project 34 | .settings 35 | 36 | # Vagrant 37 | 38 | .vagrant 39 | 40 | # Terraform 41 | deployment/terraform/.terraform 42 | deployment/terraform/terraform.tfvars 43 | .terraform.tfstate.lock.info 44 | *.tfvars 45 | *.tfstate 46 | *.tfplan 47 | *.tfstate.backup 48 | .terraform 49 | 50 | # Makefile configs 51 | *.mk 52 | 53 | # Ansible 54 | deployment/ansible/roles/azavea.* 55 | 56 | # Node and Webpack 57 | node_modules/ 58 | npm-debug.log 59 | pgw.communitymapping*.js 60 | pgw.communitymapping*.js.map 61 | vendor.bundle.js 62 | dist/ 63 | 64 | 65 | # IntelliJ IDEA Files # 66 | 67 | *.iml 68 | *.ipr 69 | *.iws 70 | *.idea 71 | 72 | # macOS 73 | .DS_Store 74 | 75 | # Emacs # 76 | 77 | .ensime 78 | \#*# 79 | *~ 80 | .#* 81 | 82 | *.orc 83 | *.jar 84 | 85 | # Temporary credentials 86 | emr/terraform/auth.json 87 | 88 | -------------------------------------------------------------------------------- /buildspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | install: 5 | runtime-versions: 6 | docker: 18 7 | java: openjdk11 8 | commands: 9 | - docker -v 10 | - java -version 11 | pre_build: 12 | commands: 13 | - echo "$DOCKER_HUB_ACCESS_TOKEN" | docker login --username "$DOCKER_HUB_USERNAME" --password-stdin 14 | build: 15 | commands: 16 | - mkdir -p /root/.sbt/launchers/1.3.0/ 17 | - curl -L -o /root/.sbt/launchers/1.3.0/sbt-launch.jar https://repo.scala-sbt.org/scalasbt/maven-releases/org/scala-sbt/sbt-launch/1.3.0/sbt-launch.jar 18 | - ls -lh /root/.sbt/launchers/1.3.0 19 | - ./scripts/cibuild 20 | - 
./scripts/cipublish 21 | artifacts: 22 | files: 23 | - osmesa-dist/**/* 24 | -------------------------------------------------------------------------------- /deployment/batch/makefiles/.gitignore: -------------------------------------------------------------------------------- 1 | cluster-id.txt 2 | last-step-id.txt 3 | -------------------------------------------------------------------------------- /deployment/batch/makefiles/config-aws.mk.template: -------------------------------------------------------------------------------- 1 | export AWS_DEFAULT_REGION:=us-east-1 2 | 3 | export PEM_FILE:=[PEM FILE] 4 | export EC2_KEY:=[EC2 KEY] 5 | export SUBNET_ID:=[EMR SUBNET] 6 | -------------------------------------------------------------------------------- /deployment/batch/makefiles/config-run.mk.template: -------------------------------------------------------------------------------- 1 | export S3_BUCKET:=[TARGET BUCKET] 2 | export S3_URI:=s3://${S3_BUCKET} 3 | export S3_CATALOG := ${S3_URI}/[TARGET_CATALOG] 4 | 5 | export PLANET_ORC := s3://osm-pds/planet-history/history-latest.orc 6 | export OSM_HISTORY := [OSH_ORC_FILENAME_URI] 7 | export OUTPUT_LOCATION := [TARGET_BUCKET] 8 | 9 | export ORC_CACHE_LOCATION := ${S3_URI}/cache 10 | export VECTORTILE_CATALOG_LOCATION = ${S3_URI}/vectortiles 11 | 12 | export CHANGESET_CSV := [URI of OSM CSV table dump] 13 | export CHANGESET_COMMENTS_CSV := [URI of OSM CSV table dump] 14 | export CHANGESET_TAGS_CSV := [URI of OSM CSV table dump] 15 | export USER_CSV := [URI of OSM CSV table dump] 16 | export CHANGESET_ORC_DEST := [S3 URI of ORC] 17 | 18 | export PLANET_HISTORY_PBF := [S3 URI of target planet history PBF] 19 | export PLANET_HISTORY_ORC_DIR := [S3 URI of converted planet history ORCs] 20 | -------------------------------------------------------------------------------- /deployment/batch/makefiles/scripts/configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "spark", 4 | "Properties": { 5 | "maximizeResourceAllocation": "false" 6 | } 7 | }, 8 | { 9 | "Classification": "spark-defaults", 10 | "Properties": { 11 | "spark.driver.maxResultSize": "3G", 12 | "spark.dynamicAllocation.enabled": "true", 13 | "spark.shuffle.service.enabled": "true", 14 | "spark.shuffle.compress": "true", 15 | "spark.shuffle.spill.compress": "true", 16 | "spark.rdd.compress": "true", 17 | "spark.executor.memoryOverhead": "1G", 18 | "spark.driver.memoryOverhead": "1G", 19 | "spark.driver.maxResultSize": "3G", 20 | "spark.executor.extraJavaOptions" : "-XX:+UseParallelGC -Dgeotrellis.s3.threads.rdd.write=64" 21 | } 22 | }, 23 | { 24 | "Classification": "hdfs-site", 25 | "Properties": { 26 | "dfs.replication": "1", 27 | "dfs.permissions": "false", 28 | "dfs.datanode.max.xcievers": "16384", 29 | "dfs.datanode.max.transfer.threads": "16384", 30 | "dfs.datanode.balance.max.concurrent.moves": "1000", 31 | "dfs.datanode.balance.bandwidthPerSec": "100000000" 32 | } 33 | }, 34 | { 35 | "Classification": "yarn-site", 36 | "Properties": { 37 | "yarn.resourcemanager.am.max-attempts": "1", 38 | "yarn.nodemanager.vmem-check-enabled": "false", 39 | "yarn.nodemanager.pmem-check-enabled": "false" 40 | } 41 | }, 42 | { 43 | "Classification": "hadoop-env", 44 | "Configurations": [ 45 | { 46 | "Classification": "export", 47 | "Properties": { 48 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 49 | "GDAL_DATA": "/usr/local/share/gdal", 50 | "LD_LIBRARY_PATH": "/usr/local/lib", 51 | "PYSPARK_PYTHON": "python27", 52 | 
"PYSPARK_DRIVER_PYTHON": "python27" 53 | } 54 | } 55 | ] 56 | }, 57 | { 58 | "Classification": "spark-env", 59 | "Configurations": [ 60 | { 61 | "Classification": "export", 62 | "Properties": { 63 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 64 | "GDAL_DATA": "/usr/local/share/gdal", 65 | "LD_LIBRARY_PATH": "/usr/local/lib", 66 | "SPARK_PRINT_LAUNCH_COMMAND": "1", 67 | "PYSPARK_PYTHON": "python27", 68 | "PYSPARK_DRIVER_PYTHON": "python27" 69 | } 70 | } 71 | ] 72 | }, 73 | { 74 | "Classification": "yarn-env", 75 | "Configurations": [ 76 | { 77 | "Classification": "export", 78 | "Properties": { 79 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 80 | "GDAL_DATA": "/usr/local/share/gdal", 81 | "LD_LIBRARY_PATH": "/usr/local/lib", 82 | "PYSPARK_PYTHON": "python27", 83 | "PYSPARK_DRIVER_PYTHON": "python27" 84 | } 85 | } 86 | ] 87 | } 88 | ] 89 | -------------------------------------------------------------------------------- /deployment/batch/terraform/.gitignore: -------------------------------------------------------------------------------- 1 | tfvars 2 | auth.json 3 | -------------------------------------------------------------------------------- /deployment/batch/terraform/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef AWS_PROFILE 3 | $(error AWS_PROFILE is not set) 4 | endif 5 | 6 | CLUSTER_ID ?= $(shell cd cluster && terraform output | grep emr-id | awk '{print $$NF}') 7 | MASTER_IP ?= $(shell cd cluster && terraform output | grep emr-master | awk '{print $$NF}') 8 | KEY_NAME ?= $(shell cd cluster && terraform output | grep key-name | awk '{print $$NF}') 9 | KEY_PATH ?= "~/.ssh/${KEY_NAME}.pem" 10 | 11 | # For circumvention of MFA when necessary 12 | AWS_ENV_VARS ?= AWS_ACCESS_KEY_ID=$(shell cat auth.json | jq -re '.Credentials.AccessKeyId') AWS_SECRET_ACCESS_KEY=$(shell cat auth.json | jq -re '.Credentials.SecretAccessKey') AWS_SESSION_TOKEN=$(shell cat auth.json | jq -re '.Credentials.SessionToken') 13 | 14 | # Get STS token to work around terraform's MFA difficulties 15 | auth.json: 16 | rm -rf auth.json 17 | cd cluster; aws \ 18 | --profile ${AWS_PROFILE} \ 19 | sts assume-role \ 20 | --role-arn="$(shell aws configure get --profile ${AWS_PROFILE} role_arn)" \ 21 | --role-session-name="power-user-session" > ../auth.json 22 | 23 | validate-cluster: auth.json 24 | cd cluster; $(AWS_ENV_VARS) terraform validate \ 25 | --var-file="../tfvars" \ 26 | -var "aws_profile=${AWS_PROFILE}" 27 | 28 | init-cluster: auth.json 29 | cd cluster; $(AWS_ENV_VARS) terraform init \ 30 | -var-file="../tfvars" \ 31 | -var "aws_profile=${AWS_PROFILE}" 32 | 33 | cluster-tfplan: auth.json 34 | cd cluster; $(AWS_ENV_VARS) terraform plan \ 35 | -var-file="../tfvars" \ 36 | -var "aws_profile=${AWS_PROFILE}" \ 37 | -out="cluster-tfplan" 38 | 39 | cluster: cluster-tfplan 40 | cd cluster; $(AWS_ENV_VARS) terraform apply "cluster-tfplan" 41 | 42 | ssh: auth.json 43 | $(AWS_ENV_VARS) aws emr ssh \ 44 | --cluster-id ${CLUSTER_ID} \ 45 | --key-pair-file ${KEY_PATH} 46 | 47 | proxy: 48 | ssh -i ${KEY_PATH} -ND 8157 hadoop@${MASTER_IP} 49 | 50 | destroy-cluster: auth.json 51 | cd cluster; $(AWS_ENV_VARS) terraform destroy \ 52 | -var-file="../tfvars" \ 53 | -var "aws_profile=${AWS_PROFILE}" 54 | 55 | osmesa.jar: 56 | echo TODO 57 | # cd ../src && sbt assembly 58 | # cp ../src/target/scala-2.11/osmesa-assembly-0.1.0.jar osmesa.jar 59 | 60 | upload-jar: osmesa.jar 61 | echo TODO 62 | # aws emr put --cluster-id ${CLUSTER_ID} --key-pair-file ${KEY_PATH} \ 63 | # --src 
osmesa.jar --dest /tmp/osmesa.jar 64 | 65 | 66 | print-vars: 67 | echo aws_profile: ${AWS_PROFILE} 68 | echo cluster_id: ${CLUSTER_ID} 69 | echo key_name: ${KEY_NAME} 70 | echo key_path: ${KEY_PATH} 71 | echo master_ip: ${MASTER_IP} 72 | echo env_vars: ${AWS_ENV_VARS} 73 | 74 | -------------------------------------------------------------------------------- /deployment/batch/terraform/README.md: -------------------------------------------------------------------------------- 1 | # OSMESA EMR 2 | 3 | This directory contains a Makefile to spin up an EMR cluster using [terraform](https://github.com/hashicorp/terraform). 4 | 5 | - [Requirements](#requirements) 6 | - [Makefile](#makefile) 7 | - [Running](#running) 8 | 9 | ## Requirements 10 | 11 | [Terraform 0.11.5](https://github.com/hashicorp/terraform/releases/tag/v0.11.5) 12 | 13 | ## Settings 14 | 15 | [cluster/variables.tf](cluster/variables.tf) contains the full set of variables 16 | which can be specified to modify an EMR deployment. Only those without 17 | provided defaults need to be specified, and these can be found within 18 | [tfvars.tpl](tfvars.tpl) - be sure to make a copy of this template and remove 19 | 'tpl' from the filename. 20 | 21 | 22 | ## Makefile 23 | 24 | | Command | Description | 25 | |-----------------------|------------------------------------------------------------| 26 | |auth.json |Generate temporary session and key/secret | 27 | |validate-cluster |`terraform validate` - Validate terraform | 28 | |init-cluster |`terraform init` - Initialize terraform | 29 | |cluster-tfplan |`terraform plan` - Plan out an 'apply' of this terraform | 30 | |cluster |`terraform apply` - Apply the generated plan to create the EMR cluster | 31 | |ssh |SSH into a running EMR cluster | 32 | |destroy-cluster |Destroy a running EMR cluster | 33 | |print-vars |Print out env vars for diagnostic and debug purposes | 34 | 35 | ## Running 36 | 37 | The Makefile in this directory provides commands to easily set up an EMR 38 | cluster with MFA, but doing so requires a minimal amount of configuration. 39 | You will need to export your desired AWS profile and to have 40 | set up `assume role` permissions and an MFA device for that 41 | exported profile. You'll also need to make a copy of tfvars.tpl for 42 | adding parameters specific to your deployment. 43 | 44 | ```bash 45 | export AWS_PROFILE=my_profile 46 | cp tfvars.tpl tfvars 47 | # update tfvars with values appropriate to the EMR cluster you'd like 48 | make auth.json 49 | make cluster 50 | ``` 51 | 52 | `make auth.json` will prompt you for your MFA key and produce a 53 | session which terraform can use to get around MFA restrictions. 54 | 55 | **Note:** long startup times (10 minutes or more) probably indicate that you have 56 | chosen a spot price that is too low. 57 | 58 | This basic cluster will have a running Zeppelin interface that can be accessed 59 | via SSH tunnel with 60 | [foxyproxy](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-connect-master-node-proxy.html). 61 | 62 | ![Zeppelin Welcome](./images/zeppelin-welcome.png) 63 | 64 | This cluster will not have access to any code until we upload the 65 | appropriate jars and register them within Zeppelin, which happens once 66 | 67 | ```bash 68 | make upload-assembly 69 | ``` 70 | 71 | is issued.
Upon doing so, you must configure Zeppelin to recognize this 72 | resource by going to the interpreters tab: 73 | 74 | ![Zeppelin interpreters](./images/zeppelin-interpreters.png) 75 | 76 | Edit the spark interpreter settings by adding the GeoTrellis jar into the 77 | class path (`make upload-assembly` copies the fat jar into, e.g., 78 | `/tmp/geotrellis-spark-etl-assembly-1.2.0-SNAPSHOT.jar`): 79 | 80 | ![Zeppelin interpreter edit](./images/zeppelin-interpreter-edit.png) 81 | 82 | You may then create a new notebook: 83 | 84 | ![Zeppelin Osmesa Notebook](./images/zeppelin-osmesa-notebook.png) 85 | 86 | wherein GeoTrellis deps can be imported: 87 | 88 | ![Zeppelin Osmesa example](./images/zeppelin-osmesa-example.png) 89 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform* 2 | terraform.tfstate* 3 | tfvars 4 | cluster-tfplan 5 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/aws.tf: -------------------------------------------------------------------------------- 1 | 2 | # Marks AWS as a resource provider. 3 | provider "aws" { 4 | profile = "${var.aws_profile}" 5 | region = "${var.aws_region}" 6 | } 7 | 8 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/cluster-configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "spark", 4 | "Properties": { 5 | "maximizeResourceAllocation": "true" 6 | } 7 | }, 8 | { 9 | "Classification": "spark-defaults", 10 | "Properties": { 11 | "spark.driver.maxResultSize": "3G", 12 | "spark.dynamicAllocation.enabled": "true", 13 | "spark.shuffle.service.enabled": "true", 14 | "spark.shuffle.compress": "true", 15 | "spark.shuffle.spill.compress": "true", 16 | "spark.rdd.compress": "true", 17 | "spark.yarn.executor.memoryOverhead": "1G", 18 | "spark.yarn.driver.memoryOverhead": "1G", 19 | "spark.driver.maxResultSize": "3G", 20 | "spark.executor.extraJavaOptions" : "-XX:+UseParallelGC -Dgeotrellis.s3.threads.rdd.write=64" 21 | } 22 | }, 23 | { 24 | "Classification": "yarn-site", 25 | "Properties": { 26 | "yarn.resourcemanager.am.max-attempts": "1", 27 | "yarn.nodemanager.vmem-check-enabled": "false", 28 | "yarn.nodemanager.pmem-check-enabled": "false" 29 | } 30 | } 31 | ] 32 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/emr.tf: -------------------------------------------------------------------------------- 1 | resource "aws_emr_cluster" "emr-spark-cluster" { 2 | name = "${var.user} - ${var.cluster_name}" 3 | applications = ["Hadoop", "Spark", "Ganglia", "Zeppelin"] 4 | release_label = "emr-5.8.0" 5 | service_role = "${var.emr_service_role}" 6 | 7 | ec2_attributes { 8 | instance_profile = "${var.emr_instance_profile}" 9 | key_name = "${var.key_name}" 10 | 11 | emr_managed_master_security_group = "${aws_security_group.emr-cluster.id}" 12 | emr_managed_slave_security_group = "${aws_security_group.emr-cluster.id}" 13 | } 14 | 15 | instance_group { 16 | instance_count = 1 17 | instance_role = "MASTER" 18 | instance_type = "${var.master_instance_type}" 19 | name = "emr-master" 20 | } 21 | 22 | instance_group { 23 | bid_price = "${var.bid_price}" 24 | instance_count = "${var.worker_count}" 25 | instance_role = "CORE" 
26 | instance_type = "${var.worker_instance_type}" 27 | name = "emr-worker" 28 | } 29 | 30 | configurations = "cluster-configurations.json" 31 | } 32 | 33 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/outputs.tf: -------------------------------------------------------------------------------- 1 | output "emr-id" { 2 | value = "${aws_emr_cluster.emr-spark-cluster.id}" 3 | } 4 | 5 | output "emr-master" { 6 | value = "${aws_emr_cluster.emr-spark-cluster.master_public_dns}" 7 | } 8 | 9 | output "key-name" { 10 | value = "${var.key_name}" 11 | } 12 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/security-group.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "emr-cluster" { 2 | ingress { 3 | from_port = 0 4 | to_port = 0 5 | protocol = "-1" 6 | self = true 7 | } 8 | 9 | ingress { 10 | from_port = "22" 11 | to_port = "22" 12 | protocol = "tcp" 13 | cidr_blocks = ["0.0.0.0/0"] 14 | } 15 | 16 | egress { 17 | from_port = 0 18 | to_port = 0 19 | protocol = "-1" 20 | cidr_blocks = ["0.0.0.0/0"] 21 | } 22 | 23 | lifecycle { 24 | create_before_destroy = true 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /deployment/batch/terraform/cluster/variables.tf: -------------------------------------------------------------------------------- 1 | variable "aws_region" { 2 | type = "string" 3 | description = "AWS Region" 4 | default = "us-east-1" 5 | } 6 | 7 | variable "aws_profile" { 8 | type = "string" 9 | description = "AWS Profile" 10 | } 11 | 12 | variable "key_name" { 13 | type = "string" 14 | description = "The name of the EC2 secret key (primarily for SSH access)" 15 | } 16 | 17 | variable "worker_count" { 18 | type = "string" 19 | description = "The number of worker nodes" 20 | default = "1" 21 | } 22 | 23 | variable "emr_service_role" { 24 | type = "string" 25 | description = "EMR service role" 26 | default = "EMR_DefaultRole" 27 | } 28 | 29 | variable "emr_instance_profile" { 30 | type = "string" 31 | description = "EMR instance profile" 32 | default = "EMR_EC2_DefaultRole" 33 | } 34 | 35 | variable "bid_price" { 36 | type = "string" 37 | description = "Bid Price" 38 | default = "0.07" 39 | } 40 | 41 | variable "user" { 42 | default = "EMR" 43 | } 44 | 45 | variable "cluster_name" { 46 | default = "Testing" 47 | } 48 | 49 | variable "master_instance_type" { 50 | default = "m3.2xlarge" 51 | } 52 | 53 | variable "worker_instance_type" { 54 | default = "m3.xlarge" 55 | } 56 | -------------------------------------------------------------------------------- /deployment/batch/terraform/tfvars.tpl: -------------------------------------------------------------------------------- 1 | aws_region = "" 2 | 3 | key_name = "" 4 | 5 | worker_count = "" 6 | 7 | cluster_name = "" 8 | -------------------------------------------------------------------------------- /deployment/docker/Dockerfile.osm_apps: -------------------------------------------------------------------------------- 1 | FROM bde2020/spark-master:2.4.4-hadoop2.7 2 | 3 | COPY osmesa-apps.jar /opt/osmesa-apps.jar 4 | COPY log4j.properties /spark/conf/ 5 | COPY refresh-views.sh /usr/local/bin/refresh-views.sh 6 | 7 | WORKDIR /opt 8 | -------------------------------------------------------------------------------- /deployment/docker/Dockerfile.osm_refresh: 
-------------------------------------------------------------------------------- 1 | FROM alpine:3.12 2 | 3 | RUN apk update && apk add bash postgresql-client 4 | COPY refresh-views.sh /usr/local/bin/refresh-views.sh 5 | 6 | WORKDIR /opt 7 | -------------------------------------------------------------------------------- /deployment/docker/build-containers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "No version tag has been set. Do not run this script directly; instead, issue" 5 | echo " make build-container" 6 | echo "from the 'streaming' directory." 7 | exit 1 8 | else 9 | echo "Version tag is set to '${VERSION_TAG}'" 10 | fi 11 | 12 | set -xe 13 | SBT_DIR="../../src" 14 | JAR_DIR=${SBT_DIR}/apps/target/scala-2.11/ 15 | DOCKER_DIR=$(pwd) 16 | 17 | cp ${JAR_DIR}/osmesa-apps.jar ${DOCKER_DIR}/osmesa-apps.jar 18 | docker build -f Dockerfile.osm_apps --tag osm_apps:${VERSION_TAG} ${DOCKER_DIR} 19 | docker build -f Dockerfile.osm_refresh --tag osm_refresh:${VERSION_TAG} ${DOCKER_DIR} 20 | rm ${DOCKER_DIR}/osmesa-apps.jar 21 | -------------------------------------------------------------------------------- /deployment/docker/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.out 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | # log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c: %m%n 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 7 | log4j.logger.osmesa=DEBUG 8 | log4j.logger.vectorpipe=DEBUG -------------------------------------------------------------------------------- /deployment/docker/refresh-views.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "$(date -Iseconds): Starting view refreshment in $DATABASE_NAME" 4 | 5 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently user_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 6 | echo "$(date -Iseconds): Refreshing user statistics" 7 | # refresh in the background to return immediately 8 | psql -Aqt \ 9 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics" \ 10 | -c "UPDATE refreshments SET updated_at=now() where mat_view='user_statistics'" \ 11 | $DATABASE_URL & 12 | else 13 | echo "$(date -Iseconds): User stats table already refreshing" 14 | fi 15 | 16 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently hashtag_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 17 | echo "$(date -Iseconds): Refreshing hashtag statistics" 18 | # refresh in the background to return immediately 19 | psql -Aqt \ 20 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY hashtag_statistics" \ 21 | -c "UPDATE refreshments SET updated_at=now() where mat_view='hashtag_statistics'" \ 22 | $DATABASE_URL & 23 | else 24 | echo "$(date -Iseconds): Hashtag stats table already refreshing" 25 | fi 26 | 27 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently country_statistics%' and state='active' and datname='$DATABASE_NAME'" 
$DATABASE_URL 2> /dev/null)" == "0" ]; then 28 | # refresh in the background to return immediately 29 | echo "$(date -Iseconds): Refreshing country statistics" 30 | psql -Aqt \ 31 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY country_statistics" \ 32 | -c "UPDATE refreshments SET updated_at=now() where mat_view='country_statistics'" \ 33 | $DATABASE_URL & 34 | else 35 | echo "$(date -Iseconds): Country stats table already refreshing" 36 | fi 37 | 38 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently hashtag_user_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 39 | # refresh in the background to return immediately 40 | echo "$(date -Iseconds): Refreshing hashtag/user statistics" 41 | psql -Aqt \ 42 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY hashtag_user_statistics" \ 43 | -c "UPDATE refreshments SET updated_at=now() where mat_view='hashtag_user_statistics'" \ 44 | $DATABASE_URL & 45 | else 46 | echo "$(date -Iseconds): Hashtag/user stats table already refreshing" 47 | fi 48 | 49 | wait 50 | echo "$(date -Iseconds): Completed" 51 | -------------------------------------------------------------------------------- /deployment/docker/sources.list: -------------------------------------------------------------------------------- 1 | deb http://mirrors.linode.com/debian/ stretch main 2 | deb-src http://mirrors.linode.com/debian/ stretch main 3 | deb http://mirrors.linode.com/debian-security/ stretch/updates main 4 | deb-src http://mirrors.linode.com/debian-security/ stretch/updates main 5 | deb http://mirrors.linode.com/debian/ stretch-updates main 6 | deb-src http://mirrors.linode.com/debian/ stretch-updates main 7 | -------------------------------------------------------------------------------- /deployment/monitor-checkpoints.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Send email if ChangesetStatsUpdater is at least OFFSET_THRESHOLD minutes behind. 3 | # Requires that `mailx` be installed. 4 | # 5 | # Ensure the following env variables are set: 6 | # - DATABASE_URL: A valid postgres connection string 7 | # - ENVIRONMENT: A unique string describing the environment, usually "staging"|"production" 8 | # - FROM_EMAIL: Email address to send alert from 9 | # - TO_EMAIL: Email address to send alert to 10 | # - SMTP_HOSTNAME: Hostname of SMTP server to send mail to 11 | # Optional: 12 | # - OFFSET_THRESHOLD: Default 10. Offset in minutes to begin alerting at. 
13 | # - SMTP_PORT: Default 25 14 | 15 | set -e 16 | 17 | CHANGESET_CHECKPOINT=$(psql -Aqtc "select sequence from checkpoints where proc_name = 'ChangesetStatsUpdater'" $DATABASE_URL) 18 | EPOCH_NOW=$(date +%s) 19 | ADIFF_SEQUENCE_NOW=$(( (${EPOCH_NOW} - 1347432900) / 60 )) 20 | 21 | OFFSET=$(( ${ADIFF_SEQUENCE_NOW} - ${CHANGESET_CHECKPOINT} )) 22 | 23 | if (( ${OFFSET} >= ${OFFSET_THRESHOLD:-10} )); then 24 | echo "OSMesa ChangesetStatsUpdater in ${ENVIRONMENT} is behind by ${OFFSET}" | \ 25 | mailx \ 26 | -s "ALERT: OSMesa ChangesetStats Slow (${ENVIRONMENT})" \ 27 | -S smtp=smtp://${SMTP_HOSTNAME}:${SMTP_PORT:-25} \ 28 | -S from="${FROM_EMAIL}" \ 29 | "${TO_EMAIL}" 30 | fi 31 | -------------------------------------------------------------------------------- /deployment/sql/02-checkpoints.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE checkpoints ( 2 | proc_name text NOT NULL UNIQUE, 3 | sequence integer NOT NULL, 4 | PRIMARY KEY(proc_name) 5 | ); 6 | 7 | -------------------------------------------------------------------------------- /deployment/sql/03-users.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE users ( 2 | id integer NOT NULL, 3 | name text, 4 | PRIMARY KEY(id) 5 | ); 6 | -------------------------------------------------------------------------------- /deployment/sql/04-hashtags.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE hashtags ( 2 | id serial, 3 | hashtag text NOT NULL UNIQUE, 4 | PRIMARY KEY(id) 5 | ); 6 | 7 | CREATE UNIQUE INDEX ON hashtags (hashtag); 8 | 9 | -- support for LIKE queries on hashtags 10 | CREATE EXTENSION pg_trgm; 11 | CREATE INDEX trgm_idx_hashtags ON hashtags USING gin (hashtag gin_trgm_ops); 12 | -------------------------------------------------------------------------------- /deployment/sql/05-errors.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE errors ( 2 | id bigint NOT NULL, 3 | type smallint, 4 | sequence integer, 5 | tags jsonb, 6 | nds bigint[], 7 | changeset bigint, 8 | uid bigint, 9 | "user" text, 10 | updated timestamp with time zone, 11 | visible boolean, 12 | version integer, 13 | PRIMARY KEY(id) 14 | ); 15 | -------------------------------------------------------------------------------- /deployment/sql/06-changesets.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE changesets ( 2 | id bigint NOT NULL, 3 | measurements jsonb, 4 | counts jsonb, 5 | total_edits integer, 6 | editor text, 7 | user_id integer, 8 | created_at timestamp with time zone, 9 | closed_at timestamp with time zone, 10 | augmented_diffs integer[], 11 | updated_at timestamp with time zone, 12 | PRIMARY KEY(id) 13 | ); 14 | 15 | CREATE INDEX changesets_user_id ON changesets(user_id); 16 | 17 | CREATE INDEX changesets_created_at_index 18 | ON changesets (created_at); 19 | 20 | CREATE INDEX changesets_closed_at_index 21 | ON changesets (closed_at); 22 | 23 | CREATE INDEX changesets_updated_at_index 24 | ON changesets (updated_at); 25 | 26 | -------------------------------------------------------------------------------- /deployment/sql/07-changesets_countries.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE changesets_countries ( 2 | changeset_id integer NOT NULL 3 | CONSTRAINT changesets_countries_changesets_id_fk 4 | REFERENCES changesets, 5 | 
country_id integer NOT NULL 6 | CONSTRAINT changesets_countries_countries_id_fk 7 | REFERENCES countries, 8 | edit_count integer NOT NULL, 9 | augmented_diffs integer[], 10 | PRIMARY KEY(changeset_id, country_id) 11 | ); 12 | 13 | -- support joining on foreign keys (add index in reverse order of the primary key) 14 | CREATE INDEX changesets_countries_country_id_changeset_id_index 15 | ON changesets_countries (country_id, changeset_id); 16 | -------------------------------------------------------------------------------- /deployment/sql/08-changesets_hashtags.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE changesets_hashtags ( 2 | changeset_id integer NOT NULL 3 | CONSTRAINT changesets_hashtags_changesets_id_fk 4 | REFERENCES changesets, 5 | hashtag_id integer NOT NULL 6 | CONSTRAINT changesets_hashtags_hashtags_id_fk 7 | REFERENCES hashtags, 8 | PRIMARY KEY(changeset_id, hashtag_id) 9 | ); 10 | 11 | -- support joining on foreign keys (add index in reverse order of the primary key) 12 | CREATE INDEX changesets_hashtags_hashtag_id_changeset_id_index 13 | ON changesets_hashtags (hashtag_id, changeset_id); 14 | -------------------------------------------------------------------------------- /deployment/sql/README.md: -------------------------------------------------------------------------------- 1 | ## SQL Definitions 2 | 3 | This directory contains files with SQL definitions to set up a fresh OSMesa stats database. The files in this top level are the definitions for the required tables which are constructed by the batch ingest process and subsequently updated by the streaming tasks. The SQL files in the `materialized_views` directory are used to create aggregated summaries of the fundamental stats that are more useful for direct consumption by a user. These materialized views will not, however, automatically update; they will need to be refreshed periodically. 
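For reference, the refresh pattern used by `deployment/docker/refresh-views.sh` boils down to two statements per view; the sketch below shows them for `user_statistics` and assumes the unique index created at the end of that view's definition is in place (required for `CONCURRENTLY`):

```sql
-- Rebuild the view without blocking readers; CONCURRENTLY requires the
-- unique index defined in materialized_views/user_statistics.sql.
REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics;

-- Record the refresh time so consumers can see how fresh the view is.
UPDATE refreshments SET updated_at = now() WHERE mat_view = 'user_statistics';
```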
4 | -------------------------------------------------------------------------------- /deployment/sql/materialized_views/country_statistics.sql: -------------------------------------------------------------------------------- 1 | DROP MATERIALIZED VIEW IF EXISTS country_statistics; 2 | CREATE MATERIALIZED VIEW country_statistics AS 3 | WITH changesets AS ( 4 | SELECT 5 | * 6 | FROM changesets 7 | -- ignore users 0 and 1 8 | WHERE user_id > 1 9 | ), 10 | general AS ( 11 | SELECT 12 | country_id, 13 | max(coalesce(closed_at, created_at)) last_edit, 14 | count(*) changeset_count, 15 | sum(coalesce(edit_count, 0)) edit_count, 16 | max(updated_at) updated_at 17 | FROM changesets 18 | JOIN changesets_countries ON changesets.id = changesets_countries.changeset_id 19 | GROUP BY country_id 20 | ), 21 | processed_changesets AS ( 22 | SELECT 23 | id, 24 | user_id, 25 | country_id, 26 | measurements, 27 | counts, 28 | edit_count 29 | FROM changesets 30 | JOIN changesets_countries ON changesets.id = changesets_countries.changeset_id 31 | ), 32 | hashtag_counts AS ( 33 | SELECT 34 | RANK() OVER (PARTITION BY country_id ORDER BY sum(coalesce(edit_count, 0)) DESC) AS rank, 35 | country_id, 36 | hashtag, 37 | count(*) changesets, 38 | sum(coalesce(edit_count, 0)) edits 39 | FROM processed_changesets 40 | JOIN changesets_hashtags ON processed_changesets.id = changesets_hashtags.changeset_id 41 | JOIN hashtags ON changesets_hashtags.hashtag_id = hashtags.id 42 | GROUP BY country_id, hashtag 43 | ), 44 | hashtags AS ( 45 | SELECT 46 | country_id, 47 | jsonb_object_agg(hashtag, changesets) hashtag_changesets, 48 | jsonb_object_agg(hashtag, edits) hashtag_edits 49 | FROM hashtag_counts 50 | WHERE rank <= 10 51 | GROUP BY country_id 52 | ), 53 | user_counts AS ( 54 | SELECT 55 | RANK() OVER (PARTITION BY country_id ORDER BY sum(coalesce(edit_count, 0)) DESC) AS rank, 56 | country_id, 57 | user_id, 58 | count(*) changesets, 59 | sum(coalesce(edit_count, 0)) edits 60 | FROM processed_changesets 61 | GROUP BY country_id, user_id 62 | ), 63 | users AS ( 64 | SELECT 65 | country_id, 66 | jsonb_object_agg(user_id, changesets) user_changesets, 67 | jsonb_object_agg(user_id, edits) user_edits 68 | FROM user_counts 69 | WHERE rank <= 10 70 | GROUP BY country_id 71 | ), 72 | measurements AS ( 73 | SELECT 74 | id, 75 | country_id, 76 | key, 77 | value 78 | FROM processed_changesets 79 | CROSS JOIN LATERAL jsonb_each(measurements) 80 | ), 81 | aggregated_measurements_kv AS ( 82 | SELECT 83 | country_id, 84 | key, 85 | sum((value->>0)::numeric) AS value 86 | FROM measurements 87 | GROUP BY country_id, key 88 | ), 89 | aggregated_measurements AS ( 90 | SELECT 91 | country_id, 92 | jsonb_object_agg(key, value) measurements 93 | FROM aggregated_measurements_kv 94 | GROUP BY country_id 95 | ), 96 | counts AS ( 97 | SELECT 98 | id, 99 | country_id, 100 | key, 101 | value 102 | FROM processed_changesets 103 | CROSS JOIN LATERAL jsonb_each(counts) 104 | ), 105 | aggregated_counts_kv AS ( 106 | SELECT 107 | country_id, 108 | key, 109 | sum((value->>0)::numeric) AS value 110 | FROM counts 111 | GROUP BY country_id, key 112 | ), 113 | aggregated_counts AS ( 114 | SELECT 115 | country_id, 116 | jsonb_object_agg(key, value) counts 117 | FROM aggregated_counts_kv 118 | GROUP BY country_id 119 | ) 120 | SELECT 121 | general.country_id, 122 | countries.name country_name, 123 | countries.code country_code, 124 | -- NOTE these are per-changeset, not per-country, so stats are double-counted 125 | measurements, 126 | -- NOTE these are 
per-changeset, not per-country, so stats are double-counted 127 | counts, 128 | general.changeset_count, 129 | general.edit_count, 130 | general.last_edit, 131 | general.updated_at, 132 | user_changesets, 133 | user_edits, 134 | hashtag_changesets, 135 | hashtag_edits 136 | FROM general 137 | JOIN countries ON country_id = countries.id 138 | LEFT OUTER JOIN users USING (country_id) 139 | LEFT OUTER JOIN hashtags USING (country_id) 140 | LEFT OUTER JOIN aggregated_measurements USING (country_id) 141 | LEFT OUTER JOIN aggregated_counts USING (country_id); 142 | 143 | CREATE UNIQUE INDEX IF NOT EXISTS country_statistics_id ON country_statistics(country_code); 144 | -------------------------------------------------------------------------------- /deployment/sql/materialized_views/hashtag_statistics.sql: -------------------------------------------------------------------------------- 1 | DROP MATERIALIZED VIEW IF EXISTS hashtag_statistics; 2 | CREATE MATERIALIZED VIEW hashtag_statistics AS 3 | WITH general AS ( 4 | SELECT 5 | hashtag_id, 6 | max(coalesce(closed_at, created_at)) last_edit, 7 | count(*) changeset_count, 8 | sum(coalesce(total_edits, 0)) edit_count, 9 | max(updated_at) updated_at 10 | FROM changesets 11 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 12 | GROUP BY hashtag_id 13 | ), 14 | processed_changesets AS ( 15 | SELECT 16 | id, 17 | user_id, 18 | hashtag_id, 19 | measurements, 20 | counts, 21 | total_edits 22 | FROM changesets 23 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 24 | ), 25 | user_counts AS ( 26 | SELECT 27 | RANK() OVER (PARTITION BY hashtag_id ORDER BY sum(coalesce(total_edits, 0)) DESC) AS rank, 28 | hashtag_id, 29 | user_id, 30 | count(*) changesets, 31 | sum(coalesce(total_edits, 0)) edit_count 32 | FROM processed_changesets 33 | GROUP BY hashtag_id, user_id 34 | ), 35 | users AS ( 36 | SELECT 37 | hashtag_id, 38 | jsonb_object_agg(user_id, changesets) user_changesets, 39 | jsonb_object_agg(user_id, edit_count) user_edits 40 | FROM user_counts 41 | WHERE rank <= 10 42 | GROUP BY hashtag_id 43 | ), 44 | measurements AS ( 45 | SELECT 46 | id, 47 | hashtag_id, 48 | key, 49 | value 50 | FROM processed_changesets 51 | CROSS JOIN LATERAL jsonb_each(measurements) 52 | ), 53 | aggregated_measurements_kv AS ( 54 | SELECT 55 | hashtag_id, 56 | key, 57 | sum((value->>0)::numeric) AS value 58 | FROM measurements 59 | GROUP BY hashtag_id, key 60 | ), 61 | aggregated_measurements AS ( 62 | SELECT 63 | hashtag_id, 64 | jsonb_object_agg(key, value) measurements 65 | FROM aggregated_measurements_kv 66 | GROUP BY hashtag_id 67 | ), 68 | counts AS ( 69 | SELECT 70 | id, 71 | hashtag_id, 72 | key, 73 | value 74 | FROM processed_changesets 75 | CROSS JOIN LATERAL jsonb_each(counts) 76 | ), 77 | aggregated_counts_kv AS ( 78 | SELECT 79 | hashtag_id, 80 | key, 81 | sum((value->>0)::numeric) AS value 82 | FROM counts 83 | GROUP BY hashtag_id, key 84 | ), 85 | aggregated_counts AS ( 86 | SELECT 87 | hashtag_id, 88 | jsonb_object_agg(key, value) counts 89 | FROM aggregated_counts_kv 90 | GROUP BY hashtag_id 91 | ) 92 | SELECT 93 | hashtags.hashtag tag, 94 | general.hashtag_id, 95 | measurements, 96 | counts, 97 | general.changeset_count, 98 | general.edit_count, 99 | general.last_edit, 100 | general.updated_at, 101 | user_changesets, 102 | user_edits 103 | FROM general 104 | JOIN hashtags ON hashtag_id = hashtags.id 105 | LEFT OUTER JOIN users USING (hashtag_id) 106 | LEFT OUTER JOIN aggregated_measurements USING 
(hashtag_id) 107 | LEFT OUTER JOIN aggregated_counts USING (hashtag_id); 108 | 109 | CREATE UNIQUE INDEX IF NOT EXISTS hashtag_statistics_hashtag_id ON hashtag_statistics(hashtag_id); -------------------------------------------------------------------------------- /deployment/sql/materialized_views/hashtag_user_statistics.sql: -------------------------------------------------------------------------------- 1 | DROP MATERIALIZED VIEW IF EXISTS hashtag_user_statistics; 2 | CREATE MATERIALIZED VIEW hashtag_user_statistics AS 3 | WITH general AS ( 4 | SELECT 5 | user_id, 6 | hashtag_id, 7 | array_agg(id) changesets, 8 | max(coalesce(closed_at, created_at)) last_edit, 9 | count(*) changeset_count, 10 | sum(coalesce(total_edits, 0)) edit_count, 11 | max(updated_at) updated_at 12 | FROM changesets 13 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 14 | GROUP BY user_id, hashtag_id 15 | ), 16 | measurements AS ( 17 | SELECT 18 | id, 19 | user_id, 20 | hashtag_id, 21 | key, 22 | value 23 | FROM changesets 24 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 25 | CROSS JOIN LATERAL jsonb_each(measurements) 26 | ), 27 | aggregated_measurements_kv AS ( 28 | SELECT 29 | user_id, 30 | hashtag_id, 31 | key, 32 | sum((value->>0)::numeric) AS value 33 | FROM measurements 34 | GROUP BY user_id, hashtag_id, key 35 | ), 36 | aggregated_measurements AS ( 37 | SELECT 38 | user_id, 39 | hashtag_id, 40 | jsonb_object_agg(key, value) measurements 41 | FROM aggregated_measurements_kv 42 | GROUP BY user_id, hashtag_id 43 | ), 44 | counts AS ( 45 | SELECT 46 | id, 47 | user_id, 48 | hashtag_id, 49 | key, 50 | value 51 | FROM changesets 52 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 53 | CROSS JOIN LATERAL jsonb_each(counts) 54 | ), 55 | aggregated_counts_kv AS ( 56 | SELECT 57 | user_id, 58 | hashtag_id, 59 | key, 60 | sum((value->>0)::numeric) AS value 61 | FROM counts 62 | GROUP BY user_id, hashtag_id, key 63 | ), 64 | aggregated_counts AS ( 65 | SELECT 66 | user_id, 67 | hashtag_id, 68 | jsonb_object_agg(key, value) counts 69 | FROM aggregated_counts_kv 70 | GROUP BY user_id, hashtag_id 71 | ) 72 | SELECT 73 | user_id, 74 | users.name, 75 | general.hashtag_id, 76 | hashtags.hashtag, 77 | measurements, 78 | counts, 79 | last_edit, 80 | changeset_count, 81 | edit_count, 82 | updated_at 83 | FROM general 84 | LEFT OUTER JOIN hashtags ON general.hashtag_id = hashtags.id 85 | LEFT OUTER JOIN aggregated_measurements USING (user_id, hashtag_id) 86 | LEFT OUTER JOIN aggregated_counts USING (user_id, hashtag_id) 87 | JOIN users ON user_id = users.id; 88 | 89 | CREATE UNIQUE INDEX IF NOT EXISTS hashtag_user_statistics_pk ON hashtag_user_statistics(hashtag_id, user_id); -------------------------------------------------------------------------------- /deployment/sql/materialized_views/refreshments.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE refreshments ( 2 | mat_view text NOT NULL, 3 | updated_at timestamp with time zone, 4 | PRIMARY KEY(mat_view) 5 | ); 6 | 7 | INSERT INTO refreshments VALUES ('user_statistics', to_timestamp(0)), ('country_statistics', to_timestamp(0)), ('hashtag_statistics', to_timestamp(0)), ('hashtag_user_statistics', to_timestamp(0)); -------------------------------------------------------------------------------- /deployment/sql/materialized_views/user_statistics.sql: -------------------------------------------------------------------------------- 1 | 
DROP MATERIALIZED VIEW IF EXISTS user_statistics; 2 | CREATE MATERIALIZED VIEW user_statistics AS 3 | WITH general AS ( 4 | SELECT 5 | user_id, 6 | array_agg(id) changesets, 7 | max(coalesce(closed_at, created_at)) last_edit, 8 | count(*) changeset_count, 9 | sum(coalesce(total_edits, 0)) edit_count, 10 | max(updated_at) updated_at 11 | FROM changesets 12 | GROUP BY user_id 13 | ), 14 | country_counts AS ( 15 | SELECT 16 | user_id, 17 | code, 18 | count(*) changesets, 19 | sum(coalesce(total_edits, 0)) edits 20 | FROM changesets 21 | JOIN changesets_countries ON changesets.id = changesets_countries.changeset_id 22 | JOIN countries ON changesets_countries.country_id = countries.id 23 | GROUP BY user_id, code 24 | ), 25 | countries AS ( 26 | SELECT 27 | user_id, 28 | jsonb_object_agg(code, changesets) country_changesets, 29 | jsonb_object_agg(code, edits) country_edits 30 | FROM country_counts 31 | GROUP BY user_id 32 | ), 33 | edit_day_counts AS ( 34 | SELECT 35 | user_id, 36 | date_trunc('day', coalesce(closed_at, created_at))::date AS day, 37 | count(*) changesets, 38 | sum(coalesce(total_edits, 0)) edits 39 | FROM changesets 40 | WHERE coalesce(closed_at, created_at) IS NOT NULL 41 | GROUP BY user_id, day 42 | ), 43 | edit_days AS ( 44 | SELECT 45 | user_id, 46 | jsonb_object_agg(day, changesets) day_changesets, 47 | jsonb_object_agg(day, edits) day_edits 48 | FROM edit_day_counts 49 | GROUP BY user_id 50 | ), 51 | editor_counts AS ( 52 | SELECT 53 | RANK() OVER (PARTITION BY user_id ORDER BY sum(coalesce(total_edits, 0)) DESC) AS rank, 54 | user_id, 55 | editor, 56 | count(*) changesets, 57 | sum(coalesce(total_edits, 0)) edits 58 | FROM changesets 59 | WHERE editor IS NOT NULL 60 | GROUP BY user_id, editor 61 | ), 62 | editors AS ( 63 | SELECT 64 | user_id, 65 | jsonb_object_agg(editor, changesets) editor_changesets, 66 | jsonb_object_agg(editor, edits) editor_edits 67 | FROM editor_counts 68 | WHERE rank <= 10 69 | GROUP BY user_id 70 | ), 71 | hashtag_counts AS ( 72 | SELECT 73 | RANK() OVER (PARTITION BY user_id ORDER BY sum(coalesce(total_edits, 0)) DESC) AS rank, 74 | user_id, 75 | hashtag, 76 | count(*) changesets, 77 | sum(coalesce(total_edits)) edits 78 | FROM changesets 79 | JOIN changesets_hashtags ON changesets.id = changesets_hashtags.changeset_id 80 | JOIN hashtags ON changesets_hashtags.hashtag_id = hashtags.id 81 | GROUP BY user_id, hashtag 82 | ), 83 | hashtags AS ( 84 | SELECT 85 | user_id, 86 | jsonb_object_agg(hashtag, changesets) hashtag_changesets, 87 | jsonb_object_agg(hashtag, edits) hashtag_edits 88 | FROM hashtag_counts 89 | WHERE rank <= 50 90 | GROUP BY user_id 91 | ), 92 | measurements AS ( 93 | SELECT 94 | id, 95 | user_id, 96 | key, 97 | value 98 | FROM changesets 99 | CROSS JOIN LATERAL jsonb_each(measurements) 100 | ), 101 | aggregated_measurements_kv AS ( 102 | SELECT 103 | user_id, 104 | key, 105 | sum((value->>0)::numeric) AS value 106 | FROM measurements 107 | GROUP BY user_id, key 108 | ), 109 | aggregated_measurements AS ( 110 | SELECT 111 | user_id, 112 | jsonb_object_agg(key, value) measurements 113 | FROM aggregated_measurements_kv 114 | GROUP BY user_id 115 | ), 116 | counts AS ( 117 | SELECT 118 | id, 119 | user_id, 120 | key, 121 | value 122 | FROM changesets 123 | CROSS JOIN LATERAL jsonb_each(counts) 124 | ), 125 | aggregated_counts_kv AS ( 126 | SELECT 127 | user_id, 128 | key, 129 | sum((value->>0)::numeric) AS value 130 | FROM counts 131 | GROUP BY user_id, key 132 | ), 133 | aggregated_counts AS ( 134 | SELECT 135 | user_id, 136 | 
jsonb_object_agg(key, value) counts 137 | FROM aggregated_counts_kv 138 | GROUP BY user_id 139 | ) 140 | SELECT 141 | user_id AS id, 142 | users.name, 143 | measurements, 144 | counts, 145 | last_edit, 146 | changeset_count, 147 | edit_count, 148 | editor_changesets, 149 | editor_edits, 150 | day_changesets, 151 | day_edits, 152 | country_changesets, 153 | country_edits, 154 | hashtag_changesets, 155 | hashtag_edits, 156 | updated_at 157 | FROM general 158 | LEFT OUTER JOIN countries USING (user_id) 159 | LEFT OUTER JOIN editors USING (user_id) 160 | LEFT OUTER JOIN edit_days USING (user_id) 161 | LEFT OUTER JOIN hashtags USING (user_id) 162 | LEFT OUTER JOIN aggregated_measurements USING (user_id) 163 | LEFT OUTER JOIN aggregated_counts USING (user_id) 164 | JOIN users ON user_id = users.id; 165 | 166 | CREATE UNIQUE INDEX IF NOT EXISTS user_statistics_id ON user_statistics(id); 167 | -------------------------------------------------------------------------------- /deployment/streaming/.gitignore: -------------------------------------------------------------------------------- 1 | config-*.mk 2 | -------------------------------------------------------------------------------- /deployment/streaming/README.md: -------------------------------------------------------------------------------- 1 | # Streaming Stats Deployment via AWS ECS 2 | 3 | Amazon ECS is a system for deploying containers on top of AWS managed 4 | infrastructure. ECS is the deployment strategy we've provided resources 5 | for and would suggest because failures and even the hardware hiccups 6 | (say, the loss of a machine) will be automatically resolved so that 7 | the stream can get back to work. In conjunction with a checkpointing 8 | mechanism which ensures the stream starts near where it left off, these 9 | streams are highly resilient. 10 | 11 | An ECS deployment consists of a few different pieces: 12 | 13 | - The ECS cluster: scales EC2 instances up and down as necessary 14 | - Services: describe long-running programs that should maintain availability 15 | - Tasks: one or more containerized processes being run by the cluster 16 | - Containers: docker images uploaded to AWS ECR to be pulled upon each task creation 17 | 18 | The long-running stream which keeps statistics up to date by 19 | continuously polling Overpass augmented diffs and OSM changesets is 20 | deployed as an ECS cluster. This cluster has a service that tracks 21 | a lone streaming task and reboots the stream from the latest saved 22 | checkpoint (which lives on the table 'checkpoints' in the database being 23 | updated) to ensure that failures aren't fatal to the long-running 24 | process. 25 | 26 | Our ECS deployment process relies on the use of the `ecs-cli` tool, which is 27 | similar in spirit to `docker-compose`, but manages containers on ECS instead 28 | of on a local docker instance. You can install `ecs-cli` by issuing the 29 | command 30 | ```bash 31 | curl -o /usr/local/bin/ecs-cli https://s3.amazonaws.com/amazon-ecs-cli/ecs-cli-linux-amd64-latest 32 | ``` 33 | 34 | ## Deployment Steps 35 | 36 | 1. Copy `config-aws.mk.example` to `config-aws.mk` and 37 | `config-local.mk.example` to `config-local.mk`. These can be configured in a 38 | moment. 39 | 40 | 2. Build the osm_apps container 41 | 42 | ```bash 43 | make build-container 44 | ``` 45 | 46 | 3. Create an IAM role for EC2 instances. This will become `INSTANCE_ROLE`. 47 | The "AmazonEC2ContainerServiceforEC2Role" policy should be attached. 48 | 49 | 4. 
Edit [config-aws.mk.tpl](./config-aws.mk.tpl) with variables appropriate 50 | to your AWS account and desired deployment (choose VPCs, Security Groups, 51 | etc) and save the file as `config-aws.mk` Note that you will need to provide 52 | an ECR repo URI (which you'll have to set up manually via the AWS console) in 53 | order to use your container on AWS. 54 | 55 | 5. Manually create an ECS cluster backed by EC2 instances (not fargate), and 56 | be sure to record the cluster name in `config-aws.mk`. It should now be 57 | possible to configure ECS-CLI to deploy services against your cluster: 58 | 59 | ```bash 60 | make configure-cluster 61 | ``` 62 | 63 | 6. Assuming all's well, you're ready to deploy. Update the docker-compose 64 | which defines your services with the appropriate variables: 65 | 66 | ```bash 67 | make docker-compose.deploy.yml 68 | ``` 69 | 70 | 7. Push your image to ECR: 71 | 72 | ```bash 73 | make push-image 74 | ``` 75 | 76 | 8. Bring the cluster up: 77 | 78 | ```bash 79 | make cluster-up 80 | ``` 81 | 82 | 9. Deploy the service (this will create new task definitions as necessary): 83 | 84 | ```bash 85 | make start-service 86 | ``` 87 | 88 | ### Updating an Existing ECS Service 89 | 90 | If there is already a streaming task running on an ECS cluster that needs to 91 | be updated, then the procedure above can be abbreviated. Please perform steps 92 | 1, 2, 4, 7, and 9. 93 | 94 | ## Local Testing 95 | 96 | From a clean environment, 97 | 98 | 1. In the `deployment/streaming` directory, update the missing values in 99 | `config-local.mk.example` (LOCAL_AUGDIFF_SOURCE, LOCAL_AUGDIFF_START, 100 | LOCAL_CHANGE_START, LOCAL_CHANGESET_START) and save to `config-local.mk`. 101 | 102 | 2. In the same directory, ensure that `config-aws.mk` exists. You may `touch` 103 | the file if it does not. 104 | 105 | 3. Execute `make build-container` followed by `make start-local`. You should 106 | observe a stream of log messages. Any errors should appear in this window. 107 | 108 | 4. If you want to verify that the system is operating up to spec, you may 109 | ```bash 110 | docker exec -it streaming_db_1 bash 111 | psql -U postgres 112 | ``` 113 | (The trailing "1" may need to be incremented. See `docker ps` for the proper 114 | name.) From there, you may issue a `\d` directive and verify that the DB is 115 | populated with the correct tables. 116 | 117 | 5. You may now test the operation of the system in the DB interface by issuing 118 | queries against the available tables and observing the log output. The 119 | content of the tables will update as the system runs. 
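As a rough illustration of that check, a couple of queries against the tables defined under `deployment/sql` (table and column names as created by those scripts) might look like:

```sql
-- Confirm the streaming updaters are recording augmented diff checkpoints.
SELECT proc_name, sequence FROM checkpoints ORDER BY proc_name;

-- Watch rows accumulate as changesets are processed by the stream.
SELECT count(*) AS changeset_rows, max(updated_at) AS most_recent FROM changesets;
```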
120 | -------------------------------------------------------------------------------- /deployment/streaming/config-deployment.mk.template: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # AWS properties 3 | ################################################################################ 4 | export KEYPAIR := 5 | export SUBNET := 6 | export AWS_REGION := us-east-1 7 | export IAM_ACCOUNT := 8 | 9 | ################################################################################ 10 | # Streaming resource definitions 11 | ################################################################################ 12 | export STREAMING_INSTANCE_TYPE := m4.xlarge 13 | export ECR_STATS_IMAGE := 14 | export ECR_REFRESH_IMAGE := 15 | export AWS_LOG_GROUP := streaming-stats-updater 16 | export ECS_SUBNET := ${SUBNET} 17 | export ECS_SECURITY_GROUP := 18 | 19 | export CLUSTER_NAME_DEPLOYMENT := 20 | export CLUSTER_NAME_STAGING := 21 | 22 | export NODE_OPTIONS=--max-old-space-size=7168 23 | export DRIVER_MEMORY=8192m 24 | export ECS_MEMORY_GB=8 25 | export AUGDIFF_ECS_MEMORY_GB=8 26 | 27 | export AUGDIFF_SERVICE_NAME := "azavea-overpass-diff-publisher" 28 | export AUGDIFF_ECR_IMAGE := 29 | export AUGDIFF_SOURCE := 30 | export ONRAMP_URL := 31 | export OVERPASS_URL := 32 | export CHANGESET_SOURCE := 33 | export CHANGE_SOURCE := 34 | 35 | export DB_BASE_URI := 36 | export PRODUCTION_DB := 37 | export STAGING_DB := 38 | 39 | export NETWORK_CONFIGURATION="{\"awsvpcConfiguration\": {\"subnets\": [\"${ECS_SUBNET}\"], \"securityGroups\": [\"${ECS_SECURITY_GROUP}\"], \"assignPublicIp\": \"DISABLED\"}}" 40 | 41 | # Uncomment the following to raise resource allocations to get past a large changeset 42 | #export TURBO_BOOST := yes 43 | 44 | ################################################################################ 45 | # Batch resource definitions 46 | ################################################################################ 47 | export MASTER_SECURITY_GROUP := 48 | export WORKER_SECURITY_GROUP := 49 | export SERVICE_ACCESS_SG := 50 | export SANDBOX_SG := 51 | 52 | export S3_ROOT_URI := 53 | export S3_LOG_URI := ${S3_ROOT_URI}/logs/ 54 | 55 | export BATCH_CORE_INSTANCE_TYPE := m4.xlarge 56 | export BATCH_MASTER_INSTANCE_TYPE := m4.xlarge 57 | export OSMESA_APPS_JAR := s3:///osmesa-apps.jar 58 | 59 | export PLANET_HISTORY_PBF := 60 | export PLANET_HISTORY_ORC_DIR := 61 | export HISTORY_ORC := 62 | 63 | export CHANGESET_CSV := 64 | export CHANGESET_COMMENTS_CSV := 65 | export CHANGESET_TAGS_CSV := 66 | export USERS_CSV := 67 | export CHANGESETS_ORC := 68 | 69 | export FOOTPRINT_VT_LOCATION := 70 | export HISTOGRAM_VT_LOCATION := 71 | 72 | export MAX_PG_CONNECTIONS := 64 73 | 74 | # Uncomment the following line to save the processed stats 75 | # export STATS_SNAPSHOT_ORC := 76 | 77 | # Uncomment the following line to use the above snapshot in lieu of recalculating from history; setting to "no" will not turn feature off 78 | # export USE_SNAPSHOT := yes 79 | -------------------------------------------------------------------------------- /deployment/streaming/ecs-params.yml: -------------------------------------------------------------------------------- 1 | # this file should be in deployment dir (relative to Makefile path) 2 | # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html 3 | # NOTE: comment it out for the test case 4 | version: 1 5 | 
task_definition: 6 | services: 7 | changeset-stream: 8 | mem_reservation: 2048m 9 | augdiff-stream: 10 | mem_reservation: 2048m 11 | user-footprint-updater: 12 | mem_reservation: 4096m 13 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/batch-generate-edit-histograms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws emr create-cluster \ 9 | --applications Name=Ganglia Name=Spark \ 10 | --ebs-root-volume-size 10 \ 11 | --ec2-attributes '{ 12 | "KeyName": "${KEYPAIR}", 13 | "InstanceProfile":"EMR_EC2_DefaultRole", 14 | "ServiceAccessSecurityGroup": "${SERVICE_ACCESS_SECURITY_GROUP}", 15 | "SubnetId": "${SUBNET}", 16 | "EmrManagedSlaveSecurityGroup": "${EMR_SLAVE_SECURITY_GROUP}", 17 | "EmrManagedMasterSecurityGroup": "${EMR_MASTER_SECURITY_GROUP}" 18 | }' \ 19 | --service-role EMR_DefaultRole \ 20 | --release-label emr-5.19.0 \ 21 | --name 'Faceted State of the Data tile generation' \ 22 | --instance-groups '[ 23 | { 24 | "InstanceCount": 1, 25 | "BidPrice": "OnDemandPrice", 26 | "InstanceGroupType": "MASTER", 27 | "InstanceType": "${BATCH_MASTER_INSTANCE_TYPE}", 28 | "Name":"Master" 29 | }, { 30 | "InstanceCount": 20, 31 | "BidPrice": "OnDemandPrice", 32 | "InstanceGroupType": "CORE", 33 | "InstanceType": "${BATCH_CORE_INSTANCE_TYPE}", 34 | "Name":"Workers" 35 | } 36 | ]' \ 37 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 38 | --auto-terminate \ 39 | --region us-east-1 \ 40 | --steps '[ 41 | { 42 | "Args": [ 43 | "spark-submit", 44 | "--deploy-mode", "cluster", 45 | "--class", "osmesa.apps.batch.FacetedEditHistogramTileCreator", 46 | "--conf", "spark.executor.memoryOverhead=2g", 47 | "--conf", "spark.sql.shuffle.partitions=2000", 48 | "--conf", "spark.speculation=true", 49 | "${OSMESA_APPS_JAR}", 50 | "--history", "${HISTORY_ORC}", 51 | "--out", "${HISTOGRAM_VT_LOCATION}" 52 | ], 53 | "Type": "CUSTOM_JAR", 54 | "ActionOnFailure": "TERMINATE_CLUSTER", 55 | "Jar": "command-runner.jar", 56 | "Properties": "", 57 | "Name": "FacetedEditHistogramTileCreator" 58 | } 59 | ]' 60 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/batch-generate-footprints.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | aws emr create-cluster \ 9 | --applications Name=Ganglia Name=Spark \ 10 | --ebs-root-volume-size 10 \ 11 | --ec2-attributes '{ 12 | "KeyName": "${KEYPAIR}", 13 | "InstanceProfile":"EMR_EC2_DefaultRole", 14 | "ServiceAccessSecurityGroup": "${SERVICE_ACCESS_SECURITY_GROUP}", 15 | "SubnetId": "${SUBNET}", 16 | "EmrManagedSlaveSecurityGroup": "${EMR_SLAVE_SECURITY_GROUP}", 17 | "EmrManagedMasterSecurityGroup": "${EMR_MASTER_SECURITY_GROUP}" 18 | }' \ 19 | --service-role EMR_DefaultRole \ 20 | --release-label emr-5.19.0 \ 21 | --name 'User footprint tile generation' \ 22 | --instance-groups '[ 23 | { 24 | "InstanceCount": 1, 25 | "BidPrice": "OnDemandPrice", 26 | "InstanceGroupType": "MASTER", 27 | "InstanceType": "${BATCH_MASTER_INSTANCE_TYPE}", 28 | "Name":"Master" 29 | }, { 30 | "InstanceCount": 20, 31 | "BidPrice": "OnDemandPrice", 32 | "InstanceGroupType": "CORE", 33 | "InstanceType": "${BATCH_CORE_INSTANCE_TYPE}", 34 | "Name":"Workers" 35 | } 36 | ]' \ 37 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 38 | --auto-terminate \ 39 | --region ${AWS_REGION} \ 40 | --steps '[ 41 | { 42 | "Args": [ 43 | "spark-submit", 44 | "--deploy-mode", "cluster", 45 | "--class", "osmesa.apps.batch.FootprintCreator", 46 | "--conf", "spark.executor.memoryOverhead=2g", 47 | "--conf", "spark.sql.shuffle.partitions=2000", 48 | "--conf", "spark.speculation=true", 49 | "${OSMESA_APPS_JAR}", 50 | "--history", "${HISTORY_ORC}", 51 | "--changesets", "${CHANGESETS_ORC}", 52 | "--out", "${FOOTPRINT_VT_LOCATION}", 53 | "--type", "users", 54 | ], 55 | "Type": "CUSTOM_JAR", 56 | "ActionOnFailure": "TERMINATE_CLUSTER", 57 | "Jar": "command-runner.jar", 58 | "Properties": "", 59 | "Name": "FootprintCreator" 60 | } 61 | ]' 62 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/batch-process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | CLUSTER_NAME=$1 9 | NUM_EXECUTORS=$2 10 | 11 | shift 2 12 | 13 | ARGS= 14 | while [ "$#" -gt 1 ] ; do 15 | ARGS="$ARGS 16 | { 17 | \"Args\": $2, 18 | \"Type\": \"CUSTOM_JAR\", 19 | \"ActionOnFailure\": \"CONTINUE\", 20 | \"Jar\": \"command-runner.jar\", 21 | \"Properties\": \"\", 22 | \"Name\": \"$1\" 23 | }" 24 | if [ "$#" -gt 2 ]; then 25 | ARGS="$ARGS," 26 | fi 27 | shift 2 28 | done 29 | 30 | set -x 31 | aws emr create-cluster \ 32 | --applications Name=Ganglia Name=Spark Name=Hive \ 33 | --log-uri ${S3_LOG_URI} \ 34 | --configurations "file://scripts/emr-configurations/batch-process.json" \ 35 | --ebs-root-volume-size 10 \ 36 | --ec2-attributes "{ 37 | \"KeyName\": \"${KEYPAIR}\", 38 | \"InstanceProfile\":\"EMR_EC2_DefaultRole\", 39 | \"SubnetId\": \"${SUBNET}\", 40 | \"EmrManagedMasterSecurityGroup\": \"${MASTER_SECURITY_GROUP}\", 41 | \"EmrManagedSlaveSecurityGroup\": \"${WORKER_SECURITY_GROUP}\", 42 | \"ServiceAccessSecurityGroup\": \"${SERVICE_ACCESS_SG}\", 43 | \"AdditionalMasterSecurityGroups\": [\"${SANDBOX_SG}\"], 44 | \"AdditionalSlaveSecurityGroups\": [\"${SANDBOX_SG}\"] 45 | }" \ 46 | --service-role EMR_DefaultRole \ 47 | --release-label emr-5.29.0 \ 48 | --name "$CLUSTER_NAME" \ 49 | --instance-groups "[ 50 | { 51 | \"InstanceCount\": 1, 52 | \"BidPrice\": \"OnDemandPrice\", 53 | \"InstanceGroupType\": \"MASTER\", 54 | \"InstanceType\": \"${BATCH_MASTER_INSTANCE_TYPE}\", 55 | \"Name\":\"Master\", 56 | \"EbsConfiguration\": { 57 | \"EbsOptimized\": true, 58 | \"EbsBlockDeviceConfigs\": [{ 59 | \"VolumeSpecification\": { 60 | \"VolumeType\": \"gp2\", 61 | \"SizeInGB\": 1024 62 | } 63 | }] 64 | } 65 | }, { 66 | \"InstanceCount\": ${NUM_EXECUTORS}, 67 | \"BidPrice\": \"OnDemandPrice\", 68 | \"InstanceGroupType\": \"CORE\", 69 | \"InstanceType\": \"${BATCH_CORE_INSTANCE_TYPE}\", 70 | \"Name\":\"Workers\", 71 | \"EbsConfiguration\": { 72 | \"EbsOptimized\": true, 73 | \"EbsBlockDeviceConfigs\": [{ 74 | \"VolumeSpecification\": { 75 | \"VolumeType\": \"gp2\", 76 | \"SizeInGB\": 1024 77 | } 78 | }] 79 | } 80 | } 81 | ]" \ 82 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 83 | --auto-terminate \ 84 | --region us-east-1 \ 85 | --steps "[ 86 | $ARGS 87 | ]" 88 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/create-log-groups.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | DEFINED_GROUPS=$(aws logs describe-log-groups | jq '.logGroups[].logGroupName' | sed -e 's/"//g') 9 | 10 | if [[ $DEFINED_GROUPS != *"/ecs/${AWS_LOG_GROUP}"* ]]; then 11 | aws logs create-log-group \ 12 | --log-group-name /ecs/${AWS_LOG_GROUP} 13 | fi 14 | 15 | if [[ $DEFINED_GROUPS != *"/ecs/${AWS_LOG_GROUP}${TASK_SUFFIX}"* ]]; then 16 | aws logs create-log-group \ 17 | --log-group-name /ecs/${AWS_LOG_GROUP}${TASK_SUFFIX} 18 | fi 19 | 20 | if [[ $DEFINED_GROUPS != *"/ecs/streaming-user-footprint-tile-updater"* ]]; then 21 | aws logs create-log-group \ 22 | --log-group-name /ecs/streaming-user-footprint-tile-updater 23 | fi 24 | 25 | if [[ $DEFINED_GROUPS != *"/ecs/streaming-edit-histogram-tile-updater"* ]]; then 26 | aws logs create-log-group \ 27 | --log-group-name /ecs/streaming-edit-histogram-tile-updater 28 | fi 29 | 30 | if [[ $DEFINED_GROUPS != *"/ecs/osmesa-streaming-augdiff-producer"* ]]; then 31 | aws logs create-log-group \ 32 | --log-group-name /ecs/osmesa-streaming-augdiff-producer 33 | fi 34 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-production-streaming-update-tasks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family streaming-stats-updater \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "1 vCPU" \ 15 | --memory "4 GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"/spark/bin/spark-submit\", 28 | \"--driver-memory\", \"2048m\", 29 | \"--class\", \"osmesa.apps.streaming.StreamingChangesetStatsUpdater\", 30 | \"/opt/osmesa-apps.jar\", 31 | \"--augmented-diff-source\", \"${AUGDIFF_SOURCE}\" 32 | ], 33 | \"environment\": [ 34 | { 35 | \"name\": \"DATABASE_URL\", 36 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 37 | } 38 | ], 39 | \"image\": \"${ECR_STATS_IMAGE}:production\", 40 | \"name\": \"streaming-changeset-stats-updater\" 41 | }, 42 | { 43 | \"logConfiguration\": { 44 | \"logDriver\": \"awslogs\", 45 | \"options\": { 46 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}\", 47 | \"awslogs-region\": \"${AWS_REGION}\", 48 | \"awslogs-stream-prefix\": \"ecs\" 49 | } 50 | }, 51 | \"command\": [ 52 | \"/spark/bin/spark-submit\", 53 | \"--driver-memory\", \"2048m\", 54 | \"--class\", \"osmesa.apps.streaming.StreamingChangesetMetadataUpdater\", 55 | \"/opt/osmesa-apps.jar\", 56 | \"--changeset-source\", \"${CHANGESET_SOURCE}\" 57 | ], 58 | \"environment\": [ 59 | { 60 | \"name\": \"DATABASE_URL\", 61 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 62 | } 63 | ], 64 | \"image\": \"${ECR_STATS_IMAGE}:production\", 65 | \"name\": \"streaming-changeset-metadata-updater\" 66 | } 67 | ]" 68 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-production-view-refresher.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family osmesa-stats-view-refresher \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "0.25 vCPU" \ 15 | --memory "0.5 GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"refresh-views.sh\" 28 | ], 29 | \"environment\": [ 30 | { 31 | \"name\": \"DATABASE_URL\", 32 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 33 | }, 34 | { 35 | \"name\": \"DATABASE_NAME\", 36 | \"value\": \"${PRODUCTION_DB}\" 37 | } 38 | ], 39 | \"image\": \"${ECR_REFRESH_IMAGE}:production\", 40 | \"name\": \"stats-view-refresher\" 41 | } 42 | ]" 43 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-staging-streaming-update-tasks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family streaming-stats-updater${TASK_SUFFIX} \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "1 vCPU" \ 15 | --memory "${ECS_MEMORY_GB} GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}${TASK_SUFFIX}\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"/spark/bin/spark-submit\", 28 | \"--driver-memory\", \"${DRIVER_MEMORY}\", 29 | \"--class\", \"osmesa.apps.streaming.StreamingChangesetStatsUpdater\", 30 | \"/opt/osmesa-apps.jar\", 31 | \"--augmented-diff-source\", \"${AUGDIFF_SOURCE}\" 32 | ], 33 | \"environment\": [ 34 | { 35 | \"name\": \"DATABASE_URL\", 36 | \"value\": \"${DB_BASE_URI}/${STAGING_DB}\" 37 | } 38 | ], 39 | \"image\": \"${ECR_STATS_IMAGE}:${CONTAINER_TAG}\", 40 | \"name\": \"streaming-changeset-stats-updater${TASK_SUFFIX}\" 41 | }, 42 | { 43 | \"logConfiguration\": { 44 | \"logDriver\": \"awslogs\", 45 | \"options\": { 46 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}${TASK_SUFFIX}\", 47 | \"awslogs-region\": \"${AWS_REGION}\", 48 | \"awslogs-stream-prefix\": \"ecs\" 49 | } 50 | }, 51 | \"command\": [ 52 | \"/spark/bin/spark-submit\", 53 | \"--driver-memory\", \"${DRIVER_MEMORY}\", 54 | \"--class\", \"osmesa.apps.streaming.StreamingChangesetMetadataUpdater\", 55 | \"/opt/osmesa-apps.jar\", 56 | \"--changeset-source\", \"${CHANGESET_SOURCE}\" 57 | ], 58 | \"environment\": [ 59 | { 60 | \"name\": \"DATABASE_URL\", 61 | \"value\": \"${DB_BASE_URI}/${STAGING_DB}\" 62 | } 63 | ], 64 | \"image\": 
\"${ECR_STATS_IMAGE}:${CONTAINER_TAG}\", 65 | \"name\": \"streaming-changeset-metadata-updater${TASK_SUFFIX}\" 66 | } 67 | ]" 68 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-staging-view-refresher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family osmesa-stats-view-refresher${TASK_SUFFIX} \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "0.25 vCPU" \ 15 | --memory "0.5 GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/${AWS_LOG_GROUP}${TASK_SUFFIX}\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"refresh-views.sh\" 28 | ], 29 | \"environment\": [ 30 | { 31 | \"name\": \"DATABASE_URL\", 32 | \"value\": \"${DB_BASE_URI}/${STAGING_DB}\" 33 | }, 34 | { 35 | \"name\": \"DATABASE_NAME\", 36 | \"value\": \"${STAGING_DB}\" 37 | } 38 | ], 39 | \"image\": \"${ECR_REFRESH_IMAGE}:${CONTAINER_TAG}\", 40 | \"name\": \"stats-view-refresher${TASK_SUFFIX}\" 41 | } 42 | ]" 43 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-streaming-augdiff-producer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family "${AUGDIFF_SERVICE_NAME}${TASK_SUFFIX}" \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "1 vCPU" \ 15 | --memory "${AUGDIFF_ECS_MEMORY_GB} GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/osmesa-streaming-augdiff-producer\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"-s\", 28 | \"onramp\", 29 | \"${AUGDIFF_SOURCE}\" 30 | ], 31 | \"environment\": [ 32 | { 33 | \"name\": \"OVERPASS_URL\", 34 | \"value\": \"${OVERPASS_URL}\" 35 | }, 36 | { 37 | \"name\": \"ONRAMP_URL\", 38 | \"value\": \"${ONRAMP_URL}\" 39 | }, 40 | { 41 | \"name\": \"NODE_OPTIONS\", 42 | \"value\": \"${NODE_OPTIONS}\" 43 | } 44 | ], 45 | \"image\": \"${AUGDIFF_ECR_IMAGE}\", 46 | \"name\": \"${AUGDIFF_SERVICE_NAME}\" 47 | } 48 | ]" 49 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/define-streaming-vectortile-tasks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | aws ecs register-task-definition \ 9 | --family streaming-edit-histogram-tile-updater \ 10 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 11 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 12 | --network-mode awsvpc \ 13 | --requires-compatibilities EC2 FARGATE \ 14 | --cpu "4 vCPU" \ 15 | --memory "30 GB" \ 16 | --container-definitions "[ 17 | { 18 | \"logConfiguration\": { 19 | \"logDriver\": \"awslogs\", 20 | \"options\": { 21 | \"awslogs-group\": \"/ecs/streaming-edit-histogram-tile-updater\", 22 | \"awslogs-region\": \"${AWS_REGION}\", 23 | \"awslogs-stream-prefix\": \"ecs\" 24 | } 25 | }, 26 | \"command\": [ 27 | \"/spark/bin/spark-submit\", 28 | \"--driver-memory\", \"27G\", 29 | \"--class\", \"osmesa.apps.streaming.StreamingFacetedEditHistogramTileUpdater\", 30 | \"/opt/osmesa-apps.jar\", 31 | \"--augmented-diff-source\", \"${AUGDIFF_SOURCE}\", 32 | \"--tile-source\", \"${HISTOGRAM_VT_LOCATION}\", 33 | \"--batch-size\", \"4\" 34 | ], 35 | \"environment\": [ 36 | { 37 | \"name\": \"DATABASE_URL\", 38 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 39 | } 40 | ], 41 | \"image\": \"${ECR_STATS_IMAGE}:production\", 42 | \"name\": \"streaming-edit-histogram-tile-updater\" 43 | } 44 | ]" 45 | 46 | aws ecs register-task-definition \ 47 | --family streaming-user-footprint-tile-updater \ 48 | --task-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ECSTaskS3" \ 49 | --execution-role-arn "arn:aws:iam::${IAM_ACCOUNT}:role/ecsTaskExecutionRole" \ 50 | --network-mode awsvpc \ 51 | --requires-compatibilities EC2 FARGATE \ 52 | --cpu "2 vCPU" \ 53 | --memory "8 GB" \ 54 | --container-definitions "[ 55 | { 56 | \"logConfiguration\": { 57 | \"logDriver\": \"awslogs\", 58 | \"options\": { 59 | \"awslogs-group\": \"/ecs/streaming-user-footprint-tile-updater\", 60 | \"awslogs-region\": \"${AWS_REGION}\", 61 | \"awslogs-stream-prefix\": \"ecs\" 62 | } 63 | }, 64 | \"command\": [ 65 | \"/spark/bin/spark-submit\", 66 | \"--driver-memory\", \"7G\", 67 | \"--class\", \"osmesa.apps.streaming.StreamingUserFootprintTileUpdater\", 68 | \"/opt/osmesa-apps.jar\", 69 | \"--change-source\", \"${CHANGE_SOURCE}\", 70 | \"--tile-source\", \"${FOOTPRINT_VT_LOCATION}\", 71 | \"--batch-size\", \"4\" 72 | ], 73 | \"environment\": [ 74 | { 75 | \"name\": \"DATABASE_URL\", 76 | \"value\": \"${DB_BASE_URI}/${PRODUCTION_DB}\" 77 | } 78 | ], 79 | \"image\": \"${ECR_STATS_IMAGE}:production\", 80 | \"name\": \"streaming-user-footprint-tile-updater\" 81 | } 82 | ]" 83 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/deploy-stats-refresher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | aws events put-rule --schedule-expression "rate(1 minute)" --name osmesa-stats-view-refresher${TASK_SUFFIX} 9 | aws events put-targets \ 10 | --rule "osmesa-stats-view-refresher${TASK_SUFFIX}" \ 11 | --targets "[ 12 | { 13 | \"Id\": \"osmesa-stats-view-refresher${TASK_SUFFIX}\", 14 | \"Arn\": \"arn:aws:ecs:${AWS_REGION}:${IAM_ACCOUNT}:cluster/${ECS_CLUSTER}\", 15 | \"RoleArn\": \"arn:aws:iam::${IAM_ACCOUNT}:role/ecsEventsRole\", 16 | \"EcsParameters\": { 17 | \"TaskDefinitionArn\": \"arn:aws:ecs:${AWS_REGION}:${IAM_ACCOUNT}:task-definition/osmesa-stats-view-refresher${TASK_SUFFIX}\", 18 | \"TaskCount\": 1, 19 | \"LaunchType\": \"FARGATE\", 20 | \"NetworkConfiguration\": { 21 | \"awsvpcConfiguration\": { 22 | \"Subnets\": [\"${ECS_SUBNET}\"], 23 | \"SecurityGroups\": [\"${ECS_SECURITY_GROUP}\"], 24 | \"AssignPublicIp\": \"DISABLED\" 25 | } 26 | } 27 | } 28 | } 29 | ]" 30 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/emr-configurations/batch-process.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "spark", 4 | "Properties": { 5 | "maximizeResourceAllocation": "false" 6 | } 7 | }, 8 | { 9 | "Classification": "spark-defaults", 10 | "Properties": { 11 | "spark.dynamicAllocation.enabled": "true", 12 | "spark.shuffle.service.enabled": "true", 13 | "spark.shuffle.compress": "true", 14 | "spark.shuffle.spill.compress": "true", 15 | "spark.sql.shuffle.partitions": "2000", 16 | "spark.speculation": "true", 17 | "spark.rdd.compress": "true", 18 | "spark.executor.memory": "2G", 19 | "spark.executor.memoryOverhead": "1G", 20 | "spark.driver.cores": "2", 21 | "spark.driver.memory": "10G", 22 | "spark.driver.memoryOverhead": "1G", 23 | "spark.driver.maxResultSize": "3G", 24 | "spark.executor.extraJavaOptions" : "-XX:+UseParallelGC -Dgeotrellis.s3.threads.rdd.write=64" 25 | } 26 | }, 27 | { 28 | "Classification": "hdfs-site", 29 | "Properties": { 30 | "dfs.replication": "1", 31 | "dfs.permissions": "false", 32 | "dfs.datanode.max.xcievers": "16384", 33 | "dfs.datanode.max.transfer.threads": "16384", 34 | "dfs.datanode.balance.max.concurrent.moves": "1000", 35 | "dfs.datanode.balance.bandwidthPerSec": "100000000" 36 | } 37 | }, 38 | { 39 | "Classification": "yarn-site", 40 | "Properties": { 41 | "yarn.resourcemanager.am.max-attempts": "1", 42 | "yarn.nodemanager.vmem-check-enabled": "false", 43 | "yarn.nodemanager.pmem-check-enabled": "false" 44 | } 45 | }, 46 | { 47 | "Classification": "hadoop-env", 48 | "Configurations": [ 49 | { 50 | "Classification": "export", 51 | "Properties": { 52 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 53 | "GDAL_DATA": "/usr/local/share/gdal", 54 | "LD_LIBRARY_PATH": "/usr/local/lib", 55 | "PYSPARK_PYTHON": "python27", 56 | "PYSPARK_DRIVER_PYTHON": "python27" 57 | } 58 | } 59 | ] 60 | }, 61 | { 62 | "Classification": "spark-env", 63 | "Configurations": [ 64 | { 65 | "Classification": "export", 66 | "Properties": { 67 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 68 | "GDAL_DATA": "/usr/local/share/gdal", 69 | "LD_LIBRARY_PATH": "/usr/local/lib", 70 | "SPARK_PRINT_LAUNCH_COMMAND": "1", 71 | "PYSPARK_PYTHON": "python27", 72 | "PYSPARK_DRIVER_PYTHON": "python27" 73 | } 74 | } 75 | ] 76 | }, 77 | { 78 | "Classification": "yarn-env", 79 | "Configurations": [ 80 | { 81 | "Classification": "export", 82 | "Properties": { 83 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 84 | "GDAL_DATA": "/usr/local/share/gdal", 85 | 
"LD_LIBRARY_PATH": "/usr/local/lib", 86 | "PYSPARK_PYTHON": "python27", 87 | "PYSPARK_DRIVER_PYTHON": "python27" 88 | } 89 | } 90 | ] 91 | } 92 | ] 93 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/expand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | PROG=$(basename $0) 6 | 7 | usage() 8 | { 9 | echo "${PROG} " 10 | } 11 | 12 | expand() 13 | { 14 | local template="$(cat $1)" 15 | eval "echo \"${template}\"" 16 | } 17 | 18 | case $# in 19 | 1) expand "$1";; 20 | *) usage; exit 0;; 21 | esac 22 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/get-tag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$(git branch | grep '* master')" = "* master" ]; then 4 | while true; do 5 | >&2 echo "You are on the master branch. Do you wish to publish to the production tag?" 6 | select yn in "Yes" "No"; do 7 | case $yn in 8 | Yes ) VERSION_TAG="production"; break;; 9 | No ) VERSION_TAG="latest"; break;; 10 | esac 11 | done 12 | break 13 | done 14 | else 15 | if [ -z ${OVERRIDE_TAG+x} ]; then 16 | VERSION_TAG="latest" 17 | else 18 | VERSION_TAG=${OVERRIDE_TAG} 19 | fi 20 | fi 21 | 22 | echo -n "${VERSION_TAG}" 23 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/latest-history-to-orc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install/build osm2orc 4 | cd /mnt 5 | sudo yum install -y git 6 | git clone https://github.com/mojodna/osm2orc.git 7 | cd osm2orc 8 | ./gradlew distTar 9 | tar xf build/distributions/osm2orc-*.tar -C /tmp 10 | 11 | # Download latest planet history file 12 | aws s3 cp $PLANET_HISTORY_PBF /mnt 13 | 14 | # Convert to ORC 15 | DATE=$(stat /mnt/planet-history.osm.pbf | sed -n -e 's/-//g;s/Modify: \([0-9\-]*\).*/\1/p') 16 | /tmp/osm2orc-*/bin/osm2orc /mnt/planet-history.osm.pbf /mnt/planet-${DATE}.osh.orc 17 | 18 | # Upload ORC 19 | aws s3 cp /mnt/planet-${DATE}.osh.orc $PLANET_HISTORY_ORC_DIR 20 | -------------------------------------------------------------------------------- /deployment/streaming/scripts/stop-streaming-service.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${VERSION_TAG+x} ]; then 4 | echo "Do not run this script directly. Use the Makefile in the parent directory." 
5 | exit 1 6 | fi 7 | 8 | SERVICE=$1 9 | echo "Attempting to stop $SERVICE on cluster $ECS_CLUSTER" 10 | 11 | check_status() { 12 | STATUS=$(aws ecs describe-services --services $SERVICE --cluster $ECS_CLUSTER | jq '.services[].status') 13 | } 14 | 15 | check_status 16 | if [[ $STATUS == "\"ACTIVE\"" ]]; then 17 | aws ecs delete-service --service $SERVICE --cluster $ECS_CLUSTER --force 18 | echo "Waiting for shut down" 19 | check_status 20 | while [[ $STATUS != "\"INACTIVE\"" ]]; do 21 | echo " current status: $STATUS, still waiting" 22 | sleep 15s 23 | check_status 24 | done 25 | echo " final status: $STATUS" 26 | else 27 | echo "Status was $STATUS, nothing to stop" 28 | fi 29 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.3' 2 | services: 3 | backend: 4 | image: "osmesa-streaming-stats:${VERSION_TAG}" 5 | build: 6 | context: ./src 7 | dockerfile: Dockerfile.apps 8 | 9 | refresher: 10 | image: "osmesa-stats-refresher:${VERSION_TAG}" 11 | build: 12 | context: ./src 13 | dockerfile: Dockerfile.refresh 14 | 15 | database: 16 | image: quay.io/azavea/postgis:2.4-postgres10.6-slim 17 | environment: 18 | - POSTGRES_USER=osmesa_stats 19 | - POSTGRES_PASSWORD=osmesa_stats 20 | - POSTGRES_DB=osmesa_stats 21 | healthcheck: 22 | test: ["CMD", "pg_isready", "-U", "osmesa_stats"] 23 | interval: 3s 24 | timeout: 3s 25 | retries: 3 26 | start_period: 5s 27 | ports: 28 | - 5433:5432 29 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.2 2 | -------------------------------------------------------------------------------- /scripts/cibuild: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ -n "${OSMESA_DEBUG}" ]]; then 6 | set -x 7 | fi 8 | 9 | if [ -z ${VERSION_TAG+x} ]; then 10 | VERSION_TAG="$(git rev-parse --short HEAD)" 11 | echo "VERSION_TAG was unset; using ${VERSION_TAG}" 12 | fi 13 | 14 | DIR="$(dirname "$0")/../" 15 | 16 | if [ "${BASH_SOURCE[0]}" = "${0}" ]; then 17 | echo "Building Scala assembly JAR" 18 | pushd "${DIR}/src" 19 | ./sbt "apps/assembly" 20 | popd 21 | 22 | VERSION_TAG="${VERSION_TAG}" docker-compose \ 23 | -f docker-compose.yml \ 24 | build backend refresher 25 | fi 26 | -------------------------------------------------------------------------------- /scripts/cipublish: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ -n "${OSMESA_DEBUG}" ]]; then 6 | set -x 7 | fi 8 | 9 | if [ -z ${VERSION_TAG+x} ]; then 10 | VERSION_TAG="$(git rev-parse --short HEAD)" 11 | echo "VERSION_TAG was unset; using ${VERSION_TAG}" 12 | fi 13 | 14 | DIR="$(dirname "$0")/../" 15 | 16 | if [ "${BASH_SOURCE[0]}" = "${0}" ]; then 17 | mkdir osmesa-dist 18 | cp src/apps/target/scala-2.11/osmesa-apps.jar osmesa-dist 19 | cp -r deployment/sql osmesa-dist 20 | 21 | docker tag "osmesa-streaming-stats:${VERSION_TAG}" "${APP_IMAGE_ECR_ENDPOINT}:${VERSION_TAG}" 22 | docker tag "osmesa-stats-refresher:${VERSION_TAG}" "${REFRESHER_IMAGE_ECR_ENDPOINT}:${VERSION_TAG}" 23 | 24 | eval "$(aws ecr get-login --no-include-email)" 25 | docker push "${APP_IMAGE_ECR_ENDPOINT}:${VERSION_TAG}" 26 | docker push "${REFRESHER_IMAGE_ECR_ENDPOINT}:${VERSION_TAG}" 27 | fi 28 | 
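
A minimal sketch of running the two CI scripts above by hand, under the assumption that you have AWS credentials with ECR access in your environment; in CI these variables come from the build environment, and the ECR endpoints below are placeholders rather than real repository URIs:

```bash
# VERSION_TAG is optional -- both scripts fall back to the short git SHA.
export VERSION_TAG="$(git rev-parse --short HEAD)"
# Placeholder ECR repositories; cipublish requires both to be set.
export APP_IMAGE_ECR_ENDPOINT="<account-id>.dkr.ecr.us-east-1.amazonaws.com/osmesa-streaming-stats"
export REFRESHER_IMAGE_ECR_ENDPOINT="<account-id>.dkr.ecr.us-east-1.amazonaws.com/osmesa-stats-refresher"

./scripts/cibuild      # builds osmesa-apps.jar and the backend/refresher images
./scripts/cipublish    # assembles osmesa-dist, then tags and pushes both images to ECR
```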
-------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- /src/.sbtopts: -------------------------------------------------------------------------------- 1 | -J-Xmx2g 2 | -J-XX:+CMSClassUnloadingEnabled 3 | -J-XX:+UseConcMarkSweepGC 4 | -Djava.awt.headless=true 5 | -Dsun.io.serialization.extendedDebugInfo=true 6 | -Dhttps.protocols=TLSv1,TLSv1.1,TLSv1.2 -------------------------------------------------------------------------------- /src/.scalafmt.conf: -------------------------------------------------------------------------------- 1 | maxColumn = 100 2 | -------------------------------------------------------------------------------- /src/Dockerfile.apps: -------------------------------------------------------------------------------- 1 | FROM bde2020/spark-master:2.4.4-hadoop2.7 2 | 3 | COPY apps/target/scala-2.11/osmesa-apps.jar /opt/osmesa-apps.jar 4 | COPY docker/log4j.properties /spark/conf/ 5 | ENV PATH=$PATH:/spark/bin 6 | 7 | WORKDIR /opt 8 | ENTRYPOINT ["spark-submit"] 9 | -------------------------------------------------------------------------------- /src/Dockerfile.refresh: -------------------------------------------------------------------------------- 1 | FROM alpine:3.12 2 | 3 | RUN apk update && apk add bash postgresql-client 4 | COPY docker/refresh-views.sh /usr/local/bin/refresh-views.sh 5 | 6 | WORKDIR /opt 7 | -------------------------------------------------------------------------------- /src/analytics/.envrc: -------------------------------------------------------------------------------- 1 | test -f .env && dotenv 2 | -------------------------------------------------------------------------------- /src/analytics/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | package-lock.json 3 | .env 4 | -------------------------------------------------------------------------------- /src/analytics/bin/apply.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eo pipefail 4 | 5 | OPTIND=1 # Reset in case getopts has been used previously in the shell. 6 | 7 | replication_source="" 8 | tile_source="" 9 | 10 | while getopts "r:s:t:" opt; do 11 | case "$opt" in 12 | r) replication_source=$OPTARG 13 | ;; 14 | s) sequence=$OPTARG 15 | ;; 16 | t) tile_source=$OPTARG 17 | ;; 18 | esac 19 | done 20 | 21 | shift $((OPTIND-1)) 22 | 23 | [ "$1" = "--" ] && shift 24 | 25 | if [[ -z $sequence ]]; then 26 | sequence=$(aws s3 cp ${tile_source}sequence.txt - 2> /dev/null) 27 | else 28 | sequence=$[$sequence - 1] 29 | fi 30 | 31 | if [[ "$sequence" == "-1" || -z $replication_source || -z $tile_source ]]; then 32 | echo "Usage: $0 -r -t -s [initial sequence] -- [update-tiles options]" 33 | exit 1 34 | fi 35 | 36 | echo "Starting at sequence $(echo $[$sequence + 1])" 37 | 38 | while true; do 39 | set +e 40 | aws s3 ls ${replication_source}$((sequence + 1)).json > /dev/null 41 | retcode=$? 42 | set -e 43 | 44 | if [[ $retcode -eq 0 ]]; then 45 | sequence=$[$sequence + 1] 46 | 47 | $(dirname $0)/update-tiles -r $replication_source -t $tile_source -s urchn -l history -v $* $sequence 48 | 49 | echo $sequence | aws s3 cp - ${tile_source}sequence.txt 50 | else 51 | echo Waiting for $((sequence + 1))... 
52 | sleep 15 53 | fi 54 | done 55 | -------------------------------------------------------------------------------- /src/analytics/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "osmesa-analytics" 4 | 5 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.6.7" 6 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.7" 7 | dependencyOverrides += "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.6.7" 8 | 9 | libraryDependencies ++= Seq( 10 | postgresql, 11 | decline, 12 | sparkHive % Provided, 13 | sparkJts, 14 | gtGeotools, 15 | gtS3, 16 | gtSpark, 17 | gtVector, 18 | gtVectorTile, 19 | vectorpipe, 20 | cats, 21 | scalactic, 22 | gtSparkTestKit, 23 | logging, 24 | log4j2, 25 | scalatest 26 | ) 27 | 28 | /* Fixes Spark breakage with `sbt run` as of sbt-1.0.2 */ 29 | fork in run := true 30 | 31 | fork in Test := true 32 | 33 | test in assembly := {} 34 | 35 | javaOptions ++= Seq("-Xmx5G") 36 | 37 | initialCommands in console := 38 | """ 39 | """ 40 | 41 | assemblyJarName in assembly := "osmesa-analytics.jar" 42 | 43 | assemblyShadeRules in assembly := { 44 | // TODO: Do we still need these shade rules? 45 | val shadePackage = "com.azavea.shaded.demo" 46 | Seq( 47 | ShadeRule.rename("com.google.common.**" -> s"$shadePackage.google.common.@1") 48 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-cassandra" % Version.geotrellis).inAll, 49 | ShadeRule.rename("io.netty.**" -> s"$shadePackage.io.netty.@1") 50 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-hbase" % Version.geotrellis).inAll, 51 | ShadeRule.rename("com.fasterxml.jackson.**" -> s"$shadePackage.com.fasterxml.jackson.@1") 52 | .inLibrary("com.networknt" % "json-schema-validator" % "0.1.7").inAll, 53 | ShadeRule.rename("org.apache.avro.**" -> s"$shadePackage.org.apache.avro.@1") 54 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-spark" % Version.geotrellis).inAll 55 | ) 56 | } 57 | 58 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 59 | -------------------------------------------------------------------------------- /src/analytics/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.2.1 2 | -------------------------------------------------------------------------------- /src/analytics/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 9 | # log level for this class is used to overwrite the root logger's log level, so that 10 | # the user can have different defaults for the shell and regular Spark apps. 
11 | log4j.logger.org.apache.spark.repl.Main=WARN 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.spark_project.jetty=WARN 15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 24 | -------------------------------------------------------------------------------- /src/analytics/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.out 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | # log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c: %m%n 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 7 | log4j.logger.osmesa=DEBUG -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/Analytics.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import geotrellis.spark.store.kryo.KryoRegistrator 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.serializer.KryoSerializer 6 | import org.apache.spark.sql._ 7 | import org.locationtech.geomesa.spark.jts._ 8 | 9 | object Analytics { 10 | def sparkSession(appName: String): SparkSession = { 11 | val conf = new SparkConf() 12 | .setIfMissing("spark.master", "local[*]") 13 | .setAppName(s"OSMesa Analytics - ${appName}") 14 | .set("spark.sql.orc.impl", "native") 15 | .set("spark.sql.orc.filterPushdown", "true") 16 | .set("spark.sql.parquet.mergeSchema", "false") 17 | .set("spark.sql.parquet.filterPushdown", "true") 18 | .set("spark.sql.hive.metastorePartitionPruning", "true") 19 | .set("spark.ui.showConsoleProgress", "true") 20 | .set("spark.serializer", classOf[KryoSerializer].getName) 21 | .set("spark.kryo.registrator", classOf[KryoRegistrator].getName) 22 | 23 | SparkSession.builder 24 | .config(conf) 25 | .enableHiveSupport 26 | .getOrCreate 27 | .withJTS 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/Countries.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import org.locationtech.jts.geom.Coordinate 4 | import org.locationtech.jts.geom.prep.{PreparedGeometry, PreparedGeometryFactory} 5 | import geotrellis.vector._ 6 | import geotrellis.vector.io.json._ 7 | import _root_.io.circe._ 8 | import _root_.io.circe.generic.semiauto._ 9 | 10 | 11 | case class CountryId(name: String, code: Short) 12 | object CountryId { 13 | implicit val countryIdDecoder: Decoder[CountryId] = deriveDecoder 14 | implicit val countryIdEncoder: Encoder[CountryId] = deriveEncoder 15 | } 16 | 17 | object Countries { 18 | def all: Vector[MultiPolygonFeature[CountryId]] = { 19 | val collection = 20 | Resource("countries.geojson"). 
21 | parseGeoJson[JsonFeatureCollection] 22 | 23 | val polys = 24 | collection. 25 | getAllPolygonFeatures[CountryId]. 26 | map(_.mapGeom(MultiPolygon(_))) 27 | 28 | val mps = 29 | collection. 30 | getAllMultiPolygonFeatures[CountryId] 31 | 32 | polys ++ mps 33 | } 34 | 35 | def byName: Map[String, MultiPolygonFeature[CountryId]] = 36 | all.map { f => (f.data.name, f) }.toMap 37 | 38 | def indexed: SpatialIndex[MultiPolygonFeature[CountryId]] = 39 | SpatialIndex.fromExtents(all) { mpf => mpf.geom.getEnvelopeInternal } 40 | 41 | } 42 | 43 | class CountryLookup() extends Serializable { 44 | private val index = 45 | SpatialIndex.fromExtents( 46 | Countries.all. 47 | map { mpf => 48 | (PreparedGeometryFactory.prepare(mpf.geom), mpf.data) 49 | } 50 | ) { case (pg, _) => pg.getGeometry.getEnvelopeInternal } 51 | 52 | def lookup(coord: Coordinate): Option[CountryId] = { 53 | val t = 54 | new Traversable[(PreparedGeometry, CountryId)] { 55 | override def foreach[U](f: ((PreparedGeometry, CountryId)) => U): Unit = { 56 | val visitor = new org.locationtech.jts.index.ItemVisitor { 57 | override def visitItem(obj: AnyRef): Unit = f(obj.asInstanceOf[(PreparedGeometry, CountryId)]) 58 | } 59 | index.rtree.query(new org.locationtech.jts.geom.Envelope(coord), visitor) 60 | } 61 | } 62 | 63 | t. 64 | find(_._1.covers(Point(coord.x, coord.y))). 65 | map(_._2) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/Resource.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import java.io._ 4 | 5 | object Resource { 6 | def apply(name: String): String = { 7 | val stream: InputStream = getClass.getResourceAsStream(s"/$name") 8 | try { scala.io.Source.fromInputStream( stream ).getLines.mkString(" ") } finally { stream.close() } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/S3Utils.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import java.net.{URI, URLDecoder} 4 | import java.nio.charset.StandardCharsets 5 | 6 | import geotrellis.store.s3.S3ClientProducer 7 | import software.amazon.awssdk.services.s3.S3Client 8 | import software.amazon.awssdk.services.s3.model.GetObjectRequest 9 | 10 | object S3Utils { 11 | def readText(uri: String): String = { 12 | val s3Client: S3Client = S3ClientProducer.get() 13 | val s3uri = URI.create(uri) 14 | val key = URLDecoder.decode(s3uri.getPath.drop(1), StandardCharsets.UTF_8.toString) 15 | val request = GetObjectRequest.builder() 16 | .bucket(s3uri.getHost) 17 | .key(key) 18 | .build() 19 | s3Client.getObjectAsBytes(request).asString(StandardCharsets.UTF_8) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/VectorGrid.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import java.io.ByteArrayInputStream 4 | import java.net.URI 5 | import java.util.zip.GZIPInputStream 6 | 7 | import geotrellis.proj4.WebMercator 8 | import geotrellis.layer.ZoomedLayoutScheme 9 | import geotrellis.vector.Extent 10 | import geotrellis.vectortile.{Layer, MVTFeature, VInt64, VectorTile} 11 | import org.apache.commons.io.IOUtils 12 | import org.apache.spark.internal.Logging 13 | import 
osmesa.analytics.updater.Implicits._ 14 | import osmesa.analytics.updater._ 15 | 16 | import scala.collection.GenMap 17 | import scala.collection.parallel.TaskSupport 18 | 19 | trait VectorGrid extends Logging { 20 | // Default base zoom (highest resolution tiles produced) 21 | val DefaultBaseZoom: Int = 10 22 | 23 | // Number of cells per side in a gridded tile 24 | implicit val Cells: Int = 128 25 | 26 | // Number of cells in a gridded tile at the base of the pyramid (may be used for over-zooming) 27 | val BaseCells: Int = Cells 28 | 29 | // Default upload concurrency 30 | val DefaultUploadConcurrency: Int = 8 31 | 32 | implicit val LayoutScheme: ZoomedLayoutScheme = ZoomedLayoutScheme(WebMercator) 33 | val SequenceLayerName: String = "__sequences__" 34 | 35 | def getCommittedSequences(tile: VectorTile): Set[Int] = 36 | // NOTE when working with hashtags, this should be the changeset sequence, since changes from a 37 | // single sequence may appear in different batches depending on when changeset metadata arrives 38 | tile.layers 39 | .get(SequenceLayerName) 40 | .map(_.features.flatMap(f => f.data.values.map(valueToLong).map(_.intValue))) 41 | .map(_.toSet) 42 | .getOrElse(Set.empty) 43 | 44 | def makeSequenceLayer(sequences: Set[Int], extent: Extent, tileWidth: Int = 4096): (String, Layer) = { 45 | // create a second layer w/ a feature corresponding to committed sequences (in the absence of 46 | // available tile / layer metadata) 47 | val updatedSequences = 48 | sequences.toSeq.sorted 49 | .takeRight(1000) 50 | .zipWithIndex 51 | .map { 52 | case (seq, idx) => 53 | idx.toString -> VInt64(seq) 54 | } 55 | .toMap 56 | 57 | val sequenceFeature = MVTFeature(extent.center, updatedSequences) 58 | 59 | makeLayer(SequenceLayerName, extent, Seq(sequenceFeature), tileWidth) 60 | } 61 | 62 | def loadMVTs(urls: Map[URI, Extent])( 63 | implicit taskSupport: TaskSupport): GenMap[URI, VectorTile] = { 64 | // convert to a parallel collection to load more tiles concurrently 65 | val parUrls = urls.par 66 | parUrls.tasksupport = taskSupport 67 | 68 | parUrls.map { 69 | case (uri, extent) => 70 | (uri, 71 | read(uri).map( 72 | bytes => 73 | VectorTile.fromBytes( 74 | IOUtils.toByteArray(new GZIPInputStream(new ByteArrayInputStream(bytes))), 75 | extent))) 76 | } filter { 77 | case (_, mvt) => mvt.isDefined 78 | } map { 79 | case (uri, mvt) => uri -> mvt.get 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/raster/MutableSparseIntTile.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.raster 2 | 3 | import geotrellis.raster.{ 4 | ArrayTile, 5 | CellType, 6 | IntCellType, 7 | IntCells, 8 | IntConstantNoDataCellType, 9 | IntTileVisitor, 10 | IntUserDefinedNoDataCellType, 11 | MutableArrayTile, 12 | NoDataHandling, 13 | Tile, 14 | isData 15 | } 16 | 17 | import scala.collection.mutable 18 | 19 | class MutableSparseIntTile(val cols: Int, 20 | val rows: Int, 21 | val values: scala.collection.mutable.LongMap[Int], 22 | val cellType: IntCells with NoDataHandling) 23 | extends MutableArrayTile { 24 | private val noDataValue = cellType match { 25 | case IntConstantNoDataCellType => Int.MinValue 26 | case IntUserDefinedNoDataCellType(nd) => nd 27 | case IntCellType => 0 28 | } 29 | 30 | override def updateDouble(i: Int, z: Double): Unit = update(i, z.toInt) 31 | 32 | override def update(i: Int, z: Int): Unit = { 33 | if (isData(z)) { 34 | 
values(i) = z 35 | } else { 36 | values.remove(i) 37 | } 38 | } 39 | 40 | def interpretAs(newCellType: CellType): Tile = { 41 | newCellType match { 42 | case dt: IntCells with NoDataHandling => 43 | MutableSparseIntTile(cols, rows, values, dt) 44 | case _ => 45 | withNoData(None).convert(newCellType) 46 | } 47 | } 48 | 49 | def withNoData(noDataValue: Option[Double]): Tile = 50 | MutableSparseIntTile(cols, rows, values, cellType.withNoData(noDataValue)) 51 | 52 | override def applyDouble(i: Int): Double = apply(i).toDouble 53 | 54 | override def apply(i: Int): Int = values.getOrElse(i, noDataValue) 55 | 56 | override def copy: ArrayTile = MutableSparseIntTile(cols, rows, values.clone(), cellType) 57 | 58 | // unimplemented because it doesn't make sense in this context (and MutableSparseIntTile can't be instantiated from 59 | // Array[Byte]) 60 | override def toBytes(): Array[Byte] = ??? 61 | 62 | def toMap: Map[Long, Int] = values.toMap 63 | 64 | override def foreachIntVisitor(visitor: IntTileVisitor): Unit = { 65 | values.foreach { 66 | case (k, v) => 67 | val col = k % cols 68 | val row = k / cols 69 | 70 | visitor(col.toInt, row.toInt, v) 71 | } 72 | } 73 | } 74 | 75 | object MutableSparseIntTile { 76 | def apply(cols: Int, 77 | rows: Int, 78 | values: mutable.LongMap[Int] = mutable.LongMap.empty[Int], 79 | cellType: IntCells with NoDataHandling = IntConstantNoDataCellType) = 80 | new MutableSparseIntTile(cols, rows, values, cellType) 81 | } 82 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/raster/SparseIntTile.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.raster 2 | 3 | import geotrellis.raster.{ 4 | ArrayTile, 5 | CellType, 6 | IntCellType, 7 | IntCells, 8 | IntConstantNoDataCellType, 9 | IntTileVisitor, 10 | IntUserDefinedNoDataCellType, 11 | MutableArrayTile, 12 | NoDataHandling, 13 | Tile 14 | } 15 | 16 | class SparseIntTile(val cols: Int, 17 | val rows: Int, 18 | val values: Map[Long, Int], 19 | val cellType: IntCells with NoDataHandling) 20 | extends ArrayTile { 21 | private val noDataValue = cellType match { 22 | case IntConstantNoDataCellType => Int.MinValue 23 | case IntUserDefinedNoDataCellType(nd) => nd 24 | case IntCellType => 0 25 | } 26 | 27 | def interpretAs(newCellType: CellType): Tile = { 28 | newCellType match { 29 | case dt: IntCells with NoDataHandling => 30 | SparseIntTile(cols, rows, values, dt) 31 | case _ => 32 | withNoData(None).convert(newCellType) 33 | } 34 | } 35 | 36 | def withNoData(noDataValue: Option[Double]): Tile = 37 | SparseIntTile(cols, rows, values, cellType.withNoData(noDataValue)) 38 | 39 | override def applyDouble(i: Int): Double = apply(i).toDouble 40 | 41 | override def apply(i: Int): Int = values.getOrElse(i, noDataValue) 42 | 43 | override def copy: ArrayTile = SparseIntTile(cols, rows, Map(values.toSeq: _*), cellType) 44 | 45 | // unimplemented because it doesn't make sense in this context (and SparseIntTile can't be instantiated from 46 | // Array[Byte]) 47 | override def toBytes(): Array[Byte] = ??? 
48 | 49 | def toMap: Map[Long, Int] = values 50 | 51 | override def mutable: MutableArrayTile = 52 | MutableSparseIntTile(cols, rows, scala.collection.mutable.LongMap(values.toSeq: _*), cellType) 53 | 54 | override def foreachIntVisitor(visitor: IntTileVisitor): Unit = { 55 | // NOTE only visits coordinates containing data; this isn't strictly correct for some uses 56 | values.foreach { 57 | case (k, v) => 58 | val col = k % cols 59 | val row = k / cols 60 | 61 | visitor(col.toInt, row.toInt, v) 62 | } 63 | } 64 | } 65 | 66 | object SparseIntTile { 67 | def apply(cols: Int, 68 | rows: Int, 69 | values: Map[Long, Int] = Map.empty[Long, Int], 70 | cellType: IntCells with NoDataHandling = IntConstantNoDataCellType) = 71 | new SparseIntTile(cols, rows, values, cellType) 72 | } 73 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/raster/package.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | import geotrellis.raster.{Raster, Tile, isData} 3 | 4 | package object raster { 5 | implicit class RasterMethods(val raster: Raster[Tile]) { 6 | def toMap: Map[Long, Int] = { 7 | raster.tile match { 8 | case tile: SparseIntTile => tile.toMap 9 | case tile: MutableSparseIntTile => tile.toMap 10 | case tile => 11 | tile 12 | .toArray() 13 | .zipWithIndex 14 | .filter(x => isData(x._1)) 15 | .map(x => (x._2.toLong, x._1)) 16 | .toMap 17 | } 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/stats/functions/package.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.stats 2 | 3 | import org.apache.spark.sql.expressions.UserDefinedFunction 4 | import org.apache.spark.sql.functions._ 5 | import vectorpipe.util._ 6 | 7 | package object functions { 8 | // A brief note about style 9 | // Spark functions are typically defined using snake_case, therefore so are the UDFs 10 | // internal helper functions use standard Scala naming conventions 11 | 12 | lazy val merge_measurements: UserDefinedFunction = udf(_mergeDoubleCounts) 13 | 14 | lazy val sum_measurements: UserDefinedFunction = udf { counts: Iterable[Map[String, Double]] => 15 | Option(counts.reduce(_mergeDoubleCounts)).filter(_.nonEmpty).orNull 16 | } 17 | 18 | lazy val sum_count_values: UserDefinedFunction = udf { counts: Map[String, Int] => 19 | counts.values.sum 20 | } 21 | 22 | lazy val simplify_measurements: UserDefinedFunction = udf { counts: Map[String, Double] => 23 | counts.filter(_._2 != 0) 24 | } 25 | 26 | lazy val simplify_counts: UserDefinedFunction = udf { counts: Map[String, Int] => 27 | counts.filter(_._2 != 0) 28 | } 29 | 30 | private val _mergeIntCounts = (a: Map[String, Int], b: Map[String, Int]) => 31 | mergeMaps(Option(a).getOrElse(Map.empty), 32 | Option(b).getOrElse(Map.empty))(_ + _) 33 | 34 | private val _mergeDoubleCounts = (a: Map[String, Double], b: Map[String, Double]) => 35 | mergeMaps(Option(a).getOrElse(Map.empty), 36 | Option(b).getOrElse(Map.empty))(_ + _) 37 | } 38 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/Implicits.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater 2 | 3 | import geotrellis.vectortile.{VInt64, VString, Value} 4 | 5 | object 
Implicits { 6 | implicit def valueToLong(x: Value): Long = (x: @unchecked) match { 7 | case y: VInt64 => y.value 8 | case y: VString => y.value.toLong 9 | } 10 | 11 | implicit def valueToString(x: Value): String = (x: @unchecked) match { 12 | case y: VInt64 => y.value.toString 13 | case y: VString => y.value 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/Schema.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater 2 | 3 | import java.sql.Timestamp 4 | import java.time.Instant 5 | 6 | import geotrellis.vector.Geometry 7 | import geotrellis.vectortile.{Layer, MVTFeature} 8 | import org.apache.log4j.Logger 9 | import osmesa.analytics.updater.Implicits._ 10 | 11 | trait Schema { 12 | val layer: Layer 13 | val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)] 14 | 15 | val newFeatures: Seq[MVTFeature[Geometry]] 16 | lazy val replacementFeatures: Seq[MVTFeature[Geometry]] = Seq.empty[MVTFeature[Geometry]] 17 | lazy val retainedFeatures: Seq[MVTFeature[Geometry]] = Seq.empty[MVTFeature[Geometry]] 18 | 19 | protected lazy val logger: Logger = Logger.getLogger(getClass) 20 | 21 | protected lazy val touchedFeatures: Map[String, Seq[MVTFeature[Geometry]]] = 22 | Map.empty[String, Seq[MVTFeature[Geometry]]] 23 | 24 | protected lazy val versionInfo: Map[String, (Int, Int, Timestamp)] = 25 | touchedFeatures 26 | .mapValues(_.last) 27 | .mapValues( 28 | f => 29 | ( 30 | f.data("__version").toInt, 31 | f.data("__minorVersion").toInt, 32 | Timestamp.from(Instant.ofEpochMilli(f.data("__updated"))) 33 | )) 34 | 35 | protected lazy val minorVersions: Map[String, Int] = 36 | features 37 | .mapValues { 38 | case (_, curr) => curr.data 39 | } 40 | .map { 41 | case (id, f) => 42 | versionInfo.get(id) match { 43 | case Some((prevVersion, _, _)) if prevVersion < f.version => (id, 0) 44 | case Some((prevVersion, prevMinorVersion, _)) if prevVersion == f.version => 45 | (id, prevMinorVersion + 1) 46 | case _ => (id, 0) 47 | } 48 | } 49 | } 50 | 51 | trait SchemaBuilder { 52 | val layerName: String 53 | 54 | def apply(layer: Layer, 55 | features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]): Schema 56 | } 57 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/TileUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater 2 | 3 | import java.io.File 4 | import java.net.URI 5 | import java.nio.file.Path 6 | 7 | import cats.implicits._ 8 | import com.monovore.decline._ 9 | import org.apache.log4j.Logger 10 | import osmesa.analytics.updater.schemas._ 11 | 12 | class TileUpdater 13 | 14 | object TileUpdater extends CommandApp( 15 | name = "update-tiles", 16 | header = "Update vector tiles with changes from an augmented diff", 17 | main = { 18 | val rootURI = new File("").toURI 19 | 20 | val replicationSourceOpt = 21 | Opts.option[URI]( 22 | "replication-source", 23 | short = "r", 24 | metavar = "uri", 25 | help = "URI prefix for replication files" 26 | ).withDefault(rootURI) 27 | val tileSourceOpt = Opts 28 | .option[URI]( 29 | "tile-source", 30 | short = "t", 31 | metavar = "uri", 32 | help = "URI prefix for vector tiles to update" 33 | ).withDefault(rootURI) 34 | val minZoomOpt = 35 | Opts.option[Int]( 36 | "min-zoom", 37 | short = "z", 38 | 
metavar = "zoom", 39 | help = "Minimum zoom to consider" 40 | ) 41 | val maxZoomOpt = 42 | Opts.option[Int]( 43 | "max-zoom", 44 | short = "Z", 45 | metavar = "zoom", 46 | help = "Maximum zoom to consider" 47 | ) 48 | val schemaOpt = 49 | Opts.option[String]( 50 | "schema", 51 | short = "s", 52 | metavar = "schema", 53 | help = "Schema" 54 | ).withDefault("snapshot") 55 | .validate("Must be a registered schema") { Schemas.keySet.contains(_) } 56 | .map { Schemas(_) } 57 | val listingOpt = 58 | Opts.option[Path]( 59 | "tiles", 60 | short = "T", 61 | metavar = "tile list", 62 | help = "List of tiles available for updating" 63 | ).orNone 64 | val dryRunOpt = 65 | Opts.flag( 66 | "dry-run", 67 | short = "n", 68 | help = "Dry run" 69 | ).orFalse 70 | val verboseOpt = 71 | Opts.flag( 72 | "verbose", 73 | short = "v", 74 | help = "Be verbose" 75 | ).orFalse 76 | val sequenceOpt = Opts.argument[Int]("sequence") 77 | 78 | val logger = Logger.getLogger(classOf[TileUpdater]) 79 | 80 | (replicationSourceOpt, 81 | tileSourceOpt, 82 | minZoomOpt, 83 | maxZoomOpt, 84 | schemaOpt, 85 | listingOpt, 86 | dryRunOpt, 87 | verboseOpt, 88 | sequenceOpt).mapN { 89 | (replicationSource, 90 | tileSource, 91 | minZoom, 92 | maxZoom, 93 | schema, 94 | listing, 95 | dryRun, 96 | verbose, 97 | sequence) => 98 | val replicationUri = replicationSource.resolve(s"$sequence.json") 99 | 100 | if (verbose) { 101 | println(s"Applying $replicationUri to $tileSource from zoom $minZoom to $maxZoom...") 102 | } 103 | 104 | readFeatures(replicationUri) match { 105 | case Some(features) => 106 | for (zoom <- minZoom to maxZoom) { 107 | updateTiles( 108 | tileSource = tileSource, 109 | zoom = zoom, 110 | schemaType = schema, 111 | features = features, 112 | listing = listing, 113 | process = (sk, tile) => { 114 | val filename = s"$zoom/${sk.col}/${sk.row}.mvt" 115 | val uri = tileSource.resolve(filename) 116 | 117 | if (dryRun) { 118 | println( 119 | s"Would write ${tile.toBytes.length.formatted("%,d")} bytes to $uri") 120 | } else { 121 | logger.info( 122 | s"Writing ${tile.toBytes.length.formatted("%,d")} bytes to $uri") 123 | // TODO gzip compress 124 | write(uri, tile.toBytes) 125 | } 126 | 127 | if (verbose) { 128 | println(filename) 129 | } 130 | } 131 | ) 132 | } 133 | 134 | case None => 135 | println(s"No features available for $sequence") 136 | System.exit(1) 137 | } 138 | } 139 | } 140 | ) 141 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/schemas/History.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater.schemas 2 | 3 | import java.sql.Timestamp 4 | import java.time.Instant 5 | 6 | import geotrellis.vector.Geometry 7 | import geotrellis.vectortile._ 8 | import osmesa.analytics.updater.Implicits._ 9 | import osmesa.analytics.updater._ 10 | 11 | class History( 12 | override val layer: Layer, 13 | override val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) 14 | extends Schema { 15 | override protected lazy val touchedFeatures: Map[String, Seq[MVTFeature[Geometry]]] = { 16 | val featureIds = features.keySet 17 | 18 | layer.features 19 | .filter(f => featureIds.contains(f.data("__id"))) 20 | .groupBy(f => f.data("__id"): String) 21 | .mapValues( 22 | fs => 23 | fs.sortWith(_.data("__minorVersion") < _.data("__minorVersion")) 24 | .sortWith(_.data("__version") < _.data("__version"))) 25 | } 26 | 27 | lazy val newFeatures: 
Seq[MVTFeature[Geometry]] = 28 | features 29 | .filter { 30 | case (id, (_, curr)) => 31 | versionInfo.get(id) match { 32 | case Some((_, _, prevTimestamp)) if curr.data.timestamp.after(prevTimestamp) => true 33 | case None => true 34 | case _ => false 35 | } 36 | } 37 | .filter { 38 | // filter out null geometries 39 | case (_, (_, curr)) => Option(curr.geom).isDefined && curr.isValid 40 | } 41 | .map { 42 | case (id, (_, curr)) => (id, makeFeature(curr, minorVersions.get(id))) 43 | } 44 | .values 45 | .filter(_.isDefined) 46 | .map(_.get) 47 | .toSeq 48 | 49 | override lazy val replacementFeatures: Seq[MVTFeature[Geometry]] = { 50 | val activeFeatures = touchedFeatures 51 | .filter { 52 | case (id, fs) => 53 | features(id)._2.data.timestamp 54 | .after(Timestamp.from(Instant.ofEpochMilli(fs.last.data("__updated")))) 55 | } 56 | 57 | val featuresToReplace = activeFeatures 58 | .mapValues(fs => fs.filter(_.data("__validUntil").toLong == 0)) 59 | .values 60 | .flatten 61 | .toSeq 62 | 63 | val replacedFeatures = featuresToReplace 64 | .map(f => updateFeature(f, features(f.data("__id"))._2.data.timestamp)) 65 | 66 | logger.info(s"Rewriting ${replacedFeatures.length.formatted("%,d")} features") 67 | 68 | replacedFeatures 69 | } 70 | 71 | override lazy val retainedFeatures: Seq[MVTFeature[Geometry]] = { 72 | val activeFeatures = touchedFeatures 73 | .filter { 74 | case (id, fs) => 75 | features(id)._2.data.timestamp 76 | .after(Timestamp.from(Instant.ofEpochMilli(fs.last.data("__updated")))) 77 | } 78 | 79 | activeFeatures 80 | .mapValues(fs => fs.filterNot(_.data("__validUntil").toLong == 0)) 81 | .values 82 | .flatten 83 | .toSeq 84 | } 85 | 86 | private def makeFeature(feature: AugmentedDiffFeature, 87 | minorVersion: Option[Int], 88 | validUntil: Option[Long] = None): Option[MVTFeature[Geometry]] = { 89 | val id = feature.data.id 90 | 91 | val elementId = feature.data.`type` match { 92 | case "node" => s"n$id" 93 | case "way" => s"w$id" 94 | case "relation" => s"r$id" 95 | case _ => id.toString 96 | } 97 | 98 | feature match { 99 | case _ if feature.geom.isValid => 100 | Some( 101 | MVTFeature( 102 | feature.geom, // when features are deleted, this will be the last geometry that was visible 103 | feature.data.tags.map { 104 | case (k, v) => (k, VString(v)) 105 | } ++ Map( 106 | "__id" -> VString(elementId), 107 | "__changeset" -> VInt64(feature.data.changeset), 108 | "__updated" -> VInt64(feature.data.timestamp.getTime), 109 | "__validUntil" -> VInt64(validUntil.getOrElse(0L)), 110 | "__version" -> VInt64(feature.data.version), 111 | "__uid" -> VInt64(feature.data.uid), 112 | "__user" -> VString(feature.data.user), 113 | "__visible" -> VBool(feature.data.visible.getOrElse(true)) 114 | ) ++ minorVersion 115 | .map(v => Map("__minorVersion" -> VInt64(v))) 116 | .getOrElse(Map.empty[String, Value]) 117 | ) 118 | ) 119 | case _ => None 120 | } 121 | } 122 | 123 | private def updateFeature(feature: MVTFeature[Geometry], validUntil: Timestamp): MVTFeature[Geometry] = { 124 | MVTFeature( 125 | feature.geom, 126 | feature.data.updated("__validUntil", VInt64(validUntil.getTime)) 127 | ) 128 | } 129 | } 130 | 131 | object History extends SchemaBuilder { 132 | override val layerName: String = "all" 133 | 134 | def apply(layer: Layer, 135 | features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) = 136 | new History(layer, features) 137 | } 138 | -------------------------------------------------------------------------------- 
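The History schema above leans on the version bookkeeping defined in Schema.scala: a feature's minor version resets to zero when a newer OSM version arrives, and increments when the same version is touched again within the tile. A minimal sketch of that rule, with illustrative names only (MinorVersionSketch and nextMinorVersion are not part of this repository):

// Illustrative only: mirrors the match in Schema.minorVersions; not project code.
object MinorVersionSketch {
  /** `prev` is the (version, minorVersion) pair already stored in the tile, if any. */
  def nextMinorVersion(prev: Option[(Int, Int)], incomingVersion: Int): Int =
    prev match {
      case Some((prevVersion, _)) if prevVersion < incomingVersion => 0            // new major version starts over
      case Some((prevVersion, prevMinor)) if prevVersion == incomingVersion => prevMinor + 1 // same version edited again
      case _ => 0                                                                  // feature not previously in the tile
    }

  def main(args: Array[String]): Unit = {
    println(nextMinorVersion(None, 1))         // unseen feature          -> 0
    println(nextMinorVersion(Some((2, 0)), 2)) // same version re-edited  -> 1
    println(nextMinorVersion(Some((2, 3)), 3)) // newer version arrived   -> 0
  }
}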
/src/analytics/src/main/scala/osmesa/analytics/updater/schemas/Snapshot.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater.schemas 2 | 3 | import geotrellis.vector.Geometry 4 | import geotrellis.vectortile.{Layer, MVTFeature, VInt64, VString} 5 | import osmesa.analytics.updater._ 6 | 7 | class Snapshot( 8 | override val layer: Layer, 9 | override val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) 10 | extends Schema { 11 | lazy val newFeatures: Seq[MVTFeature[Geometry]] = 12 | features.values 13 | .map(_._2) 14 | .filter(_.data.visible.getOrElse(true)) 15 | .map(makeFeature) 16 | .filter(_.isDefined) 17 | .map(_.get) 18 | .toSeq 19 | 20 | private def makeFeature(feature: AugmentedDiffFeature): Option[MVTFeature[Geometry]] = { 21 | val id = feature.data.id 22 | 23 | val elementId = feature.data.`type` match { 24 | case "node" => s"n$id" 25 | case "way" => s"w$id" 26 | case "relation" => s"r$id" 27 | case _ => id.toString 28 | } 29 | 30 | feature match { 31 | case _ if feature.geom.isValid => 32 | Some( 33 | MVTFeature( 34 | feature.geom, 35 | feature.data.tags.map { 36 | case (k, v) => (k, VString(v)) 37 | } ++ Map( 38 | "__id" -> VString(elementId), 39 | "__changeset" -> VInt64(feature.data.changeset), 40 | "__updated" -> VInt64(feature.data.timestamp.getTime), 41 | "__version" -> VInt64(feature.data.version), 42 | "__uid" -> VInt64(feature.data.uid), 43 | "__user" -> VString(feature.data.user) 44 | ) 45 | ) 46 | ) 47 | case _ => None 48 | } 49 | } 50 | } 51 | 52 | object Snapshot extends SchemaBuilder { 53 | override val layerName: String = "data" 54 | 55 | def apply(layer: Layer, 56 | features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) = 57 | new Snapshot(layer, features) 58 | } 59 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/schemas/Urchn.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater.schemas 2 | 3 | import geotrellis.vector.Geometry 4 | import geotrellis.vectortile._ 5 | import osmesa.analytics.updater.Implicits._ 6 | import osmesa.analytics.updater._ 7 | 8 | class Urchn( 9 | override val layer: Layer, 10 | override val features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) 11 | extends Schema { 12 | override protected lazy val touchedFeatures: Map[String, Seq[MVTFeature[Geometry]]] = { 13 | val featureIds = features.keySet 14 | 15 | layer.features 16 | .filter(f => featureIds.contains(f.data("__id"))) 17 | .groupBy(f => f.data("__id"): String) 18 | .mapValues( 19 | fs => 20 | fs.sortWith(_.data("__minorVersion") < _.data("__minorVersion")) 21 | .sortWith(_.data("__version") < _.data("__version"))) 22 | } 23 | 24 | private lazy val authors: Map[String, Set[String]] = 25 | touchedFeatures 26 | .mapValues(_.last) 27 | .mapValues(_.data("__authors").split(",").toSet) 28 | 29 | private lazy val creation: Map[String, Long] = 30 | touchedFeatures 31 | .mapValues(_.head) 32 | .mapValues(_.data("__creation")) 33 | 34 | lazy val newFeatures: Seq[MVTFeature[Geometry]] = 35 | features 36 | .filter { 37 | case (id, (_, curr)) => 38 | versionInfo.get(id) match { 39 | case Some((_, _, prevTimestamp)) if curr.data.timestamp.after(prevTimestamp) => true 40 | case None => true 41 | case _ => false 42 | } 43 | } 44 | .values 45 | .filter { 46 | // filter out null geometries 47 | case (_, 
curr) => Option(curr.geom).isDefined && curr.isValid 48 | } 49 | .map { 50 | case (_, curr) => 51 | // NOTE: if this feature appears in the current tile for the first time, creation, authors, and minorVersions 52 | // will be incomplete (and therefore wrong) 53 | makeFeature( 54 | curr, 55 | creation 56 | .getOrElse(curr.data.elementId, curr.data.timestamp.getTime), 57 | authors 58 | .get(curr.data.elementId) 59 | .map(_ + curr.data.user) 60 | .getOrElse(Set(curr.data.user)), 61 | minorVersions.get(curr.data.elementId) 62 | ) 63 | } 64 | .filter(_.isDefined) 65 | .map(_.get) 66 | .toSeq 67 | 68 | private def makeFeature(feature: AugmentedDiffFeature, 69 | creation: Long, 70 | authors: Set[String], 71 | minorVersion: Option[Int]): Option[MVTFeature[Geometry]] = { 72 | val id = feature.data.id 73 | 74 | val elementId = feature.data.`type` match { 75 | case "node" => s"n$id" 76 | case "way" => s"w$id" 77 | case "relation" => s"r$id" 78 | case _ => id.toString 79 | } 80 | 81 | feature match { 82 | case _ if Option(feature.geom).isDefined && feature.geom.isValid => 83 | Some( 84 | MVTFeature( 85 | feature.geom, // when features are deleted, this will be the last geometry that was visible 86 | feature.data.tags.map { 87 | case (k, v) => (k, VString(v)) 88 | } ++ Map( 89 | "__id" -> VString(elementId), 90 | "__changeset" -> VInt64(feature.data.changeset), 91 | "__updated" -> VInt64(feature.data.timestamp.getTime), 92 | "__version" -> VInt64(feature.data.version), 93 | "__vtileGen" -> VInt64(System.currentTimeMillis), 94 | "__creation" -> VInt64(creation), 95 | "__authors" -> VString(authors.mkString(",")), 96 | "__lastAuthor" -> VString(feature.data.user) 97 | ) ++ minorVersion 98 | .map(v => Map("__minorVersion" -> VInt64(v))) 99 | .getOrElse(Map.empty[String, Value]) 100 | ) 101 | ) 102 | case _ => None 103 | } 104 | } 105 | } 106 | 107 | object Urchn extends SchemaBuilder { 108 | override val layerName: String = "history" 109 | 110 | def apply(layer: Layer, 111 | features: Map[String, (Option[AugmentedDiffFeature], AugmentedDiffFeature)]) = 112 | new Urchn(layer, features) 113 | } 114 | -------------------------------------------------------------------------------- /src/analytics/src/main/scala/osmesa/analytics/updater/schemas/package.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics.updater 2 | 3 | package object schemas { 4 | val Schemas: Map[String, SchemaBuilder] = Map( 5 | "history" -> History, 6 | "snapshot" -> Snapshot, 7 | "urchn" -> Urchn 8 | ) 9 | } 10 | -------------------------------------------------------------------------------- /src/analytics/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.out 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss} %c{1}: %m%n 7 | 8 | log4j.logger.osmesa.analytics=INFO 9 | 10 | # Settings to quiet third party logs that are too verbose 11 | log4j.logger.org.eclipse.jetty=WARN 12 | log4j.logger.org.apache.spark=WARN 13 | log4j.logger.org.apache.hadoop=WARN 14 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN 15 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN 16 | 17 | log4j.logger.org.spark-project.jetty=WARN 18 | 
org.spark-project.jetty.LEVEL=WARN -------------------------------------------------------------------------------- /src/analytics/src/test/scala/osmesa/analytics/CountriesTest.scala: -------------------------------------------------------------------------------- 1 | package osmesa.analytics 2 | 3 | import org.locationtech.jts.geom.Coordinate 4 | import geotrellis.vector._ 5 | import geotrellis.vector.io._ 6 | import org.scalatest._ 7 | import spray.json._ 8 | 9 | import geotrellis.spark.util._ 10 | 11 | class CountriesTest extends FunSuite with Matchers { 12 | def time[T](msg: String)(f: => T) = { 13 | val start = System.currentTimeMillis 14 | val v = f 15 | val end = System.currentTimeMillis 16 | println(s"[TIMING] ${msg}: ${java.text.NumberFormat.getIntegerInstance.format(end - start)} ms") 17 | v 18 | } 19 | 20 | def write(path: String, txt: String): Unit = { 21 | import java.nio.file.{Paths, Files} 22 | import java.nio.charset.StandardCharsets 23 | 24 | Files.write(Paths.get(path), txt.getBytes(StandardCharsets.UTF_8)) 25 | } 26 | 27 | test("Generate some random points and see if they make sense") { 28 | val countries = Countries.all 29 | val rand = new scala.util.Random 30 | val points = 31 | countries.flatMap { mpf => 32 | val env = mpf.geom.envelope 33 | 34 | for(i <- 0 until 10) yield { 35 | val x = env.xmin + (rand.nextDouble * env.width) 36 | val y = env.ymin + (rand.nextDouble * env.height) 37 | new Coordinate(x, y) 38 | } 39 | } 40 | 41 | val l = { 42 | // Ensure that we can serialize the Lookup. 43 | val x = 44 | time("Creating CountryLookup") { new CountryLookup() } 45 | val s = KryoSerializer.serialize(x) 46 | KryoSerializer.deserialize[CountryLookup](s) 47 | } 48 | 49 | val pcs = 50 | Countries.all.map { mpf => 51 | (mpf.geom.prepare, mpf.data) 52 | } 53 | 54 | // Brute force lookup, without spatial index 55 | def bfLookup(coord: Coordinate): Option[CountryId] = 56 | pcs.find { case (pg, _) => pg.contains(Point(coord.x, coord.y)) }. 57 | map { case (_, data) => data } 58 | 59 | val actual = 60 | time("LOOKUP") { 61 | points. 62 | map { p => l.lookup(p).map { cid => PointFeature(Point(p.x, p.y), cid) } } 63 | } 64 | 65 | val expected = 66 | time("BRUTE FORCE LOOKUP") { 67 | points. 68 | map { p => 69 | bfLookup(p).map { cid => PointFeature(Point(p.x, p.y), cid) } 70 | } 71 | } 72 | 73 | val nodeIndex = 74 | time("Creating nodeIndex") { 75 | SpatialIndex(points) { p => (p.x, p.y) } 76 | } 77 | 78 | val nodeIndexed = 79 | time("NODE INDEX LOOKUP") { 80 | // Another way to do the spatial index, indexing the nodes instead of the countries. 81 | // This turns out to be slower than the lookup for large point sets. 82 | val result: Vector[Option[PointFeature[CountryId]]] = 83 | Countries.all. 84 | flatMap { mpf => 85 | val pg = mpf.geom.prepare 86 | nodeIndex.traversePointsInExtent(mpf.geom.envelope). 
87 | map { p => 88 | if(pg.covers(p)) { Some(PointFeature(Point(p.x, p.y), mpf.data)) } 89 | else { None } 90 | } 91 | } 92 | result 93 | } 94 | 95 | actual.flatten.length should be (expected.flatten.length) 96 | actual.flatten.length should be (nodeIndexed.flatten.length) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/apps/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "osmesa-apps" 4 | 5 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.6.7" 6 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.7" 7 | dependencyOverrides += "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.6.7" 8 | 9 | libraryDependencies ++= Seq( 10 | postgresql, 11 | decline, 12 | sparkHive % Provided, 13 | sparkJts, 14 | gtGeotools, 15 | gtS3, 16 | gtSpark, 17 | gtVector, 18 | gtVectorTile, 19 | vectorpipe, 20 | cats, 21 | scalactic, 22 | gtSparkTestKit, 23 | logging, 24 | scalatest, 25 | apacheCommonsEmail, 26 | ) 27 | 28 | /* Fixes Spark breakage with `sbt run` as of sbt-1.0.2 */ 29 | fork in run := true 30 | 31 | fork in Test := true 32 | 33 | test in assembly := {} 34 | 35 | javaOptions ++= Seq("-Xmx5G") 36 | 37 | initialCommands in console := 38 | """ 39 | """ 40 | 41 | assemblyJarName in assembly := "osmesa-apps.jar" 42 | 43 | assemblyShadeRules in assembly := { 44 | val shadePackage = "com.azavea.shaded.demo" 45 | Seq( 46 | ShadeRule.rename("com.google.common.**" -> s"$shadePackage.google.common.@1") 47 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-cassandra" % Version.geotrellis).inAll, 48 | ShadeRule.rename("io.netty.**" -> s"$shadePackage.io.netty.@1") 49 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-hbase" % Version.geotrellis).inAll, 50 | ShadeRule.rename("com.fasterxml.jackson.**" -> s"$shadePackage.com.fasterxml.jackson.@1") 51 | .inLibrary("com.networknt" % "json-schema-validator" % "0.1.7").inAll, 52 | ShadeRule.rename("org.apache.avro.**" -> s"$shadePackage.org.apache.avro.@1") 53 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-spark" % Version.geotrellis).inAll 54 | ) 55 | } 56 | 57 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 58 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/DbUtils.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps 2 | 3 | import java.net.URI 4 | import java.sql.Connection 5 | 6 | import vectorpipe.util.DBUtils 7 | 8 | object DbUtils { 9 | /** 10 | * Upsert a diff sequence number to a database, tied to a unique procName String 11 | * 12 | * Must be a PostgreSQL database 13 | * PostgreSQL database must contain table schema: 14 | * `checkpoints`: 15 | * - proc_name: String 16 | * - sequence: Int 17 | * 18 | * @param procName 19 | * @param sequence 20 | * @param databaseURI 21 | * @return 22 | */ 23 | def saveLocations(procName: String, sequence: Int, databaseURI: URI) = { 24 | var connection: Connection = null 25 | try { 26 | connection = DBUtils.getJdbcConnection(databaseURI) 27 | val upsertSequence = 28 | connection.prepareStatement( 29 | """ 30 | |INSERT INTO checkpoints (proc_name, sequence) 31 | |VALUES (?, ?) 32 | |ON CONFLICT (proc_name) 33 | |DO UPDATE SET sequence = ? 
34 | """.stripMargin 35 | ) 36 | upsertSequence.setString(1, procName) 37 | upsertSequence.setInt(2, sequence) 38 | upsertSequence.setInt(3, sequence) 39 | upsertSequence.execute() 40 | } finally { 41 | if (connection != null) connection.close() 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/batch/EditHistogramTileCreator.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.batch 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.functions._ 9 | import org.locationtech.geomesa.spark.jts._ 10 | import osmesa.analytics.{Analytics, EditHistogram} 11 | import vectorpipe.functions.asDouble 12 | 13 | object EditHistogramTileCreator 14 | extends CommandApp( 15 | name = "edit-histogram", 16 | header = "Create vector tiles containing histograms of editing activity", 17 | main = { 18 | 19 | val historyOpt = Opts 20 | .option[URI]("history", help = "URI of the history ORC file to process.") 21 | 22 | val outputOpt = Opts.option[URI]("out", help = "Base URI for output.") 23 | 24 | val concurrentUploadsOpt = Opts 25 | .option[Int]("concurrent-uploads", 26 | short = "c", 27 | metavar = "concurrent uploads", 28 | help = "Set the number of concurrent uploads.") 29 | .orNone 30 | 31 | val baseZoomOpt = Opts 32 | .option[Int]("base-zoom", 33 | short = "z", 34 | metavar = "Base zoom", 35 | help = "Most detailed zoom level") 36 | .orNone 37 | 38 | ( 39 | historyOpt, 40 | outputOpt, 41 | concurrentUploadsOpt, 42 | baseZoomOpt 43 | ).mapN { 44 | (historyURI, outputURI, _concurrentUploads, baseZoom) => 45 | implicit val spark: SparkSession = 46 | Analytics.sparkSession("State of the Data tile generation") 47 | import spark.implicits._ 48 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 49 | spark.withJTS 50 | 51 | val history = spark.read.orc(historyURI.toString) 52 | 53 | val nodes = history 54 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 55 | .withColumn("lat", asDouble('lat)) 56 | .withColumn("lon", asDouble('lon)) 57 | .where('uid > 1) 58 | .select(st_makePoint('lon, 'lat) as 'geom, 59 | year('timestamp) * 1000 + dayofyear('timestamp) as 'key) 60 | 61 | val stats = EditHistogram.create(nodes, 62 | outputURI, 63 | baseZoom.getOrElse(EditHistogram.DefaultBaseZoom)) 64 | println(s"${stats.count} tiles created.") 65 | 66 | spark.stop() 67 | } 68 | } 69 | ) 70 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/batch/MergeChangesets.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.batch 2 | 3 | import java.net.URI 4 | import java.sql._ 5 | import java.time.Instant 6 | 7 | import cats.data.{Validated, ValidatedNel} 8 | import cats.implicits._ 9 | import com.monovore.decline._ 10 | import io.circe._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.functions._ 13 | import org.joda.time.DateTime 14 | import org.joda.time.format.DateTimeFormat 15 | import osmesa.analytics.Analytics 16 | import vectorpipe.sources.{ChangesetSource, Source} 17 | import vectorpipe.util.DBUtils 18 | 19 | /* 20 | * Usage example: 21 | * 22 | * sbt "project apps" assembly 23 | * 24 | * spark-submit \ 25 | * --class osmesa.apps.batch.MergeChangesets \ 26 | * 
ingest/target/scala-2.11/osmesa-apps.jar \ 27 | * --changesets http://location/of/changeset/replications \ 28 | * --end-time 1970-01-01T13:00:00Z 29 | * s3://path/to/history.orc 30 | * s3://path/to/output.orc 31 | */ 32 | object MergeChangesets 33 | extends CommandApp( 34 | name = "osmesa-merge-changesets", 35 | header = "Bring existing changeset ORC file up to date using changeset stream", 36 | main = { 37 | 38 | import ChangesetSource._ 39 | import MergeChangesetsImplicits._ 40 | 41 | val changesetSourceOpt = 42 | Opts 43 | .option[URI]( 44 | "changesets", 45 | short = "c", 46 | metavar = "uri", 47 | help = "Location of replication changesets" 48 | ) 49 | .validate("Changeset source must have trailing '/'") { _.getPath.endsWith("/") } 50 | 51 | val endTimeOpt = 52 | Opts 53 | .option[Instant]("end-time", 54 | short = "e", 55 | metavar = "timestamp", 56 | help = "Timestamp of stream end (of the form 2016-02-29T13:45:00Z); if absent, the time now will be used") 57 | .orNone 58 | 59 | val orcArg = Opts 60 | .argument[URI]("source ORC") 61 | .validate("URI to ORC must have an s3 or file scheme") { _.getScheme != null } 62 | .validate("orc must be an S3 or file Uri") { uri => 63 | uri.getScheme.startsWith("s3") || uri.getScheme.startsWith("file") 64 | } 65 | .validate("orc must be an .orc file") { _.getPath.endsWith(".orc") } 66 | 67 | val outputArg = Opts.argument[URI]("destination ORC") 68 | .validate("Output URI must have a scheme") { _.getScheme != null } 69 | .validate("Output URI must have an S3 or file scheme") { uri => 70 | uri.getScheme.startsWith("s3") || uri.getScheme.startsWith("file") 71 | } 72 | .validate("orc must be an .orc file") { _.getPath.endsWith(".orc") } 73 | 74 | (changesetSourceOpt, 75 | endTimeOpt, 76 | orcArg, 77 | outputArg).mapN { 78 | (changesetSource, endTime, orcUri, outputURI) => 79 | implicit val spark: SparkSession = Analytics.sparkSession("MergeChangesets") 80 | 81 | import spark.implicits._ 82 | 83 | val df = spark.read.orc(orcUri.toString) 84 | val lastModified = df.select(max(coalesce('closedAt, 'createdAt))).first.getAs[Timestamp](0) 85 | 86 | val startSequence = findSequenceFor(lastModified.toInstant, changesetSource) 87 | val endSequence = endTime.map(findSequenceFor(_, changesetSource)).getOrElse(getCurrentSequence(changesetSource).get.sequence) 88 | 89 | val options = Map( 90 | Source.BaseURI -> changesetSource.toString, 91 | Source.StartSequence -> startSequence.toString, 92 | Source.EndSequence -> (endSequence + 1).toString // sequence range is (]; end sequence is exclusive 93 | ) 94 | 95 | val changesets = spark.read.format(Source.Changesets).options(options).load 96 | 97 | // TODO: Clean up the following by providing and using a function in VP to coerce the 98 | // column names into camel case (see https://github.com/geotrellis/vectorpipe/issues/113) 99 | changesets 100 | .drop("comments", "sequence") 101 | .union(df.select( 102 | 'id, 103 | 'tags, 104 | 'createdAt, 105 | 'open, 106 | 'closedAt, 107 | 'commentsCount, 108 | 'minLat, 109 | 'maxLat, 110 | 'minLon, 111 | 'maxLon, 112 | 'numChanges, 113 | 'uid, 114 | 'user) 115 | ) 116 | .repartition(1) 117 | .write 118 | .orc(outputURI.toString) 119 | 120 | spark.stop() 121 | } 122 | } 123 | ) 124 | object MergeChangesetsImplicits { 125 | implicit val readInstant: Argument[Instant] = new Argument[Instant] { 126 | override def read(string: String): ValidatedNel[String, Instant] = { 127 | try { Validated.valid(Instant.parse(string)) } 128 | catch { case e: Exception => Validated.invalidNel(s"Invalid 
time: $string (${ e.getMessage })") } 129 | } 130 | 131 | override def defaultMetavar: String = "time" 132 | } 133 | 134 | private val formatter = DateTimeFormat.forPattern("y-M-d H:m:s.SSSSSSSSS Z") 135 | 136 | private implicit val dateTimeDecoder: Decoder[DateTime] = 137 | Decoder.instance(a => a.as[String].map(DateTime.parse(_, formatter))) 138 | } 139 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/ChangeStreamProcessor.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import osmesa.analytics.Analytics 9 | import vectorpipe.sources.Source 10 | import vectorpipe.{internal => ProcessOSM} 11 | 12 | /* 13 | * Usage example: 14 | * 15 | * sbt "project apps" assembly 16 | * 17 | * # Running an infinite stream from the beginning of time 18 | * spark-submit \ 19 | * --class osmesa.apps.streaming.ChangeStreamProcessor \ 20 | * ./analytics/target/scala-2.11/osmesa-apps.jar \ 21 | * --start-sequence 1 22 | * 23 | * This class prints the change stream out to console for debugging 24 | */ 25 | object ChangeStreamProcessor 26 | extends CommandApp( 27 | name = "osmesa-diff-stream-processor", 28 | header = "display diffs from a change stream", 29 | main = { 30 | val changeSourceOpt = 31 | Opts 32 | .option[URI]( 33 | "change-source", 34 | short = "c", 35 | metavar = "uri", 36 | help = "Location of changes to process" 37 | ) 38 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 39 | 40 | val startSequenceOpt = 41 | Opts 42 | .option[Int]( 43 | "start-sequence", 44 | short = "s", 45 | metavar = "sequence", 46 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 47 | ) 48 | .orNone 49 | 50 | val endSequenceOpt = 51 | Opts 52 | .option[Int]( 53 | "end-sequence", 54 | short = "e", 55 | metavar = "sequence", 56 | help = "Ending sequence. If absent, this will be an infinite stream." 
57 | ) 58 | .orNone 59 | 60 | val databaseUriOpt = 61 | Opts 62 | .option[URI]( 63 | "database-url", 64 | short = "d", 65 | metavar = "database URL", 66 | help = "Database URL (default: $DATABASE_URL environment variable)" 67 | ) 68 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 69 | .orNone 70 | 71 | (changeSourceOpt, startSequenceOpt, endSequenceOpt, databaseUriOpt).mapN { 72 | (changeSource, startSequence, endSequence, databaseUri) => 73 | implicit val ss: SparkSession = 74 | Analytics.sparkSession("ChangeStreamProcessor") 75 | 76 | import ss.implicits._ 77 | 78 | val options = Map( 79 | Source.BaseURI -> changeSource.toString, 80 | Source.ProcessName -> "ChangeStream" 81 | ) ++ 82 | databaseUri 83 | .map(x => Map(Source.DatabaseURI -> x.toString)) 84 | .getOrElse(Map.empty[String, String]) ++ 85 | startSequence 86 | .map(s => Map(Source.StartSequence -> s.toString)) 87 | .getOrElse(Map.empty[String, String]) ++ 88 | endSequence 89 | .map(s => Map(Source.EndSequence -> s.toString)) 90 | .getOrElse(Map.empty[String, String]) 91 | 92 | val changes = 93 | ss.readStream 94 | .format(Source.Changes) 95 | .options(options) 96 | .load 97 | 98 | val changeProcessor = changes 99 | .select('id, 'version, 'lat, 'lon, 'visible) 100 | .where('_type === ProcessOSM.NodeType and !'visible) 101 | .writeStream 102 | .queryName("display change data") 103 | .format("console") 104 | .start 105 | 106 | changeProcessor.awaitTermination() 107 | 108 | ss.stop() 109 | } 110 | } 111 | ) 112 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/ChangesetMetadataUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.TaskContext 8 | import org.apache.spark.sql._ 9 | import osmesa.analytics.Analytics 10 | import osmesa.analytics.stats.ChangesetMetadataForeachWriter 11 | import vectorpipe.functions._ 12 | import vectorpipe.functions.osm._ 13 | import vectorpipe.sources.Source 14 | 15 | /* 16 | * Usage example: 17 | * 18 | * sbt "project apps" assembly 19 | * 20 | * spark-submit \ 21 | * --class osmesa.apps.streaming.ChangesetMetadataUpdater \ 22 | * ingest/target/scala-2.11/osmesa-apps.jar \ 23 | * --database-url $DATABASE_URL 24 | */ 25 | object ChangesetMetadataUpdater 26 | extends CommandApp( 27 | name = "osmesa-changeset-metadata-updater", 28 | header = "Update changeset metadata from changeset replication", 29 | main = { 30 | val changesetSourceOpt = 31 | Opts 32 | .option[URI]("changeset-source", 33 | short = "c", 34 | metavar = "uri", 35 | help = "Location of changesets to process") 36 | .withDefault(new URI("https://planet.osm.org/replication/changesets/")) 37 | 38 | val databaseUrlOpt = 39 | Opts 40 | .option[URI]( 41 | "database-url", 42 | short = "d", 43 | metavar = "database URL", 44 | help = "Database URL (default: $DATABASE_URL environment variable)" 45 | ) 46 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 47 | 48 | val startSequenceOpt = 49 | Opts 50 | .option[Int]( 51 | "start-sequence", 52 | short = "s", 53 | metavar = "sequence", 54 | help = "Starting sequence. If absent, the current (remote) sequence will be used."
55 | ) 56 | .orNone 57 | 58 | val endSequenceOpt = 59 | Opts 60 | .option[Int]( 61 | "end-sequence", 62 | short = "e", 63 | metavar = "sequence", 64 | help = "Ending sequence. If absent, this will be an infinite stream." 65 | ) 66 | .orNone 67 | 68 | val partitionCountOpt = Opts 69 | .option[Int]("partition-count", 70 | short = "p", 71 | metavar = "partition count", 72 | help = "Change partition count.") 73 | .orNone 74 | 75 | (changesetSourceOpt, databaseUrlOpt, startSequenceOpt, endSequenceOpt, partitionCountOpt) 76 | .mapN { 77 | (changesetSource, databaseUrl, startSequence, endSequence, partitionCount) => 78 | implicit val ss: SparkSession = Analytics.sparkSession("ChangesetMetadataUpdater") 79 | 80 | import ss.implicits._ 81 | 82 | val options = Map( 83 | Source.BaseURI -> changesetSource.toString, 84 | Source.ProcessName -> "ChangesetMetadataUpdater" 85 | ) ++ 86 | startSequence 87 | .map(s => Map(Source.StartSequence -> s.toString)) 88 | .getOrElse(Map.empty) ++ 89 | endSequence 90 | .map(s => Map(Source.EndSequence -> s.toString)) 91 | .getOrElse(Map.empty) ++ 92 | partitionCount 93 | .map(x => Map(Source.PartitionCount -> x.toString)) 94 | .getOrElse(Map.empty) 95 | 96 | val changesets = 97 | ss.read 98 | .format(Source.Changesets) 99 | .options(options) 100 | .load 101 | 102 | changesets 103 | .select( 104 | 'id, 105 | 'createdAt, 106 | 'closedAt, 107 | 'user, 108 | 'uid, 109 | 'tags.getField("created_by") as 'editor, 110 | merge_sets(hashtags('tags.getField("comment")), 111 | hashtags('tags.getField("hashtags"))) as 'hashtags 112 | ) 113 | .foreachPartition(rows => { 114 | val writer = 115 | new ChangesetMetadataForeachWriter(databaseUrl, shouldUpdateUsernames = true) 116 | 117 | if (writer.open(TaskContext.getPartitionId(), 0)) { 118 | try { 119 | rows.foreach(writer.process) 120 | 121 | writer.close(null) 122 | } catch { 123 | case e: Throwable => writer.close(e) 124 | } 125 | } 126 | }) 127 | 128 | ss.stop() 129 | } 130 | } 131 | ) 132 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/ChangesetStatsUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import geotrellis.vector.{Feature, Geometry} 8 | import org.apache.spark.TaskContext 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.functions._ 11 | import osmesa.analytics.Analytics 12 | import osmesa.analytics.stats._ 13 | import osmesa.analytics.stats.functions._ 14 | import vectorpipe.functions.{flatten => _, _} 15 | import vectorpipe.functions.osm.isTagged 16 | import vectorpipe.model.ElementWithSequence 17 | import vectorpipe.sources.Source 18 | import vectorpipe.util.Geocode 19 | 20 | /* 21 | * Usage example: 22 | * 23 | * sbt "project apps" assembly 24 | * 25 | * spark-submit \ 26 | * --class osmesa.apps.streaming.ChangesetStatsUpdater \ 27 | * ingest/target/scala-2.11/osmesa-apps.jar \ 28 | * --augmented-diff-source s3://somewhere/diffs/ \ 29 | * --database-url $DATABASE_URL 30 | */ 31 | object ChangesetStatsUpdater 32 | extends CommandApp( 33 | name = "osmesa-changeset-stats-updater", 34 | header = "Update statistics from augmented diffs", 35 | main = { 36 | type AugmentedDiffFeature = Feature[Geometry, ElementWithSequence] 37 | 38 | val augmentedDiffSourceOpt = 39 | Opts 40 | .option[URI]( 41 | "augmented-diff-source", 42 | short = "a", 43 | metavar 
= "uri", 44 | help = "Location of augmented diffs to process" 45 | ) 46 | 47 | val databaseUrlOpt = 48 | Opts 49 | .option[URI]( 50 | "database-url", 51 | short = "d", 52 | metavar = "database URL", 53 | help = "Database URL (default: $DATABASE_URL environment variable)" 54 | ) 55 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 56 | 57 | val startSequenceOpt = 58 | Opts 59 | .option[Int]( 60 | "start-sequence", 61 | short = "s", 62 | metavar = "sequence", 63 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 64 | ) 65 | .orNone 66 | 67 | val endSequenceOpt = 68 | Opts 69 | .option[Int]("end-sequence", 70 | short = "e", 71 | metavar = "sequence", 72 | help = "Ending sequence. If absent, this will be an infinite stream.") 73 | .orNone 74 | 75 | val partitionCountOpt = Opts 76 | .option[Int]("partition-count", 77 | short = "p", 78 | metavar = "partition count", 79 | help = "Change partition count.") 80 | .orNone 81 | 82 | (augmentedDiffSourceOpt, 83 | startSequenceOpt, 84 | endSequenceOpt, 85 | databaseUrlOpt, 86 | partitionCountOpt).mapN { 87 | (augmentedDiffSource, startSequence, endSequence, databaseUrl, partitionCount) => 88 | implicit val ss: SparkSession = Analytics.sparkSession("ChangesetStatsUpdater") 89 | 90 | import ss.implicits._ 91 | 92 | val options = Map( 93 | Source.BaseURI -> augmentedDiffSource.toString, 94 | Source.ProcessName -> "ChangesetStatsUpdater" 95 | ) ++ 96 | startSequence 97 | .map(s => Map(Source.StartSequence -> s.toString)) 98 | .getOrElse(Map.empty) ++ 99 | endSequence 100 | .map(s => Map(Source.EndSequence -> s.toString)) 101 | .getOrElse(Map.empty) ++ 102 | partitionCount 103 | .map(x => Map(Source.PartitionCount -> x.toString)) 104 | .getOrElse(Map.empty) 105 | 106 | val geoms = ss.read.format(Source.AugmentedDiffs).options(options).load 107 | 108 | Geocode(geoms.where(isTagged('tags))) 109 | .withLinearDelta 110 | .withAreaDelta 111 | .select( 112 | 'sequence, 113 | 'changeset, 114 | 'uid, 115 | 'user, 116 | 'countries, 117 | DefaultMeasurements, 118 | DefaultCounts 119 | ) 120 | .groupBy('sequence, 'changeset, 'uid, 'user) 121 | .agg( 122 | sum_measurements(collect_list('measurements)) as 'measurements, 123 | sum_counts(collect_list('counts)) as 'counts, 124 | count_values(flatten(collect_list('countries))) as 'countries 125 | ) 126 | .withColumn("totalEdits", sum_count_values('counts)) 127 | .foreachPartition(rows => { 128 | val writer = 129 | new ChangesetStatsForeachWriter(databaseUrl, shouldUpdateUsernames = true) 130 | 131 | if (writer.open(TaskContext.getPartitionId(), 0)) { 132 | try { 133 | rows.foreach(writer.process) 134 | 135 | writer.close(null) 136 | } catch { 137 | case e: Throwable => writer.close(e) 138 | } 139 | } 140 | }) 141 | 142 | ss.stop() 143 | } 144 | } 145 | ) 146 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/EditHistogramTileUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.io._ 4 | import java.net.URI 5 | 6 | import cats.implicits._ 7 | import com.monovore.decline._ 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.functions._ 10 | import org.locationtech.geomesa.spark.jts._ 11 | import osmesa.analytics.{Analytics, EditHistogram} 12 | import vectorpipe.sources.Source 13 | 14 | /* 15 | * Usage example: 16 | * 17 | * sbt "project apps" assembly 18 | * 19 | * spark-submit \ 20 
| * --class osmesa.apps.streaming.EditHistogramTileUpdater \ 21 | * ingest/target/scala-2.11/osmesa-apps.jar 22 | */ 23 | object EditHistogramTileUpdater 24 | extends CommandApp( 25 | name = "osmesa-edit-histogram-tile-updater", 26 | header = "Consume minutely diffs to update edit histogram MVTs", 27 | main = { 28 | val changeSourceOpt = Opts 29 | .option[URI]("source", 30 | short = "d", 31 | metavar = "uri", 32 | help = "Location of minutely diffs to process") 33 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 34 | 35 | val startSequenceOpt = Opts 36 | .option[Int]( 37 | "start-sequence", 38 | short = "s", 39 | metavar = "sequence", 40 | help = 41 | "Minutely diff starting sequence. If absent, the current (remote) sequence will be used.") 42 | .orNone 43 | 44 | val endSequenceOpt = Opts 45 | .option[Int]( 46 | "end-sequence", 47 | short = "e", 48 | metavar = "sequence", 49 | help = 50 | "Minutely diff ending sequence. If absent, the current (remote) sequence will be used.") 51 | .orNone 52 | 53 | val partitionCountOpt = Opts 54 | .option[Int]("partition-count", 55 | short = "p", 56 | metavar = "partition count", 57 | help = "Change partition count.") 58 | .orNone 59 | 60 | val tileSourceOpt = Opts 61 | .option[URI]( 62 | "tile-source", 63 | short = "t", 64 | metavar = "uri", 65 | help = "URI prefix of MVTs to update" 66 | ) 67 | .withDefault(new File("").toURI) 68 | 69 | val concurrentUploadsOpt = Opts 70 | .option[Int]("concurrent-uploads", 71 | short = "c", 72 | metavar = "concurrent uploads", 73 | help = "Set the number of concurrent uploads.") 74 | .orNone 75 | 76 | val baseZoomOpt = Opts 77 | .option[Int]("base-zoom", 78 | short = "z", 79 | metavar = "Base zoom", 80 | help = "Most detailed zoom level") 81 | .orNone 82 | 83 | (changeSourceOpt, 84 | startSequenceOpt, 85 | endSequenceOpt, 86 | partitionCountOpt, 87 | tileSourceOpt, 88 | concurrentUploadsOpt, 89 | baseZoomOpt).mapN { 90 | (changeSource, 91 | startSequence, 92 | endSequence, 93 | partitionCount, 94 | tileSource, 95 | _concurrentUploads, 96 | baseZoom) => 97 | val AppName = "EditHistogramTileUpdater" 98 | 99 | val spark: SparkSession = Analytics.sparkSession(AppName) 100 | import spark.implicits._ 101 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 102 | spark.withJTS 103 | 104 | val changeOptions = Map(Source.BaseURI -> changeSource.toString) ++ 105 | startSequence 106 | .map(x => Map(Source.StartSequence -> x.toString)) 107 | .getOrElse(Map.empty) ++ 108 | endSequence 109 | .map(x => Map(Source.EndSequence -> x.toString)) 110 | .getOrElse(Map.empty) ++ 111 | partitionCount 112 | .map(x => Map(Source.PartitionCount -> x.toString)) 113 | .getOrElse(Map.empty) 114 | 115 | val changes = spark.read 116 | .format(Source.Changes) 117 | .options(changeOptions) 118 | .load 119 | 120 | val changedNodes = changes 121 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 122 | .select('sequence, 123 | st_makePoint('lon, 'lat) as 'geom, 124 | year('timestamp) * 1000 + dayofyear('timestamp) as 'key) 125 | 126 | val tiledNodes = EditHistogram.update(changedNodes, 127 | tileSource, 128 | baseZoom.getOrElse(EditHistogram.DefaultBaseZoom)) 129 | 130 | val lastSequence = 131 | changedNodes.select(max('sequence) as 'sequence).first.getAs[Int]("sequence") 132 | 133 | println(s"${tiledNodes.count} tiles updated to ${lastSequence}.") 134 | } 135 | } 136 | ) 137 | -------------------------------------------------------------------------------- 
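The batch updater above and the streaming updaters that follow all assemble their vectorpipe reader options the same way: a base map of required settings, with optional CLI values folded in only when present. A minimal sketch of that pattern, assuming stand-in key names (the real constants come from vectorpipe.sources.Source; SourceOptionsSketch is illustrative, not part of this repository):

// Illustrative only: stand-in keys; not project code.
object SourceOptionsSketch {
  val BaseURI = "base_uri"
  val StartSequence = "start_sequence"
  val EndSequence = "end_sequence"

  /** Optional CLI values are added to the options map only when they were supplied. */
  def options(baseUri: String, start: Option[Int], end: Option[Int]): Map[String, String] =
    Map(BaseURI -> baseUri) ++
      start.map(s => Map(StartSequence -> s.toString)).getOrElse(Map.empty[String, String]) ++
      end.map(e => Map(EndSequence -> e.toString)).getOrElse(Map.empty[String, String])

  def main(args: Array[String]): Unit =
    // prints: Map(base_uri -> https://planet.osm.org/replication/minute/, start_sequence -> 1)
    println(options("https://planet.osm.org/replication/minute/", Some(1), None))
}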
/src/apps/src/main/scala/osmesa/apps/streaming/StreamingChangesetMetadataUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.net.URI 4 | 5 | import cats.implicits._ 6 | import com.monovore.decline._ 7 | import org.apache.spark.sql._ 8 | import osmesa.analytics.Analytics 9 | import osmesa.analytics.stats.ChangesetMetadataForeachWriter 10 | import vectorpipe.functions._ 11 | import vectorpipe.functions.osm._ 12 | import vectorpipe.sources.Source 13 | 14 | /* 15 | * Usage example: 16 | * 17 | * sbt "project apps" assembly 18 | * 19 | * spark-submit \ 20 | * --class osmesa.apps.streaming.StreamingChangesetMetadataUpdater \ 21 | * ingest/target/scala-2.11/osmesa-apps.jar \ 22 | * --database-url $DATABASE_URL 23 | */ 24 | object StreamingChangesetMetadataUpdater 25 | extends CommandApp( 26 | name = "osmesa-changeset-stream-processor", 27 | header = "Update statistics from changeset replication stream", 28 | main = { 29 | val changesetSourceOpt = 30 | Opts 31 | .option[URI]("changeset-source", 32 | short = "c", 33 | metavar = "uri", 34 | help = "Location of changesets to process") 35 | .withDefault(new URI("https://planet.osm.org/replication/changesets/")) 36 | 37 | val databaseUrlOpt = 38 | Opts 39 | .option[URI]( 40 | "database-url", 41 | short = "d", 42 | metavar = "database URL", 43 | help = "Database URL (default: $DATABASE_URL environment variable)" 44 | ) 45 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 46 | 47 | val startSequenceOpt = 48 | Opts 49 | .option[Int]( 50 | "start-sequence", 51 | short = "s", 52 | metavar = "sequence", 53 | help = "Starting sequence. If absent, the current (remote) sequence will be used." 54 | ) 55 | .orNone 56 | 57 | val endSequenceOpt = 58 | Opts 59 | .option[Int]( 60 | "end-sequence", 61 | short = "e", 62 | metavar = "sequence", 63 | help = "Ending sequence. If absent, this will be an infinite stream." 
64 | ) 65 | .orNone 66 | 67 | val batchSizeOpt = Opts 68 | .option[Int]("batch-size", 69 | short = "b", 70 | metavar = "batch size", 71 | help = "Change batch size.") 72 | .orNone 73 | 74 | (changesetSourceOpt, databaseUrlOpt, startSequenceOpt, endSequenceOpt, batchSizeOpt).mapN { 75 | (changesetSource, databaseUrl, startSequence, endSequence, batchSize) => 76 | implicit val ss: SparkSession = 77 | Analytics.sparkSession("StreamingChangesetMetadataUpdater") 78 | 79 | import ss.implicits._ 80 | 81 | val options = Map( 82 | Source.BaseURI -> changesetSource.toString, 83 | Source.DatabaseURI -> databaseUrl.toString, 84 | Source.ProcessName -> "ChangesetMetadataUpdater" 85 | ) ++ 86 | startSequence 87 | .map(s => Map(Source.StartSequence -> s.toString)) 88 | .getOrElse(Map.empty) ++ 89 | endSequence 90 | .map(s => Map(Source.EndSequence -> s.toString)) 91 | .getOrElse(Map.empty) ++ 92 | batchSize 93 | .map(x => Map(Source.BatchSize -> x.toString)) 94 | .getOrElse(Map.empty) 95 | 96 | val changesets = 97 | ss.readStream 98 | .format(Source.Changesets) 99 | .options(options) 100 | .load 101 | 102 | val changesetProcessor = changesets 103 | .select( 104 | 'id, 105 | 'createdAt, 106 | 'closedAt, 107 | 'user, 108 | 'uid, 109 | 'tags.getField("created_by") as 'editor, 110 | merge_sets(hashtags('tags.getField("comment")), 111 | hashtags('tags.getField("hashtags"))) as 'hashtags 112 | ) 113 | .writeStream 114 | .queryName("update changeset metadata") 115 | .foreach(new ChangesetMetadataForeachWriter(databaseUrl, 116 | shouldUpdateUsernames = true)) 117 | .start 118 | 119 | changesetProcessor.awaitTermination() 120 | 121 | ss.stop() 122 | } 123 | } 124 | ) 125 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/StreamingEditHistogramTileUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.io._ 4 | import java.net.URI 5 | 6 | import cats.implicits._ 7 | import com.monovore.decline._ 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.functions._ 10 | import org.locationtech.geomesa.spark.jts._ 11 | import osmesa.analytics.{Analytics, EditHistogram} 12 | import vectorpipe.sources.Source 13 | 14 | /* 15 | * Usage example: 16 | * 17 | * sbt "project apps" assembly 18 | * 19 | * spark-submit \ 20 | * --class osmesa.apps.streaming.StreamingEditHistogramTileUpdater \ 21 | * ingest/target/scala-2.11/osmesa-apps.jar 22 | */ 23 | object StreamingEditHistogramTileUpdater 24 | extends CommandApp( 25 | name = "osmesa-edit-histogram-tile-updater", 26 | header = "Consume minutely diffs to update edit histogram MVTs", 27 | main = { 28 | val changeSourceOpt = Opts 29 | .option[URI]("source", 30 | short = "d", 31 | metavar = "uri", 32 | help = "Location of minutely diffs to process") 33 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 34 | 35 | val startSequenceOpt = Opts 36 | .option[Int]( 37 | "start-sequence", 38 | short = "s", 39 | metavar = "sequence", 40 | help = 41 | "Minutely diff starting sequence. 
If absent, the current (remote) sequence will be used.") 42 | .orNone 43 | 44 | val batchSizeOpt = Opts 45 | .option[Int]("batch-size", 46 | short = "b", 47 | metavar = "batch size", 48 | help = "Change batch size.") 49 | .orNone 50 | 51 | val tileSourceOpt = Opts 52 | .option[URI]( 53 | "tile-source", 54 | short = "t", 55 | metavar = "uri", 56 | help = "URI prefix of MVTs to update" 57 | ) 58 | .withDefault(new File("").toURI) 59 | 60 | val concurrentUploadsOpt = Opts 61 | .option[Int]("concurrent-uploads", 62 | short = "c", 63 | metavar = "concurrent uploads", 64 | help = "Set the number of concurrent uploads.") 65 | .orNone 66 | 67 | val databaseUrlOpt = 68 | Opts 69 | .option[URI]( 70 | "database-url", 71 | short = "d", 72 | metavar = "database URL", 73 | help = "Database URL (default: DATABASE_URL environment variable)" 74 | ) 75 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 76 | .orNone 77 | 78 | val baseZoomOpt = Opts 79 | .option[Int]("base-zoom", 80 | short = "z", 81 | metavar = "Base zoom", 82 | help = "Most detailed zoom level") 83 | .orNone 84 | 85 | (changeSourceOpt, 86 | startSequenceOpt, 87 | batchSizeOpt, 88 | tileSourceOpt, 89 | concurrentUploadsOpt, 90 | databaseUrlOpt, 91 | baseZoomOpt).mapN { 92 | (changeSource, 93 | startSequence, 94 | batchSize, 95 | tileSource, 96 | _concurrentUploads, 97 | databaseUrl, 98 | baseZoom) => 99 | val AppName = "EditHistogramTileUpdater" 100 | 101 | val spark: SparkSession = Analytics.sparkSession(AppName) 102 | import spark.implicits._ 103 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 104 | spark.withJTS 105 | 106 | val changeOptions = Map(Source.BaseURI -> changeSource.toString, 107 | Source.ProcessName -> AppName) ++ 108 | databaseUrl 109 | .map(x => Map(Source.DatabaseURI -> x.toString)) 110 | .getOrElse(Map.empty) ++ 111 | startSequence 112 | .map(x => Map(Source.StartSequence -> x.toString)) 113 | .getOrElse(Map.empty) ++ 114 | batchSize 115 | .map(x => Map(Source.BatchSize -> x.toString)) 116 | .getOrElse(Map.empty) 117 | 118 | val changes = spark.readStream 119 | .format(Source.Changes) 120 | .options(changeOptions) 121 | .load 122 | 123 | val changedNodes = changes 124 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 125 | .select('sequence, 126 | year('timestamp) * 1000 + dayofyear('timestamp) as 'key, 127 | st_makePoint('lon, 'lat) as 'geom) 128 | 129 | val tiledNodes = EditHistogram.update(changedNodes, 130 | tileSource, 131 | baseZoom.getOrElse(EditHistogram.DefaultBaseZoom)) 132 | 133 | val query = tiledNodes.writeStream 134 | .queryName("edit histogram tiles") 135 | .format("console") 136 | .start 137 | 138 | query.awaitTermination() 139 | 140 | spark.stop() 141 | } 142 | } 143 | ) 144 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/StreamingUserFootprintTileUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.io._ 4 | import java.net.URI 5 | 6 | import cats.implicits._ 7 | import com.monovore.decline._ 8 | import org.apache.spark.sql._ 9 | import org.locationtech.geomesa.spark.jts._ 10 | import osmesa.analytics.{Analytics, Footprints} 11 | import vectorpipe.sources.Source 12 | 13 | /* 14 | * Usage example: 15 | * 16 | * sbt "project apps" assembly 17 | * 18 | * spark-submit \ 19 | * --class osmesa.apps.streaming.StreamingUserFootprintUpdater \ 20 | * 
ingest/target/scala-2.11/osmesa-apps.jar 21 | */ 22 | object StreamingUserFootprintTileUpdater 23 | extends CommandApp( 24 | name = "osmesa-user-footprint-updater", 25 | header = "Consume minutely diffs to update user footprint MVTs", 26 | main = { 27 | val changeSourceOpt = Opts 28 | .option[URI]("change-source", 29 | short = "d", 30 | metavar = "uri", 31 | help = "Location of minutely diffs to process") 32 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 33 | 34 | val startSequenceOpt = Opts 35 | .option[Int]( 36 | "start-sequence", 37 | short = "s", 38 | metavar = "sequence", 39 | help = 40 | "Minutely diff starting sequence. If absent, the current (remote) sequence will be used.") 41 | .orNone 42 | 43 | val batchSizeOpt = Opts 44 | .option[Int]("batch-size", 45 | short = "b", 46 | metavar = "batch size", 47 | help = "Change batch size.") 48 | .orNone 49 | 50 | val tileSourceOpt = Opts 51 | .option[URI]( 52 | "tile-source", 53 | short = "t", 54 | metavar = "uri", 55 | help = "URI prefix for vector tiles to update" 56 | ) 57 | .withDefault(new File("").toURI) 58 | 59 | val concurrentUploadsOpt = Opts 60 | .option[Int]("concurrent-uploads", 61 | short = "c", 62 | metavar = "concurrent uploads", 63 | help = "Set the number of concurrent uploads.") 64 | .orNone 65 | 66 | val databaseUrlOpt = 67 | Opts 68 | .option[URI]( 69 | "database-url", 70 | short = "d", 71 | metavar = "database URL", 72 | help = "Database URL (default: DATABASE_URL environment variable)" 73 | ) 74 | .orElse(Opts.env[URI]("DATABASE_URL", help = "The URL of the database")) 75 | .orNone 76 | 77 | (changeSourceOpt, 78 | startSequenceOpt, 79 | batchSizeOpt, 80 | tileSourceOpt, 81 | concurrentUploadsOpt, 82 | databaseUrlOpt).mapN { 83 | (changeSource, startSequence, batchSize, tileSource, _concurrentUploads, databaseUrl) => 84 | val AppName = "UserFootprintUpdater" 85 | 86 | val spark: SparkSession = Analytics.sparkSession(AppName) 87 | import spark.implicits._ 88 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 89 | spark.withJTS 90 | 91 | val changeOptions = Map(Source.BaseURI -> changeSource.toString, 92 | Source.ProcessName -> AppName) ++ 93 | databaseUrl 94 | .map(x => Map(Source.DatabaseURI -> x.toString)) 95 | .getOrElse(Map.empty) ++ 96 | startSequence 97 | .map(x => Map(Source.StartSequence -> x.toString)) 98 | .getOrElse(Map.empty) ++ 99 | batchSize 100 | .map(x => Map(Source.BatchSize -> x.toString)) 101 | .getOrElse(Map.empty) 102 | 103 | val changes = spark.readStream 104 | .format(Source.Changes) 105 | .options(changeOptions) 106 | .load 107 | 108 | val changedNodes = changes 109 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 110 | .select('sequence, 'uid as 'key, st_makePoint('lon, 'lat) as 'geom) 111 | 112 | val tiledNodes = 113 | Footprints.update(changedNodes, tileSource) 114 | 115 | val query = tiledNodes.writeStream 116 | .queryName("tiled user footprints") 117 | .format("console") 118 | .start 119 | 120 | query.awaitTermination() 121 | 122 | spark.stop() 123 | } 124 | } 125 | ) 126 | -------------------------------------------------------------------------------- /src/apps/src/main/scala/osmesa/apps/streaming/UserFootprintUpdater.scala: -------------------------------------------------------------------------------- 1 | package osmesa.apps.streaming 2 | 3 | import java.io._ 4 | import java.net.URI 5 | 6 | import cats.implicits._ 7 | import com.monovore.decline._ 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.functions._ 10 | 
import org.locationtech.geomesa.spark.jts._ 11 | import osmesa.analytics.{Analytics, Footprints} 12 | import vectorpipe.sources.Source 13 | 14 | /* 15 | * Usage example: 16 | * 17 | * sbt "project apps" assembly 18 | * 19 | * spark-submit \ 20 | * --class osmesa.apps.streaming.UserFootprintUpdater \ 21 | * ingest/target/scala-2.11/osmesa-apps.jar 22 | */ 23 | object UserFootprintUpdater 24 | extends CommandApp( 25 | name = "osmesa-user-footprint-updater", 26 | header = "Consume minutely diffs to update user footprint MVTs", 27 | main = { 28 | val changeSourceOpt = Opts 29 | .option[URI]("change-source", 30 | short = "d", 31 | metavar = "uri", 32 | help = "Location of minutely diffs to process") 33 | .withDefault(new URI("https://planet.osm.org/replication/minute/")) 34 | 35 | val startSequenceOpt = Opts 36 | .option[Int]( 37 | "start-sequence", 38 | short = "s", 39 | metavar = "sequence", 40 | help = 41 | "Minutely diff starting sequence. If absent, the current (remote) sequence will be used.") 42 | .orNone 43 | 44 | val endSequenceOpt = Opts 45 | .option[Int]( 46 | "end-sequence", 47 | short = "e", 48 | metavar = "sequence", 49 | help = 50 | "Minutely diff ending sequence. If absent, the current (remote) sequence will be used.") 51 | .orNone 52 | 53 | val partitionCountOpt = Opts 54 | .option[Int]("partition-count", 55 | short = "p", 56 | metavar = "partition count", 57 | help = "Change partition count.") 58 | .orNone 59 | 60 | val tileSourceOpt = Opts 61 | .option[URI]( 62 | "tile-source", 63 | short = "t", 64 | metavar = "uri", 65 | help = "URI prefix for vector tiles to update" 66 | ) 67 | .withDefault(new File("").toURI) 68 | 69 | val concurrentUploadsOpt = Opts 70 | .option[Int]("concurrent-uploads", 71 | short = "c", 72 | metavar = "concurrent uploads", 73 | help = "Set the number of concurrent uploads.") 74 | .orNone 75 | 76 | (changeSourceOpt, 77 | startSequenceOpt, 78 | endSequenceOpt, 79 | partitionCountOpt, 80 | tileSourceOpt, 81 | concurrentUploadsOpt).mapN { 82 | (changeSource, 83 | startSequence, 84 | endSequence, 85 | partitionCount, 86 | tileSource, 87 | _concurrentUploads) => 88 | val AppName = "UserFootprintUpdater" 89 | 90 | val spark: SparkSession = Analytics.sparkSession(AppName) 91 | import spark.implicits._ 92 | implicit val concurrentUploads: Option[Int] = _concurrentUploads 93 | spark.withJTS 94 | 95 | val changeOptions = Map(Source.BaseURI -> changeSource.toString) ++ 96 | startSequence 97 | .map(x => Map(Source.StartSequence -> x.toString)) 98 | .getOrElse(Map.empty) ++ 99 | endSequence 100 | .map(x => Map(Source.EndSequence -> x.toString)) 101 | .getOrElse(Map.empty) ++ 102 | partitionCount 103 | .map(x => Map(Source.PartitionCount -> x.toString)) 104 | .getOrElse(Map.empty) 105 | 106 | val changes = spark.read 107 | .format(Source.Changes) 108 | .options(changeOptions) 109 | .load 110 | 111 | val changedNodes = changes 112 | .where('type === "node" and 'lat.isNotNull and 'lon.isNotNull) 113 | .select('sequence, 'uid as 'key, st_makePoint('lon, 'lat) as 'geom) 114 | 115 | val tiledNodes = 116 | Footprints.update(changedNodes, tileSource) 117 | 118 | val lastSequence = 119 | changedNodes.select(max('sequence) as 'sequence).first.getAs[Int]("sequence") 120 | 121 | println(s"${tiledNodes.count} tiles updated to ${lastSequence}.") 122 | } 123 | } 124 | ) 125 | -------------------------------------------------------------------------------- /src/bench/src/main/scala/osmesa/Bench.scala: -------------------------------------------------------------------------------- 
1 | // package osmesa 2 | 3 | // import java.util.concurrent.TimeUnit 4 | 5 | // import scala.util.Try 6 | 7 | // import cats.implicits._ 8 | // import org.apache.log4j 9 | // import org.apache.spark._ 10 | // import org.apache.spark.sql._ 11 | // import org.openjdk.jmh.annotations._ 12 | // import osmesa.analytics.oneoffs.Analysis 13 | 14 | // // --- // 15 | 16 | // @BenchmarkMode(Array(Mode.AverageTime)) 17 | // @OutputTimeUnit(TimeUnit.SECONDS) 18 | // @State(Scope.Thread) 19 | // class Bench { 20 | 21 | // var conf: SparkConf = _ 22 | // implicit var ss: SparkSession = _ 23 | 24 | // @Setup 25 | // def setup: Unit = { 26 | // conf = new SparkConf() 27 | // .setIfMissing("spark.master", "local[*]") 28 | // .setAppName("road-changes") 29 | // .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 30 | // .set("spark.kryo.registrator", classOf[geotrellis.spark.io.kryo.KryoRegistrator].getName) 31 | 32 | // ss = SparkSession.builder.config(conf).enableHiveSupport.getOrCreate 33 | 34 | // /* Silence the damn INFO logger */ 35 | // log4j.Logger.getRootLogger().setLevel(log4j.Level.ERROR) 36 | // } 37 | 38 | // @TearDown 39 | // def close: Unit = ss.stop() 40 | 41 | // @Benchmark 42 | // def roads: Try[Double] = { 43 | // val path: String = "/home/colin/code/azavea/vectorpipe/data/isle-of-man.orc" 44 | 45 | // (Try(ss.read.orc(path)) >>= Analysis.newRoadsByUser).map(_.aggregate(0d)({ _ + _._2 }, { _ + _ })) 46 | // } 47 | 48 | // } 49 | -------------------------------------------------------------------------------- /src/bench/src/main/scala/osmesa/MetresBench.scala: -------------------------------------------------------------------------------- 1 | package osmesa 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import geotrellis.vector.{Point, Line} 6 | import geotrellis.util.Haversine 7 | import org.openjdk.jmh.annotations._ 8 | 9 | // --- // 10 | 11 | @BenchmarkMode(Array(Mode.AverageTime)) 12 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 13 | @State(Scope.Thread) 14 | class MetresBench { 15 | 16 | var line0: Line = _ 17 | var line1: Line = _ 18 | 19 | @Setup 20 | def setup: Unit = { 21 | line0 = Line((0 to 9).map(n => Point(n, n))) 22 | line1 = Line((0 to 90).map(n => Point(n, n))) 23 | } 24 | 25 | def iterator(line: Line): Double = { 26 | val ps: List[Point] = line.points.toList 27 | val pairs: Iterator[(Point, Point)] = ps.iterator.zip(ps.tail.iterator) 28 | 29 | pairs.foldLeft(0d) { case (acc, (p,c)) => acc + Haversine(p.x, p.y, c.x, c.y) } 30 | } 31 | 32 | def manual(line: Line): Double = { 33 | val geom = line.jtsGeom 34 | 35 | (0 until (geom.getNumPoints - 1)).map { i => 36 | val p = geom.getPointN(i) 37 | val c = geom.getPointN(i + 1) 38 | 39 | Haversine(p.getX, p.getY, c.getX, c.getY) 40 | } reduce (_ + _) 41 | } 42 | 43 | def whiley(line: Line): Double = { 44 | val geom = line.jtsGeom 45 | var i: Int = 0 46 | var r: Double = 0 47 | 48 | while (i < geom.getNumPoints - 1) { 49 | val p = geom.getPointN(i) 50 | val c = geom.getPointN(i + 1) 51 | 52 | r += Haversine(p.getX, p.getY, c.getX, c.getY) 53 | i += 1 54 | } 55 | 56 | r 57 | } 58 | 59 | def sliding(line: Line): Double = { 60 | val geom = line.jtsGeom 61 | 62 | line.points.sliding(2) 63 | .map(pair => Haversine(pair.head.x, pair.head.y, pair.last.x, pair.last.y)) 64 | .foldLeft(0d) { _ + _ } 65 | } 66 | 67 | @Benchmark 68 | def iterator10: Double = iterator(line0) 69 | @Benchmark 70 | def iterator100: Double = iterator(line1) 71 | 72 | @Benchmark 73 | def manual10: Double = manual(line0) 74 | @Benchmark 75 | def 
manual100: Double = manual(line1) 76 | 77 | @Benchmark 78 | def while10: Double = whiley(line0) 79 | @Benchmark 80 | def while100: Double = whiley(line1) 81 | 82 | @Benchmark 83 | def sliding10: Double = sliding(line0) 84 | @Benchmark 85 | def sliding100: Double = sliding(line1) 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/bench/src/main/scala/osmesa/SAXBench.scala: -------------------------------------------------------------------------------- 1 | package osmesa 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import org.apache.commons.io.IOUtils 6 | import org.openjdk.jmh.annotations._ 7 | import vectorpipe.model.{Actions, Change} 8 | 9 | import java.util.zip.GZIPInputStream 10 | import javax.xml.parsers.{SAXParser, SAXParserFactory} 11 | import scala.xml.XML 12 | 13 | // --- // 14 | 15 | @BenchmarkMode(Array(Mode.AverageTime)) 16 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 17 | @State(Scope.Thread) 18 | class SAXBench { 19 | 20 | val sequence = 0 21 | 22 | @Setup 23 | def setup: Unit = { 24 | } 25 | 26 | def gzipInputStream(): GZIPInputStream = { 27 | // requires the addition of a gzipped OSC file in bench/src/main/resources 28 | val stream = getClass.getResourceAsStream("/942.osc.gz") 29 | new GZIPInputStream(stream) 30 | } 31 | 32 | def withScalaXML(): Int = { 33 | // requires Change.fromXML (see commit 1b04a1e81f1a88f374a086c98d58677ec537b1bf) 34 | val data = XML.loadString(IOUtils.toString(gzipInputStream)) 35 | 36 | val changes = (data \ "_").flatMap { node => 37 | (node \ "_").map(Change.fromXML(_, Actions.fromString(node.label), sequence)) 38 | } 39 | 40 | changes.length 41 | } 42 | 43 | def withSAXParser(): Int = { 44 | val factory = SAXParserFactory.newInstance 45 | val parser = factory.newSAXParser 46 | val handler = new Change.ChangeHandler(sequence) 47 | parser.parse(gzipInputStream(), handler) 48 | handler.changeSeq.length 49 | } 50 | 51 | @Benchmark 52 | def useScala: Double = withScalaXML() 53 | @Benchmark 54 | def getSAXyGirl: Double = withSAXParser() 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/bm/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "bm-standalone" 4 | 5 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.6.7" 6 | dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.7" 7 | dependencyOverrides += "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.6.7" 8 | 9 | def excludeVP(module: ModuleID): ModuleID = 10 | module.excludeAll(ExclusionRule("com.azavea", "vectorpipe")) 11 | 12 | libraryDependencies ~= (_.map(excludeVP)) 13 | 14 | libraryDependencies ++= Seq( 15 | decline, 16 | sparkHive % "provided", 17 | "com.google.protobuf" % "protobuf-java" % "2.5.0", 18 | cats, 19 | gtS3, 20 | gtSparkTestKit, 21 | logging, 22 | scalatest, 23 | "com.azavea" %% "vectorpipe" % "0.2.2", 24 | "org.jblas" % "jblas" % "1.2.4" 25 | ) 26 | 27 | /* Fixes Spark breakage with `sbt run` as of sbt-1.0.2 */ 28 | fork in run := true 29 | 30 | fork in Test := true 31 | 32 | test in assembly := {} 33 | 34 | javaOptions ++= Seq("-Xmx5G") 35 | 36 | initialCommands in console := 37 | """ 38 | """ 39 | 40 | assemblyJarName in assembly := "bm-standalone.jar" 41 | 42 | assemblyShadeRules in assembly := { 43 | val shadePackage = "com.azavea.shaded.demo" 44 | Seq( 45 | ShadeRule.rename("com.google.common.**" -> 
s"$shadePackage.google.common.@1") 46 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-cassandra" % Version.geotrellis).inAll, 47 | ShadeRule.rename("io.netty.**" -> s"$shadePackage.io.netty.@1") 48 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-hbase" % Version.geotrellis).inAll, 49 | ShadeRule.rename("com.fasterxml.jackson.**" -> s"$shadePackage.com.fasterxml.jackson.@1") 50 | .inLibrary("com.networknt" % "json-schema-validator" % "0.1.7").inAll, 51 | ShadeRule.rename("org.apache.avro.**" -> s"$shadePackage.org.apache.avro.@1") 52 | .inLibrary("com.azavea.geotrellis" %% "geotrellis-spark" % Version.geotrellis).inAll 53 | ) 54 | } 55 | 56 | assemblyMergeStrategy in assembly := { 57 | case s if s.startsWith("META-INF/services") => MergeStrategy.concat 58 | case "reference.conf" | "application.conf" => MergeStrategy.concat 59 | case "META-INF/MANIFEST.MF" | "META-INF\\MANIFEST.MF" => MergeStrategy.discard 60 | case "META-INF/ECLIPSEF.RSA" | "META-INF/ECLIPSEF.SF" => MergeStrategy.discard 61 | case _ => MergeStrategy.first 62 | } 63 | 64 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 65 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/Downsample.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | 5 | import org.apache.spark.rdd.RDD 6 | 7 | import vectorpipe.osm._ 8 | 9 | import monocle.macros.GenLens 10 | import com.vividsolutions.jts.algorithm.Centroid 11 | 12 | 13 | object Downsample { 14 | 15 | val tags = GenLens[vectorpipe.osm.ElementMeta](_.tags) 16 | 17 | def transmute(rdd: RDD[OSMFeature]) = { 18 | rdd.map({ f => 19 | val geom = { 20 | val _geom = Centroid.getCentroid(f.geom.jtsGeom) 21 | Point(_geom.x, _geom.y) 22 | } 23 | val data = tags.set(f.data.tags + ("multiplicity" -> 1.toString))(f.data) 24 | new OSMFeature(geom, data) 25 | }) 26 | } 27 | 28 | private def getAddress(f: OSMFeature, zoom: Int): (Double, Double) = { 29 | f.geom match { 30 | case p: Point => 31 | val u: Double = (p.x + 180.0)/360.0 32 | val v: Double = (p.y + 90.0)/180.0 33 | val x: Long = java.lang.Double.doubleToRawLongBits(u) >> (48-zoom) 34 | val y: Long = java.lang.Double.doubleToRawLongBits(v) >> (48-zoom) 35 | (x, y) 36 | case _ => throw new Exception 37 | } 38 | } 39 | 40 | def apply(rdd: RDD[OSMFeature], zoom: Int) = { 41 | rdd 42 | .map({ f => (getAddress(f, zoom), f) }) 43 | .reduceByKey({ case (f1: OSMFeature, f2: OSMFeature) => 44 | val mult1 = f1.data.tags.getOrElse("multiplicity", throw new Exception).toInt 45 | val mult2 = f2.data.tags.getOrElse("multiplicity", throw new Exception).toInt 46 | val geom = f1.geom 47 | val data = tags.set(f1.data.tags + ("multiplicity" -> (mult1+mult2).toString))(f1.data) 48 | new OSMFeature(geom, data) 49 | }) 50 | .values 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/Homography.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | 5 | import vectorpipe.osm._ 6 | 7 | import org.jblas.{DoubleMatrix, Eigen, Singular} 8 | 9 | 10 | object Homography { 11 | 12 | private def pairToRows( 13 | a: Point, b: Point, 14 | xbar: Double, ybar: Double, 15 | maxabsx: Double, maxabsy: Double 16 | ) = { 17 | val x: Double = (a.x - xbar) / maxabsx 18 | val y: Double = (a.y - ybar) / 
maxabsy 19 | val u: Double = (b.x - xbar) / maxabsx 20 | val v: Double = (b.y - ybar) / maxabsy 21 | 22 | Array( 23 | (new DoubleMatrix(Array(-x, -y, -1.0, 0.0, 0.0, 0.0, u*x, u*y, u))).transpose, 24 | (new DoubleMatrix(Array(0.0, 0.0, 0.0, -x, -y, -1.0, v*x, v*y, v))).transpose 25 | ) 26 | } 27 | 28 | def dlt(pairs: Seq[(Point, Point)], cx: Double, cy: Double): DoubleMatrix = { 29 | val m = new DoubleMatrix(pairs.length * 2, 9) 30 | 31 | pairs 32 | .flatMap({ case (a: Point, b: Point) => pairToRows(a, b, cx, cy, 1e-5, 1e-5) }) 33 | .zipWithIndex 34 | .foreach({ case (c: DoubleMatrix, i: Int) => m.putRow(i, c) }) 35 | 36 | val svd = Singular.fullSVD(m) 37 | val h = svd(2).getColumn(8).reshape(3,3).transpose() 38 | val h33 = h.get(2,2) 39 | 40 | h.div(h33) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/QuadTreePartitioner.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | 5 | import vectorpipe.osm._ 6 | 7 | import org.apache.spark.{Partitioner, HashPartitioner } 8 | import org.apache.spark.rdd.RDD 9 | 10 | 11 | class QuadTreePartitioner(divisionSet: Set[Int], partitions: Int) extends Partitioner { 12 | 13 | val maxDivisions = divisionSet.reduce(math.max) 14 | 15 | require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.") 16 | 17 | val hashPartitioner = new HashPartitioner(partitions) 18 | 19 | private def step( 20 | bits: Int, 21 | _xmin: Double, _ymin: Double, 22 | _xmax: Double, _ymax:Double 23 | ): (Double, Double, Double, Double) = { 24 | var xmin = _xmin 25 | var ymin = _ymin 26 | var xmax = _xmax 27 | var ymax = _ymax 28 | 29 | bits match { 30 | case 0 => 31 | xmin = 2*xmin 32 | ymin = 2*ymin 33 | xmax = 2*xmax 34 | ymax = 2*ymax 35 | case 1 => 36 | xmin = 2*(xmin - 0.5) 37 | ymin = 2*ymin 38 | xmax = 2*(xmax - 0.5) 39 | ymax = 2*ymax 40 | case 2 => 41 | xmin = 2*xmin 42 | ymin = 2*(ymin - 0.5) 43 | xmax = 2*xmax 44 | ymax = 2*(ymax - 0.5) 45 | case 3 => 46 | xmin = 2*(xmin - 0.5) 47 | ymin = 2*(ymin - 0.5) 48 | xmax = 2*(xmax - 0.5) 49 | ymax = 2*(ymax - 0.5) 50 | } 51 | 52 | (xmin, ymin, xmax, ymax) 53 | } 54 | 55 | private def getBits(xmin: Double, ymin: Double, xmax: Double, ymax:Double): Option[Int] = { 56 | val minBits = ((xmin > 0.5),(ymin > 0.5)) match { 57 | case (false, false) => 0 58 | case (true, false) => 1 59 | case (false, true) => 2 60 | case (true, true) => 3 61 | } 62 | val maxBits = ((xmax > 0.5),(ymax > 0.5)) match { 63 | case (false, false) => 0 64 | case (true, false) => 1 65 | case (false, true) => 2 66 | case (true, true) => 3 67 | } 68 | 69 | if (minBits == maxBits) Some(minBits); else None 70 | } 71 | 72 | private def getBox(g: Geometry): (Double, Double, Double, Double) = { 73 | val e: Extent = g.envelope 74 | ((e.xmin+180)/360, (e.ymin+90)/180, (e.xmax+180)/360, (e.ymax+90)/180) 75 | } 76 | 77 | def getAddress(g: Geometry): Long = { 78 | var box: (Double, Double, Double, Double) = getBox(g) 79 | var address: Long = 0 80 | var bits: Option[Int] = getBits(box._1, box._2, box._3, box._4) 81 | var division = 0 82 | 83 | while (bits != None && division <= maxDivisions) { 84 | if (divisionSet.contains(division)) 85 | address = (address << 2) | bits.get 86 | box = step(bits.get, box._1, box._2, box._3, box._4) 87 | bits = getBits(box._1, box._2, box._3, box._4) 88 | division = division + 1 89 | } 90 | 91 | address 92 | } 93 | 94 | def numPartitions: Int 
= partitions 95 | 96 | def getPartition(key: Any): Int = { 97 | key match { 98 | case f: Feature[Geometry, Any] => 99 | (getAddress(f.geom) % partitions).toInt 100 | case g: Geometry => 101 | (getAddress(g) % partitions).toInt 102 | case _ => 103 | throw new Exception 104 | } 105 | } 106 | 107 | override def equals(other: Any): Boolean = false 108 | 109 | override def hashCode: Int = numPartitions 110 | } 111 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/VertexMatching.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | import geotrellis.vector.io._ 5 | 6 | import com.vividsolutions.jts.algorithm.{Centroid, CGAlgorithms} 7 | import com.vividsolutions.jts.geom.Coordinate 8 | 9 | 10 | object VertexMatching { 11 | 12 | private def matcher( 13 | points1: Array[Point], points2: Array[Point], 14 | offsetx: Double, offsety: Double, 15 | list: List[(Point, Point)] = List.empty[(Point, Point)] 16 | ): List[(Point, Point)] = { 17 | if (points1.isEmpty || points2.isEmpty) list 18 | else { 19 | val (_, i) = argmin(points1.head, points2, offsetx, offsety) 20 | matcher( 21 | points1.drop(1), points2.drop(i+1), 22 | offsetx, offsety, 23 | list ++ List((points1.head, points2(i))) 24 | ) 25 | } 26 | } 27 | 28 | private def argmin( 29 | p: Point, ps: Array[Point], 30 | offsetx: Double, offsety: Double 31 | ): (Double, Int) = { 32 | ps 33 | .map({ p2 => 34 | val temp = Point(p2.x - offsetx, p2.y - offsety) 35 | temp.distance(p) 36 | }) 37 | .zipWithIndex 38 | .reduce({ (pair1, pair2) => 39 | if (pair1._1 <= pair2._1) pair1 40 | else pair2 41 | }) 42 | } 43 | 44 | private def polygonToPolygon(_p1: Polygon, _p2: Polygon, relative: Boolean) = { 45 | val (p1, p2) = 46 | if (_p1.vertices.length < _p2.vertices.length) (_p1, _p2) 47 | else (_p2, _p1) 48 | 49 | val (centroidx, centroidy) = { 50 | val centroid = Centroid.getCentroid(p1.jtsGeom) 51 | (centroid.x, centroid.y) 52 | } 53 | 54 | val (offsetx: Double, offsety: Double) = 55 | if (relative) { 56 | val centroid = Centroid.getCentroid(p2.jtsGeom) 57 | (centroid.x - centroidx, centroid.y - centroidy) 58 | } 59 | else (0.0, 0.0) 60 | 61 | val points1 = { 62 | val pts = p1.jtsGeom.getCoordinates 63 | if (CGAlgorithms.isCCW(pts)) pts 64 | else pts.reverse 65 | }.drop(1).map({ p => Point(p.x, p.y) }) 66 | 67 | val points2 = { 68 | val points = { 69 | val pts = p2.jtsGeom.getCoordinates 70 | if (CGAlgorithms.isCCW(pts)) pts 71 | else pts.reverse 72 | }.drop(1).map({ p => Point(p.x, p.y) }) 73 | val (_, i) = argmin(points1.head, points, offsetx, offsety) 74 | points.drop(i) ++ points.take(i) 75 | } 76 | 77 | val pairs = matcher(points1, points2, offsetx, offsety) 78 | 79 | Homography.dlt( 80 | if (pairs.length >= 4) pairs; else points1.zip(points2).take(4).toList, 81 | centroidx, centroidy 82 | ) 83 | } 84 | 85 | def score(p1: Polygon, p2: Polygon): Double = { 86 | val h1 = polygonToPolygon(p1, p2, false).toArray 87 | val Δ1 = math.abs(h1(0)-1.0) + math.abs(h1(1)) + math.abs(h1(2)) + math.abs(h1(3)) + math.abs(h1(4)-1.0) + math.abs(h1(5)) 88 | 89 | val h2 = polygonToPolygon(p1, p2, true).toArray 90 | val Δ2 = math.abs(h2(0)-1.0) + math.abs(h2(1)) + math.abs(h2(2)) + math.abs(h2(3)) + math.abs(h2(4)-1.0) + math.abs(h2(5)) 91 | 92 | math.min(Δ1, Δ2) 93 | } 94 | 95 | def main(args: Array[String]): Unit = { 96 | val polygon1 = 97 | if (args(0).endsWith(".geojson")) 98 | 
scala.io.Source.fromFile(args(0)).mkString.parseGeoJson[Polygon] 99 | else 100 | args(0).parseGeoJson[Polygon] 101 | 102 | val polygon2 = 103 | if (args(1).endsWith(".geojson")) 104 | scala.io.Source.fromFile(args(1)).mkString.parseGeoJson[Polygon] 105 | else 106 | args(1).parseGeoJson[Polygon] 107 | 108 | println(polygon1.distance(polygon2)) 109 | println(Centroid.getCentroid(polygon1.jtsGeom).distance(Centroid.getCentroid(polygon2.jtsGeom))) 110 | println(polygonToPolygon(polygon1, polygon2, false)) 111 | println(polygonToPolygon(polygon1, polygon2, true)) 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/VertexProjection.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | import geotrellis.vector.io._ 5 | import geotrellis.vector.io.json.JsonFeatureCollection 6 | 7 | import com.vividsolutions.jts.algorithm.Centroid 8 | 9 | 10 | object VertexProjection { 11 | 12 | private def pointToPolygon(p: Point, offsetx: Double, offsety: Double, right: Polygon): Point = { 13 | 14 | val point = 15 | right.vertices 16 | .map({ p => Point(p.x - offsetx, p.y - offsety) }) 17 | .sliding(2) 18 | .map({ case Array(a, b) => 19 | val px: Double = p.x - a.x 20 | val py: Double = p.y - a.y 21 | val vx: Double = b.x - a.x 22 | val vy: Double = b.y - a.y 23 | val absv: Double = math.sqrt(vx*vx + vy*vy) 24 | val t = px*vx/absv + py*vy/absv 25 | 26 | val c = 27 | if (t <= 0.0) a 28 | else if (t >= 1.0) b 29 | else Point(a.x*t + b.x*(1.0-t), a.y*t + b.y*(1.0-t)) 30 | 31 | (p.distance(c), c) 32 | }) 33 | .reduce({ (t1: (Double, Point), t2: (Double, Point)) => 34 | if (t1._1 <= t2._1) t1 35 | else t2 36 | })._2 37 | 38 | Point(point.x + offsetx, point.y + offsety) 39 | } 40 | 41 | private def polygonToPolygon(left: Polygon, right: Polygon, relative: Boolean) = { 42 | val (centroidx, centroidy) = { 43 | val centroid = Centroid.getCentroid(left.jtsGeom) 44 | (centroid.x, centroid.y) 45 | } 46 | 47 | val (offsetx: Double, offsety: Double) = { 48 | if (relative) { 49 | val centroid = Centroid.getCentroid(right.jtsGeom) 50 | (centroid.x - centroidx, centroid.y - centroidy) 51 | } 52 | else (0.0, 0.0) 53 | } 54 | 55 | val xs1 = left.vertices 56 | val xs2 = xs1.map({ point => pointToPolygon(point, offsetx, offsety, right) }) 57 | 58 | Homography.dlt(xs1.zip(xs2), centroidx, centroidy) 59 | } 60 | 61 | private def geometryToGeometry(left: Geometry, right: Geometry, relative: Boolean) = { 62 | val polygon1 = left match { 63 | case p: Polygon => p 64 | case mp: MultiPolygon => 65 | mp.polygons.reduce({ (p1, p2) => if (p1.vertices.length > p2.vertices.length) p1; else p2 }) 66 | } 67 | val polygon2 = right match { 68 | case p: Polygon => p 69 | case mp: MultiPolygon => 70 | mp.polygons.reduce({ (p1, p2) => if (p1.vertices.length > p2.vertices.length) p1; else p2 }) 71 | } 72 | 73 | polygonToPolygon(polygon1, polygon2, relative) 74 | } 75 | 76 | def score(p1: Polygon, p2: Polygon): Double = { 77 | val h1 = polygonToPolygon(p1, p2, false).toArray 78 | val Δ1 = math.abs(h1(0)-1.0) + math.abs(h1(1)) + math.abs(h1(2)) + math.abs(h1(3)) + math.abs(h1(4)-1.0) + math.abs(h1(5)) 79 | 80 | val h2 = polygonToPolygon(p2, p1, false).toArray 81 | val Δ2 = math.abs(h2(0)-1.0) + math.abs(h2(1)) + math.abs(h2(2)) + math.abs(h2(3)) + math.abs(h2(4)-1.0) + math.abs(h2(5)) 82 | 83 | val h3 = polygonToPolygon(p1, p2, true).toArray 84 | val Δ3 = 
math.abs(h3(0)-1.0) + math.abs(h3(1)) + math.abs(h3(2)) + math.abs(h3(3)) + math.abs(h3(4)-1.0) + math.abs(h3(5)) 85 | 86 | val h4 = polygonToPolygon(p2, p1, true).toArray 87 | val Δ4 = math.abs(h4(0)-1.0) + math.abs(h4(1)) + math.abs(h4(2)) + math.abs(h4(3)) + math.abs(h4(4)-1.0) + math.abs(h4(5)) 88 | 89 | math.min(Δ1, math.min(Δ2, math.min(Δ3, Δ4))) 90 | } 91 | 92 | def main(args: Array[String]): Unit = { 93 | val polygon1 = 94 | if (args(0).endsWith(".geojson")) 95 | scala.io.Source.fromFile(args(0)).mkString.parseGeoJson[Geometry] 96 | else 97 | args(0).parseGeoJson[Geometry] 98 | 99 | val polygon2 = 100 | if (args(1).endsWith(".geojson")) 101 | scala.io.Source.fromFile(args(1)).mkString.parseGeoJson[Geometry] 102 | else 103 | args(1).parseGeoJson[Geometry] 104 | 105 | println(geometryToGeometry(polygon1, polygon2, false)) 106 | println(geometryToGeometry(polygon2, polygon1, false)) 107 | println(geometryToGeometry(polygon1, polygon2, true)) 108 | println(geometryToGeometry(polygon2, polygon1, true)) 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/bm/src/main/scala/osmesa/bm/VolumeMatching.scala: -------------------------------------------------------------------------------- 1 | package osmesa.bm 2 | 3 | import geotrellis.vector._ 4 | import geotrellis.vector.io._ 5 | 6 | 7 | object VolumeMatching { 8 | 9 | def data(p1: Polygon, p2: Polygon): (Double, Double) = { 10 | val a1 = p1.jtsGeom.getArea 11 | val a2 = p2.jtsGeom.getArea 12 | val a3 = p1.jtsGeom.intersection(p2.jtsGeom).getArea 13 | (a3/a1, a3/a2) 14 | } 15 | 16 | def min(p1: Polygon, p2: Polygon): Double = { 17 | val (a1, a2) = data(p1, p2) 18 | math.min(a1, a2) 19 | } 20 | 21 | def max(p1: Polygon, p2: Polygon): Double = { 22 | val (a1, a2) = data(p1, p2) 23 | math.max(a1, a2) 24 | } 25 | 26 | def main(args: Array[String]): Unit = { 27 | val polygon1 = 28 | if (args(0).endsWith(".geojson")) 29 | scala.io.Source.fromFile(args(0)).mkString.parseGeoJson[Polygon] 30 | else 31 | args(0).parseGeoJson[Polygon] 32 | 33 | val polygon2 = 34 | if (args(1).endsWith(".geojson")) 35 | scala.io.Source.fromFile(args(1)).mkString.parseGeoJson[Polygon] 36 | else 37 | args(1).parseGeoJson[Polygon] 38 | 39 | println(polygon1 == polygon2) 40 | println(data(polygon1, polygon2)) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/bm/view/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | BM 6 | 7 | 8 | 9 | 13 | 14 | 15 | 16 |
17 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /src/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | lazy val commonSettings = Seq( 4 | organization := "com.azavea", 5 | version := Version.osmesa, 6 | cancelable in Global := true, 7 | scalaVersion in ThisBuild := Version.scala, 8 | scalacOptions := Seq( 9 | "-deprecation", 10 | "-unchecked", 11 | "-feature", 12 | "-language:implicitConversions", 13 | "-language:reflectiveCalls", 14 | "-language:higherKinds", 15 | "-language:postfixOps", 16 | "-language:existentials", 17 | "-language:experimental.macros", 18 | "-feature", 19 | "-Ypartial-unification", 20 | "-Ypatmat-exhaust-depth", "100" 21 | ), 22 | 23 | // resolvers ++= Seq( 24 | // "locationtech-releases" at "https://repo.locationtech.org/content/repositories/releases/", 25 | // "locationtech-snapshots" at "https://repo.locationtech.org/content/repositories/snapshots/", 26 | // "geosolutions" at "http://maven.geo-solutions.it/", 27 | // "osgeo-releases" at "https://repo.osgeo.org/repository/release/", 28 | // "apache.commons.io" at "https://mvnrepository.com/artifact/commons-io/commons-io" 29 | // ), 30 | externalResolvers := Settings.Repositories.all, 31 | 32 | updateOptions := updateOptions.value.withGigahorse(false), 33 | shellPrompt := { s => Project.extract(s).currentProject.id + " > " }, 34 | assemblyMergeStrategy in assembly := { 35 | case "reference.conf" | "application.conf" => MergeStrategy.concat 36 | case PathList("META-INF", xs@_*) => 37 | xs match { 38 | case ("MANIFEST.MF" :: Nil) => MergeStrategy.discard 39 | // Concatenate everything in the services directory to keep GeoTools happy. 40 | case ("services" :: _ :: Nil) => 41 | MergeStrategy.concat 42 | // Concatenate these to keep JAI happy. 43 | case ("javax.media.jai.registryFile.jai" :: Nil) | ("registryFile.jai" :: Nil) | ("registryFile.jaiext" :: Nil) => 44 | MergeStrategy.concat 45 | case (name :: Nil) => { 46 | // Must exclude META-INF/*.([RD]SA|SF) to avoid "Invalid signature file digest for Manifest main attributes" exception. 
47 | if (name.endsWith(".RSA") || name.endsWith(".DSA") || name.endsWith(".SF")) 48 | MergeStrategy.discard 49 | else 50 | MergeStrategy.first 51 | } 52 | case _ => MergeStrategy.first 53 | } 54 | case _ => MergeStrategy.first 55 | } 56 | ) 57 | 58 | /* Allow `run` to be used with Spark code, while assembling fat JARs w/o Spark bundled */ 59 | // run in Compile := Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)).evaluated 60 | // runMain in Compile := Defaults.runMainTask(fullClasspath in Compile, runner in(Compile, run)).evaluated 61 | 62 | lazy val root = Project("osmesa", file(".")) 63 | .aggregate( 64 | analytics, 65 | apps, 66 | bm 67 | ).settings(commonSettings: _*) 68 | 69 | lazy val analytics = 70 | project 71 | .settings(commonSettings: _*) 72 | 73 | lazy val apps = 74 | project 75 | .dependsOn(analytics) 76 | .settings(commonSettings: _*) 77 | 78 | lazy val bm = 79 | project 80 | .settings(commonSettings: _*) 81 | 82 | /* Run with 83 | jmh:run -t 1 -f 1 -wi 5 -i 5 .*Bench.* 84 | */ 85 | // lazy val bench = 86 | // project.in(file("bench")) 87 | // .settings(commonSettings) 88 | // .dependsOn(analytics) 89 | // .enablePlugins(JmhPlugin) 90 | -------------------------------------------------------------------------------- /src/docker/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=WARN, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.out 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | # log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c: %m%n 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 7 | log4j.logger.osmesa=DEBUG 8 | log4j.logger.vectorpipe=DEBUG -------------------------------------------------------------------------------- /src/docker/refresh-views.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "$(date -Iseconds): Starting view refreshment in $DATABASE_NAME" 4 | 5 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently user_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 6 | echo "$(date -Iseconds): Refreshing user statistics" 7 | # refresh in the background to return immediately 8 | psql -Aqt \ 9 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics" \ 10 | -c "UPDATE refreshments SET updated_at=now() where mat_view='user_statistics'" \ 11 | $DATABASE_URL & 12 | else 13 | echo "$(date -Iseconds): User stats table already refreshing" 14 | fi 15 | 16 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently hashtag_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 17 | echo "$(date -Iseconds): Refreshing hashtag statistics" 18 | # refresh in the background to return immediately 19 | psql -Aqt \ 20 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY hashtag_statistics" \ 21 | -c "UPDATE refreshments SET updated_at=now() where mat_view='hashtag_statistics'" \ 22 | $DATABASE_URL & 23 | else 24 | echo "$(date -Iseconds): Hashtag stats table already refreshing" 25 | fi 26 | 27 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently 
country_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 28 | # refresh in the background to return immediately 29 | echo "$(date -Iseconds): Refreshing country statistics" 30 | psql -Aqt \ 31 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY country_statistics" \ 32 | -c "UPDATE refreshments SET updated_at=now() where mat_view='country_statistics'" \ 33 | $DATABASE_URL & 34 | else 35 | echo "$(date -Iseconds): Country stats table already refreshing" 36 | fi 37 | 38 | if [ "$(psql -Aqtc "select count(pid) from pg_stat_activity where query ilike 'refresh materialized view concurrently hashtag_user_statistics%' and state='active' and datname='$DATABASE_NAME'" $DATABASE_URL 2> /dev/null)" == "0" ]; then 39 | # refresh in the background to return immediately 40 | echo "$(date -Iseconds): Refreshing hashtag/user statistics" 41 | psql -Aqt \ 42 | -c "REFRESH MATERIALIZED VIEW CONCURRENTLY hashtag_user_statistics" \ 43 | -c "UPDATE refreshments SET updated_at=now() where mat_view='hashtag_user_statistics'" \ 44 | $DATABASE_URL & 45 | else 46 | echo "$(date -Iseconds): Hashtag/user stats table already refreshing" 47 | fi 48 | 49 | wait 50 | echo "$(date -Iseconds): Completed" 51 | -------------------------------------------------------------------------------- /src/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | val decline = "com.monovore" %% "decline" % Version.decline 5 | val sparkHive = "org.apache.spark" %% "spark-hive" % Version.spark 6 | val sparkStreaming = "org.apache.spark" %% "spark-streaming" % Version.spark 7 | val sparkJts = "org.locationtech.geomesa" %% "geomesa-spark-jts" % Version.geomesa 8 | val gtGeomesa = "org.locationtech.geotrellis" %% "geotrellis-geomesa" % Version.geotrellis 9 | val gtGeotools = "org.locationtech.geotrellis" %% "geotrellis-geotools" % Version.geotrellis 10 | val gtS3 = "org.locationtech.geotrellis" %% "geotrellis-s3" % Version.geotrellis 11 | val gtSpark = "org.locationtech.geotrellis" %% "geotrellis-spark" % Version.geotrellis 12 | val gtSparkTestKit = "org.locationtech.geotrellis" %% "geotrellis-spark-testkit" % Version.geotrellis % "test" 13 | val gtVector = "org.locationtech.geotrellis" %% "geotrellis-vector" % Version.geotrellis 14 | val gtShapefile = "org.locationtech.geotrellis" %% "geotrellis-shapefile" % Version.geotrellis 15 | val gtVectorTile = "org.locationtech.geotrellis" %% "geotrellis-vectortile" % Version.geotrellis 16 | val vectorpipe = "com.azavea.geotrellis" %% "vectorpipe" % Version.vectorpipe 17 | val cats = "org.typelevel" %% "cats-core" % Version.cats 18 | val scalactic = "org.scalactic" %% "scalactic" % Version.scalactic 19 | val scalatest = "org.scalatest" %% "scalatest" % Version.scalatest % "test" 20 | //val jaiCore = "javax.media" % "jai_core" % Version.jai % "test" from s"http://download.osgeo.org/webdav/geotools/javax/media/jai_core/${Version.jai}/jai_core-${Version.jai}.jar" 21 | val apacheCommonsEmail = "org.apache.commons" % "commons-email" % Version.apacheCommonsEmail 22 | val hbaseCommon = "org.apache.hbase" % "hbase-common" % Version.hbase 23 | val hbaseClient = "org.apache.hbase" % "hbase-client" % Version.hbase 24 | val hbaseServer = "org.apache.hbase" % "hbase-server" % Version.hbase 25 | val geomesaHbaseDatastore = "org.locationtech.geomesa" % "geomesa-hbase-datastore_2.11" % Version.geomesa 26 | val kryo = "com.esotericsoftware" % "kryo-shaded" % 
Version.kryo 27 | val snakeyaml = "org.yaml" % "snakeyaml" % Version.snakeyaml 28 | val circeCore = "io.circe" %% "circe-core" % Version.circe 29 | val circeGeneric = "io.circe" %% "circe-generic" % Version.circe 30 | val circeExtras = "io.circe" %% "circe-generic-extras" % Version.circe 31 | val circeParser = "io.circe" %% "circe-parser" % Version.circe 32 | val circeOptics = "io.circe" %% "circe-optics" % Version.circe 33 | val circeJava8 = "io.circe" %% "circe-java8" % Version.circe 34 | val circeYaml = "io.circe" %% "circe-yaml" % Version.circeYaml 35 | val logging = "com.typesafe.scala-logging" %% "scala-logging" % Version.scalaLogging 36 | val log4j2 = "org.apache.logging.log4j" % "log4j-1.2-api" % "2.17.1" 37 | val commonsIO = "commons-io" % "commons-io" % Version.commonsIO 38 | val postgresql = "org.postgresql" % "postgresql" % Version.postgresql 39 | } 40 | -------------------------------------------------------------------------------- /src/project/Settings.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Settings { 4 | object Repositories { 5 | val apacheCommons = "apache.commons.io" at "https://mvnrepository.com/artifact/commons-io/commons-io" 6 | val eclipseReleases = "eclipse-releases" at "https://repo.eclipse.org/content/groups/releases" 7 | val osgeoReleases = "osgeo-releases" at "https://repo.osgeo.org/repository/release/" 8 | val geosolutions = "geosolutions" at "https://maven.geo-solutions.it/" 9 | val ltReleases = "locationtech-releases" at "https://repo.locationtech.org/content/repositories/releases/" 10 | val ltSnapshots = "locationtech-snapshots" at "https://repo.locationtech.org/content/repositories/snapshots/" 11 | val ivy2Local = Resolver.file("local", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns) 12 | val mavenLocal = Resolver.mavenLocal 13 | val maven = DefaultMavenRepository 14 | val local = Seq(ivy2Local, mavenLocal) 15 | val external = Seq(osgeoReleases, maven, apacheCommons, eclipseReleases, geosolutions, ltReleases, ltSnapshots) 16 | val all = external ++ local 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/project/Version.scala: -------------------------------------------------------------------------------- 1 | object Version { 2 | val scala = "2.11.12" 3 | val osmesa = "0.2.0" 4 | val geotrellis = "3.5.1" 5 | val geomesa = "2.3.2" 6 | val vectorpipe = "2.2.0" 7 | val decline = "1.0.0" 8 | val cats = "1.6.1" 9 | val scalactic = "3.0.3" 10 | val scalatest = "3.0.3" 11 | val spark = "2.4.4" 12 | val kryo = "4.0.0" 13 | val snakeyaml = "1.25" 14 | val circe = "0.11.1" 15 | val circeYaml = "0.10.1" // not in sync with circe core 16 | val scalaLogging = "3.5.0" 17 | val commonsIO = "2.5" 18 | val osmosis = "0.46" 19 | val apacheCommonsEmail = "1.5" 20 | val hbase = "2.2.0" 21 | val jai = "1.1.3" 22 | val postgresql = "42.2.9" 23 | } 24 | -------------------------------------------------------------------------------- /src/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 2 | -------------------------------------------------------------------------------- /src/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.3.0 2 | -------------------------------------------------------------------------------- 
/src/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 2 | 3 | addCompilerPlugin("org.scalamacros" % "paradise" % "2.1.0" cross CrossVersion.full) 4 | 5 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.2.27") 6 | --------------------------------------------------------------------------------
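A minimal usage sketch for the osmesa.apps.streaming.UserFootprintUpdater entry point shown earlier in this dump, assuming the apps assembly has been built with `sbt "project apps" assembly`. The class name, option names, and the change-source default are taken from the decline options in UserFootprintUpdater.scala; the jar path, S3 prefix, sequence numbers, and tuning values below are placeholder assumptions rather than values from the repository.

    # All values except the class name and option names are placeholders (assumed for illustration)
    spark-submit \
      --class osmesa.apps.streaming.UserFootprintUpdater \
      apps/target/scala-2.11/osmesa-apps.jar \
      --change-source https://planet.osm.org/replication/minute/ \
      --start-sequence 4000000 \
      --end-sequence 4000010 \
      --partition-count 32 \
      --tile-source s3://example-bucket/footprints/users/ \
      --concurrent-uploads 8

Per the option defaults in the source, omitting --start-sequence and --end-sequence falls back to the current remote sequence, and omitting --tile-source writes vector tiles relative to the working directory.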