├── dags
│   ├── utils
│   │   ├── __init__.py
│   │   ├── bq_utils.py
│   │   ├── gcs_utils.py
│   │   └── metadata_manager.py
│   ├── sql
│   │   ├── create_features_part_format.sql
│   │   ├── join_relations_geometries.sql
│   │   ├── join_nodes_geometries.sql
│   │   └── join_ways_geometries.sql
│   ├── schemas
│   │   ├── simple_table_schema.json
│   │   ├── features_table_schema.json
│   │   ├── nodes_table_schema.json
│   │   ├── ways_table_schema.json
│   │   └── relations_table_schema.json
│   └── transfer_src_file.py
├── deployment
│   ├── config
│   │   ├── .gitignore
│   │   ├── set_env_vars.sh
│   │   ├── set_env_vars_from_config.sh
│   │   └── generate_config.py
│   ├── delete_dag.sh
│   ├── create_composer_env.sh
│   ├── upload_dags_files.sh
│   └── create_full.sh
├── examples
│   └── clustering
│       ├── .gitignore
│       ├── requirements.txt
│       ├── colors
│       │   ├── README.md
│       │   └── vectorize.py
│       ├── bq_udf
│       │   ├── geohash.sql
│       │   ├── metrics.js
│       │   ├── metrics.sql
│       │   └── geohash.js
│       ├── words
│       │   ├── w2v_generate_schema.py
│       │   ├── w2v_to_jsonl.py
│       │   └── README.md
│       ├── tf_idf
│       │   ├── create_model.py
│       │   ├── materialize.sql
│       │   ├── vectorize.sql
│       │   └── analyze.sql
│       └── cities
│           ├── query.py
│           ├── README.md
│           └── cities.csv
├── tasks_docker_images
│   ├── generate_layers
│   │   ├── keys
│   │   │   └── .gitignore
│   │   ├── README.md
│   │   ├── Dockerfile
│   │   └── src
│   │       ├── layered_gis
│   │       │   ├── 00_generate_queries.sh
│   │       │   ├── building
│   │       │   │   └── building.sh
│   │       │   ├── aeroway
│   │       │   │   └── aeroway.sh
│   │       │   ├── poi_money
│   │       │   │   └── poi_money.sh
│   │       │   ├── cycle_route_segment
│   │       │   │   └── cycle_route_segment.sh
│   │       │   ├── powerline
│   │       │   │   └── powerline.sh
│   │       │   ├── waterway
│   │       │   │   └── waterways.sh
│   │       │   ├── poi_health
│   │       │   │   └── poi_health.sh
│   │       │   ├── route
│   │       │   │   └── route.sh
│   │       │   ├── poi_catering
│   │       │   │   └── poi_catering.sh
│   │       │   ├── traffic_waterway
│   │       │   │   └── traffic_waterway.sh
│   │       │   ├── barrier
│   │       │   │   └── barrier.sh
│   │       │   ├── query_templates_history.sh
│   │       │   ├── 01_append_table.sh
│   │       │   ├── poi_accommodation
│   │       │   │   └── poi_accommodation.sh
│   │       │   ├── natural
│   │       │   │   └── natural.sh
│   │       │   ├── poi_destination
│   │       │   │   └── poi_destination.sh
│   │       │   ├── boundary
│   │       │   │   └── boundary.sh
│   │       │   ├── query_templates_planet.sh
│   │       │   ├── poi_leisure
│   │       │   │   └── poi_leisure.sh
│   │       │   ├── traffic_calming
│   │       │   │   └── traffic_calming.sh
│   │       │   ├── land_use
│   │       │   │   └── land_use.sh
│   │       │   ├── traffic
│   │       │   │   └── traffic.sh
│   │       │   ├── transport
│   │       │   │   └── transport.sh
│   │       │   ├── pofw
│   │       │   │   └── pofw.sh
│   │       │   ├── traffic_barrier
│   │       │   │   └── traffic_barrier.sh
│   │       │   ├── poi_public
│   │       │   │   └── poi_public.sh
│   │       │   ├── poi_shopping
│   │       │   │   └── poi_shopping.sh
│   │       │   ├── power
│   │       │   │   └── power.sh
│   │       │   ├── poi_tourism
│   │       │   │   └── poi_tourism.sh
│   │       │   ├── poi_miscpoi
│   │       │   │   └── poi_miscpoi.sh
│   │       │   └── place
│   │       │       └── place.sh
│   │       ├── run.sh
│   │       └── schema
│   │           └── layers_schema.json
│   ├── osm_to_features
│   │   ├── keys
│   │   │   └── .gitignore
│   │   ├── src
│   │   │   ├── download_osm.sh
│   │   │   ├── csv_to_json
│   │   │   │   ├── csv-to-json.sh
│   │   │   │   └── geojson-csv-to-json.pl
│   │   │   ├── osm_to_features.sh
│   │   │   ├── osmconf.ini
│   │   │   └── osm2geojsoncsv
│   │   ├── Dockerfile
│   │   └── utils
│   │       └── get_client_id.py
│   ├── osm_converter_with_history_index
│   │   ├── src
│   │   │   ├── gdal
│   │   │   │   ├── __init__.py
│   │   │   │   ├── run_ogr.sh
│   │   │   │   ├── osmconf.ini
│   │   │   │   └── gdal_handler.py
│   │   │   ├── file_service.py
│   │   │   ├── gcs_service.py
│   │   │   ├── elements_transformer.py
│   │   │   ├── parser.py
│   │   │   ├── osm_index.py
│   │   │   ├── elements_processing.py
│   │   │   └── cache_manager.py
│   │   ├── keys
│   │   │   └── .gitignore
│   │   └── Dockerfile
│   └── osm_to_nodes_ways_relations
│       ├── keys
│       │   └── .gitignore
│       ├── Dockerfile
│       └── src
│           ├── osm_dtos.py
│           └── pbf_parser.py
├── .gitignore
├── docs
│   ├── graph.png
│   └── OSM_Planet_file_processing.png
├── triggering
│   └── trigger_osm_to_big_query_dg_gcf
│       ├── requirements.txt
│       └── main.py
├── .gcloudignore
└── utils
    └── get_client_id.py
/dags/utils/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/deployment/config/.gitignore: -------------------------------------------------------------------------------- 1 | config* --------------------------------------------------------------------------------
/examples/clustering/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | !data/.gitkeep --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/keys/.gitignore: -------------------------------------------------------------------------------- 1 | gcloud_keys.json --------------------------------------------------------------------------------
/tasks_docker_images/osm_to_features/keys/.gitignore: -------------------------------------------------------------------------------- 1 | gcloud_keys.json --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | venv/ 3 | test_data/ 4 | deployment/create_full_* --------------------------------------------------------------------------------
/tasks_docker_images/osm_converter_with_history_index/src/gdal/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/README.md: -------------------------------------------------------------------------------- 1 | # generate layers --------------------------------------------------------------------------------
/tasks_docker_images/osm_to_nodes_ways_relations/keys/.gitignore: -------------------------------------------------------------------------------- 1 | gcloud_keys.json --------------------------------------------------------------------------------
/tasks_docker_images/osm_converter_with_history_index/keys/.gitignore: -------------------------------------------------------------------------------- 1 | gcloud_keys.json --------------------------------------------------------------------------------
/docs/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcp-pdp/geo-openstreetmap/HEAD/docs/graph.png --------------------------------------------------------------------------------
/examples/clustering/requirements.txt: -------------------------------------------------------------------------------- 1 | textract 2 | nltk 3 | numpy 4 | scipy 5 | matplotlib 6 | sklearn 7 | --------------------------------------------------------------------------------
/triggering/trigger_osm_to_big_query_dg_gcf/requirements.txt: -------------------------------------------------------------------------------- 1 | requests_toolbelt==0.9.1 2 | google-auth==1.15.0 --------------------------------------------------------------------------------
/docs/OSM_Planet_file_processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcp-pdp/geo-openstreetmap/HEAD/docs/OSM_Planet_file_processing.png --------------------------------------------------------------------------------
/dags/utils/bq_utils.py:
-------------------------------------------------------------------------------- 1 | def union_queries(queries): 2 | union_all_sql = "\nUNION ALL\n" 3 | return union_all_sql.join(queries) 4 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_features/src/download_osm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | OSM_GCS_PATH="$1" 5 | OSM_DEST_PATH="$2" 6 | 7 | gsutil cp ${OSM_GCS_PATH} ${OSM_DEST_PATH} -------------------------------------------------------------------------------- /dags/sql/create_features_part_format.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | '{}' AS feature_type, 3 | osm_id, 4 | osm_way_id, 5 | osm_version, 6 | osm_timestamp, 7 | all_tags, 8 | geometry 9 | FROM 10 | {} -------------------------------------------------------------------------------- /deployment/delete_dag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DAG_NAME="$1" 3 | COMPOSER_ENV_NAME="$2" 4 | REGION_LOCATION="$3" 5 | 6 | gcloud composer environments storage dags delete \ 7 | --environment $COMPOSER_ENV_NAME --location $REGION_LOCATION \ 8 | $DAG_NAME 9 | -------------------------------------------------------------------------------- /deployment/config/set_env_vars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OSM_ENV_VARS_STR="$1" 3 | COMPOSER_ENV_NAME="$2" 4 | REGION_LOCATION="$3" 5 | 6 | gcloud composer environments update $COMPOSER_ENV_NAME \ 7 | --location $REGION_LOCATION \ 8 | --update-env-variables=$OSM_ENV_VARS_STR -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk 2 | 3 | # copy script files 4 | COPY src /generate_layers/src 5 | # set work dir 6 | WORKDIR /generate_layers/src 7 | 8 | RUN ["chmod", "+x", "run.sh"] 9 | 10 | CMD ./run.sh $MODE 11 | 12 | -------------------------------------------------------------------------------- /deployment/create_composer_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | COMPOSER_ENV_NAME="$1" 3 | REGION_LOCATION="$2" 4 | DISK_SIZE="$3" 5 | MACHINE_TYPE="$4" 6 | 7 | gcloud composer environments create $COMPOSER_ENV_NAME \ 8 | --location $REGION_LOCATION \ 9 | --disk-size $DISK_SIZE \ 10 | --machine-type $MACHINE_TYPE 11 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/00_generate_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir "../sql" 4 | FOLDERS="$(find . -mindepth 1 -type d)" 5 | 6 | for FOLDER in $FOLDERS; do 7 | cd $FOLDER || exit 8 | FILE="$(find *.sh)" 9 | echo "running " $FOLDER/$FILE 10 | bash $FILE 11 | cd .. 12 | done -------------------------------------------------------------------------------- /deployment/upload_dags_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DAG_ELEMENT_PATH="$1" 3 | COMPOSER_ENV_NAME="$2" 4 | REGION_LOCATION="$3" 5 | 6 | echo "Uploading $DAG_ELEMENT_PATH ..." 
7 | gcloud composer environments storage dags import \ 8 | --environment $COMPOSER_ENV_NAME --location $REGION_LOCATION \ 9 | --source $DAG_ELEMENT_PATH 10 | --------------------------------------------------------------------------------
/examples/clustering/colors/README.md: -------------------------------------------------------------------------------- 1 | Download document with coloring scheme 2 | ```shell script 3 | wget -P ../data/ https://planning-org-uploaded-media.s3.amazonaws.com/document/LBCS.pdf 4 | ``` 5 | 6 | Convert document sections to vectors 7 | ```shell script 8 | python vectorize.py > colors.jsonl 9 | ``` 10 | 11 | Import colors.jsonl into BigQuery. 12 | --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/building/building.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CODE=1500 5 | CLASS=building 6 | K=building 7 | V=building 8 | N=building 9 | F=building 10 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K')" 11 | common_query > "../../sql/$F.sql" 12 | --------------------------------------------------------------------------------
/tasks_docker_images/osm_to_features/src/csv_to_json/csv-to-json.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | CSV_FILES_PATH="$1" 4 | 5 | for csv_file in $(ls -d ${CSV_FILES_PATH}*.geojson.csv); 6 | do 7 | echo ${csv_file} 8 | cat ${csv_file} \ 9 | | perl csv_to_json/geojson-csv-to-json.pl \ 10 | 2> ${csv_file}.errors.jsonl > ${csv_file}.jsonl 11 | done 12 | --------------------------------------------------------------------------------
/examples/clustering/bq_udf/geohash.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION udfs.decodeGeoHash(geohash STRING) 2 | RETURNS STRUCT<latitude ARRAY<FLOAT64>, longitude ARRAY<FLOAT64>> 3 | LANGUAGE js 4 | OPTIONS ( 5 | library=["gs://gcp-pdp-osm-dev-bq-udf/gis/geohash.js"] 6 | ) 7 | AS 8 | """ 9 | return decodeGeoHash(geohash); 10 | """; 11 | SELECT udfs.decodeGeoHash('0000'); --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODE="$1" 3 | cd layered_gis 4 | 5 | if [ $MODE == "planet" ] 6 | then 7 | cp query_templates_planet.sh query_templates.sh 8 | else 9 | cp query_templates_history.sh query_templates.sh 10 | fi 11 | 12 | echo "running ./00_generate_queries.sh" 13 | ./00_generate_queries.sh 14 | echo "running ./01_append_table.sh" 15 | ./01_append_table.sh $MODE 16 | 17 | --------------------------------------------------------------------------------
/examples/clustering/words/w2v_generate_schema.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | NUM_COLUMNS = 300 4 | # Template for resulting schema word:STRING,f1:FLOAT,f2...
5 | SCHEMA = 'word:STRING,{}' 6 | if __name__ == '__main__': 7 | num_columns = NUM_COLUMNS 8 | if len(sys.argv) > 1: 9 | num_columns = int(sys.argv[1]) 10 | 11 | print(SCHEMA.format(','.join(['f{}:FLOAT'.format(x) for x in range(1, num_columns + 1)]))) 12 | -------------------------------------------------------------------------------- /examples/clustering/words/w2v_to_jsonl.py: -------------------------------------------------------------------------------- 1 | import fileinput 2 | import json 3 | 4 | if __name__ == '__main__': 5 | num_columns = None 6 | for line in fileinput.input(): 7 | columns = line.strip().split(' ') 8 | 9 | if num_columns is None: 10 | num_columns = len(columns) - 1 11 | 12 | result = {'word': columns[0]} 13 | for i in range(num_columns): 14 | result['f{}'.format(i+1)] = columns[i+1] 15 | print(json.dumps(result)) 16 | -------------------------------------------------------------------------------- /dags/sql/join_relations_geometries.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | osmium_table.id, 3 | osmium_table.version, 4 | osmium_table.username, 5 | osmium_table.changeset, 6 | osmium_table.visible, 7 | osmium_table.osm_timestamp, 8 | gdal_table.geometry, 9 | osmium_table.all_tags, 10 | osmium_table.members, 11 | FROM 12 | {}.planet_relations AS osmium_table 13 | LEFT JOIN 14 | {}.planet_features AS gdal_table 15 | ON 16 | osmium_table.id = gdal_table.osm_id AND osmium_table.osm_timestamp = gdal_table.osm_timestamp -------------------------------------------------------------------------------- /examples/clustering/bq_udf/metrics.js: -------------------------------------------------------------------------------- 1 | function euclideanDistances(a, b) { 2 | var sum = 0; 3 | var n; 4 | for (n = 0; n < a.length; n++) { 5 | sum += Math.pow(a[n] - b[n], 2); 6 | } 7 | return Math.sqrt(sum); 8 | } 9 | 10 | function cosineSimilarity(a, b) { 11 | var p = 0; 12 | var p2 = 0; 13 | var q2 = 0; 14 | var n; 15 | for (var n = 0; n < a.length; n++) { 16 | p += a[n] * b[n]; 17 | p2 += a[n] * a[n]; 18 | q2 += b[n] * b[n]; 19 | } 20 | return p / (Math.sqrt(p2) * Math.sqrt(q2)); 21 | } 22 | -------------------------------------------------------------------------------- /dags/sql/join_nodes_geometries.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | osmium_table.id, 3 | osmium_table.version, 4 | osmium_table.username, 5 | osmium_table.changeset, 6 | osmium_table.visible, 7 | osmium_table.osm_timestamp, 8 | gdal_table.geometry, 9 | osmium_table.all_tags, 10 | osmium_table.latitude, 11 | osmium_table.longitude 12 | FROM 13 | {}.planet_nodes AS osmium_table 14 | LEFT JOIN 15 | {}.planet_features AS gdal_table 16 | ON 17 | osmium_table.id = gdal_table.osm_id AND osmium_table.osm_timestamp = gdal_table.osm_timestamp -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_features/src/osm_to_features.sh: -------------------------------------------------------------------------------- 1 | LAYERS="$1" 2 | 3 | SRC_FILE_NAME=$(basename $SRC_OSM_GCS_URI) 4 | LOCAL_FILE_NAME=${DATA_DIR}${SRC_FILE_NAME} 5 | CSV_FILE_PREFIX=feature 6 | JSONL_EXT=.jsonl 7 | 8 | echo 'Source GCS URI: '$SRC_OSM_GCS_URI 9 | gsutil cp $SRC_OSM_GCS_URI $LOCAL_FILE_NAME 10 | echo $SRC_OSM_GCS_URI' copied to '$LOCAL_FILE_NAME 11 | 12 | ./osm2geojsoncsv $LOCAL_FILE_NAME ${DATA_DIR}${CSV_FILE_PREFIX} $LAYERS 13 | ./csv_to_json/csv-to-json.sh ${DATA_DIR} 14 | gsutil 
cp ${DATA_DIR}*${JSONL_EXT} ${FEATURES_DIR_GCS_URI} -------------------------------------------------------------------------------- /dags/sql/join_ways_geometries.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | osmium_table.id, 3 | osmium_table.version, 4 | osmium_table.username, 5 | osmium_table.changeset, 6 | osmium_table.visible, 7 | osmium_table.osm_timestamp, 8 | gdal_table.geometry, 9 | osmium_table.all_tags, 10 | osmium_table.nodes, 11 | FROM 12 | {}.planet_ways AS osmium_table 13 | LEFT JOIN 14 | {}.planet_features AS gdal_table 15 | ON 16 | (osmium_table.id = gdal_table.osm_id OR osmium_table.id = gdal_table.osm_way_id) AND osmium_table.osm_timestamp = gdal_table.osm_timestamp -------------------------------------------------------------------------------- /examples/clustering/tf_idf/create_model.py: -------------------------------------------------------------------------------- 1 | CREATE_MODEL_STATEMENT = """ 2 | CREATE OR REPLACE MODEL 3 | osm_clustering_grid_1km.kmeans_tfidf_clusters_10 4 | TRANSFORM( 5 | {} 6 | ) 7 | OPTIONS(model_type='kmeans', num_clusters=10, max_iterations=50, EARLY_STOP=TRUE, MIN_REL_PROGRESS=0.001) AS 8 | SELECT 9 | tfidf_vec 10 | FROM 11 | osm_clustering_grid_1km.vectors_tfidf 12 | """ 13 | DIMENSIONALITY = 339 14 | create_model = CREATE_MODEL_STATEMENT.format( 15 | ', '.join(['tfidf_vec[OFFSET({})] as f{}'.format(i, i + 1) for i in range(DIMENSIONALITY)])) 16 | print(create_model) 17 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/aeroway/aeroway.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ../query_templates.sh 3 | 4 | CLASS=aeroway 5 | LAYER=( 6 | "6701:aeroway=runway" 7 | "6702:aeroway=taxiway" 8 | ) 9 | 10 | for layer in "${LAYER[@]}" 11 | do 12 | CODE="${layer%%:*}" 13 | KVF="${layer##*:}" 14 | K="${KVF%%=*}" 15 | VF="${KVF##*=}" 16 | V="${VF%%>*}" 17 | F="${VF##*>}" 18 | N="${F%%-*}" 19 | 20 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 21 | common_query > "../../sql/$F.sql" 22 | done 23 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_money/poi_money.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_money 5 | LAYER=( 6 | "2601:amenity=bank" 7 | "2602:amenity=atm" 8 | ) 9 | 10 | 11 | for layer in "${LAYER[@]}" 12 | do 13 | CODE="${layer%%:*}" 14 | KVF="${layer##*:}" 15 | K="${KVF%%=*}" 16 | VF="${KVF##*=}" 17 | V="${VF%%>*}" 18 | F="${VF##*>}" 19 | N="${F%%-*}" 20 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 21 | common_query > "../../sql/$F.sql" 22 | done 23 | -------------------------------------------------------------------------------- /.gcloudignore: -------------------------------------------------------------------------------- 1 | # This file specifies files that are *not* uploaded to Google Cloud Platform 2 | # using gcloud. It follows the same syntax as .gitignore, with the addition of 3 | # "#!include" directives (which insert the entries of the given .gitignore-style 4 | # file at that point). 
5 | # 6 | # For more information, run: 7 | # $ gcloud topic gcloudignore 8 | # 9 | .gcloudignore 10 | # If you would like to upload your .git directory, .gitignore file or files 11 | # from your .gitignore file, remove the corresponding line 12 | # below: 13 | .git 14 | .gitignore 15 | 16 | node_modules 17 | #!include:.gitignore 18 | -------------------------------------------------------------------------------- /dags/schemas/simple_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type": "INTEGER", 4 | "name": "id", 5 | "description": "Object unique ID." 6 | }, 7 | { 8 | "type": "RECORD", 9 | "mode": "REPEATED", 10 | "name": "all_tags", 11 | "description": "Unstructured key=value attributes for this object.", 12 | "fields": [ 13 | { 14 | "type": "STRING", 15 | "name": "key", 16 | "description": "Attribute key." 17 | }, 18 | { 19 | "type": "STRING", 20 | "name": "value", 21 | "description": "Attribute value." 22 | } 23 | ] 24 | } 25 | ] -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/gdal/run_ogr.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | OGRCONFIG="$1" 4 | SRC_FILE="$2" 5 | DEST_FILE="$3" 6 | OGR_TYPE="$4" 7 | 8 | if [ "$OGR_TYPE" = "multipolygons" ] 9 | then 10 | osm_fields="osm_id, osm_way_id" 11 | else 12 | osm_fields="osm_id, NULL as osm_way_id" 13 | fi 14 | ogr2ogr \ 15 | -skipfailures \ 16 | -f GeoJSON \ 17 | $DEST_FILE $SRC_FILE \ 18 | --config OSM_CONFIG_FILE $OGRCONFIG \ 19 | -dialect sqlite \ 20 | -sql "select $osm_fields, AsGeoJSON(geometry) AS geometry, geometry from ${OGR_TYPE} where ST_IsValid(geometry) = 1" \ 21 | --debug on \ 22 | 2> /dev/null -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/cycle_route_segment/cycle_route_segment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=cycle_route_segment 5 | LAYER=( 6 | "9102:rcn=yes>regional_cycle_network" 7 | "9102:lcn=yes>local_cycle_network" 8 | ) 9 | 10 | for layer in "${LAYER[@]}" 11 | do 12 | CODE="${layer%%:*}" 13 | KVF="${layer##*:}" 14 | K="${KVF%%=*}" 15 | VF="${KVF##*=}" 16 | V="${VF%%>*}" 17 | F="${VF##*>}" 18 | N="${F%%-*}" 19 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 20 | common_query > "../../sql/$F.sql" 21 | done 22 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/powerline/powerline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=powerline 5 | LAYER=( 6 | "6600:power=line" 7 | "6601:power=minor_line" 8 | "6611:power=cable" 9 | "6611:power=minor_cable" 10 | ) 11 | 12 | for layer in "${LAYER[@]}" 13 | do 14 | CODE="${layer%%:*}" 15 | KVF="${layer##*:}" 16 | K="${KVF%%=*}" 17 | VF="${KVF##*=}" 18 | V="${VF%%>*}" 19 | F="${VF##*>}" 20 | N="${F%%-*}" 21 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 22 | common_query > "../../sql/$F.sql" 23 | done 24 | -------------------------------------------------------------------------------- 
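Note: the looping layered_gis/*.sh generators (aeroway.sh, poi_money.sh, cycle_route_segment.sh, powerline.sh, and the rest below) all parse their LAYER entries with the same chain of bash parameter expansions. A minimal standalone sketch of that parsing, using a hypothetical sample spec of the form `CODE:key=value>file-name` (when the `>file-name` part is absent, F and N simply fall back to the tag value):

```shell script
#!/bin/bash
# Hypothetical demo spec; real specs live in the LAYER arrays of the scripts.
layer="5501:barrier=wood_fence>fence-wood_fence"

CODE="${layer%%:*}"  # "5501"                                - everything before the first ':'
KVF="${layer##*:}"   # "barrier=wood_fence>fence-wood_fence" - everything after the ':'
K="${KVF%%=*}"       # "barrier"                             - OSM tag key (before '=')
VF="${KVF##*=}"      # "wood_fence>fence-wood_fence"         - everything after the '='
V="${VF%%>*}"        # "wood_fence"                          - tag value (before '>', or all of VF)
F="${VF##*>}"        # "fence-wood_fence"                    - output .sql file name (or V if no '>')
N="${F%%-*}"         # "fence"                               - layer_name (before the first '-')

echo "code=$CODE key=$K value=$V file=$F name=$N"
```

Each script then substitutes K and V into EXTRA_CONSTRAINTS and redirects common_query (defined by whichever query_templates_*.sh run.sh copied into place) into ../../sql/$F.sql.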
/tasks_docker_images/generate_layers/src/layered_gis/waterway/waterways.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=waterway 5 | LAYER=( 6 | "8101:waterway=river" 7 | "8102:waterway=stream" 8 | "8103:waterway=canal" 9 | "8104:waterway=drain" 10 | ) 11 | 12 | for layer in "${LAYER[@]}" 13 | do 14 | CODE="${layer%%:*}" 15 | KVF="${layer##*:}" 16 | K="${KVF%%=*}" 17 | VF="${KVF##*=}" 18 | V="${VF%%>*}" 19 | F="${VF##*>}" 20 | N="${F%%-*}" 21 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 22 | common_query > "../../sql/$F.sql" 23 | done 24 | --------------------------------------------------------------------------------
/deployment/config/set_env_vars_from_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CONFIG_FILE="$1" 3 | COMPOSER_ENV_NAME="$2" 4 | REGION_LOCATION="$3" 5 | 6 | declare -A VARS_ARRAY 7 | while IFS="=" read -r key value 8 | do 9 | VARS_ARRAY[$key]="$value" 10 | done < <(jq -r "to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]" $CONFIG_FILE) 11 | 12 | OSM_ENV_VARS_STR='' 13 | for key in "${!VARS_ARRAY[@]}" 14 | do 15 | OSM_ENV_VARS_STR="${OSM_ENV_VARS_STR}${key^^}=${VARS_ARRAY[$key]}," 16 | done 17 | 18 | OSM_ENV_VARS_STR=${OSM_ENV_VARS_STR::-1} 19 | echo $OSM_ENV_VARS_STR 20 | 21 | gcloud composer environments update $COMPOSER_ENV_NAME \ 22 | --location $REGION_LOCATION \ 23 | --update-env-variables=$OSM_ENV_VARS_STR --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/poi_health/poi_health.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_health 5 | LAYER=( 6 | "2101:amenity=pharmacy" 7 | "2110:amenity=hospital" 8 | "2120:amenity=doctors" 9 | "2121:amenity=dentist" 10 | "2129:amenity=veterinary" 11 | ) 12 | 13 | for layer in "${LAYER[@]}" 14 | do 15 | CODE="${layer%%:*}" 16 | KVF="${layer##*:}" 17 | K="${KVF%%=*}" 18 | VF="${KVF##*=}" 19 | V="${VF%%>*}" 20 | F="${VF##*>}" 21 | N="${F%%-*}" 22 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 23 | common_query > "../../sql/$F.sql" 24 | done 25 | --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/route/route.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=route 5 | LAYER=( 6 | "9001:route=bicycle" 7 | "9002:route=mtb" 8 | "9003:route=hiking" 9 | "9004:route=horse" 10 | "9005:route=nordic_walking" 11 | "9006:route=running" 12 | ) 13 | 14 | for layer in "${LAYER[@]}" 15 | do 16 | CODE="${layer%%:*}" 17 | KVF="${layer##*:}" 18 | K="${KVF%%=*}" 19 | VF="${KVF##*=}" 20 | V="${VF%%>*}" 21 | F="${VF##*>}" 22 | N="${F%%-*}" 23 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 24 | common_query > "../../sql/$F.sql" 25 | done 26 | --------------------------------------------------------------------------------
/examples/clustering/bq_udf/metrics.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION udfs.euclidean_distance(a ARRAY<FLOAT64>, b ARRAY<FLOAT64>) 2 | RETURNS
FLOAT64 3 | LANGUAGE js 4 | OPTIONS ( 5 | library=["gs://gcp-pdp-osm-dev-bq-udf/metrics/metrics.js"] 6 | ) 7 | AS 8 | """ 9 | return euclideanDistances(a, b); 10 | """; 11 | 12 | SELECT udfs.euclidean_distance([0., 0.], [1., 1.]); 13 | 14 | CREATE OR REPLACE FUNCTION udfs.cosine_similarity(a ARRAY<FLOAT64>, b ARRAY<FLOAT64>) 15 | RETURNS FLOAT64 16 | LANGUAGE js 17 | OPTIONS ( 18 | library=["gs://gcp-pdp-osm-dev-bq-udf/metrics/metrics.js"] 19 | ) 20 | AS 21 | """ 22 | return cosineSimilarity(a, b); 23 | """; 24 | 25 | SELECT udfs.cosine_similarity([1., 4.], [1., 1.]); --------------------------------------------------------------------------------
/examples/clustering/words/README.md: -------------------------------------------------------------------------------- 1 | ## Import Glove vectors into BigQuery 2 | Download word2vec (Glove) 3 | ``` 4 | cd ../data 5 | wget http://nlp.stanford.edu/data/glove.6B.zip 6 | unzip ./glove.6B.zip 7 | rm ./glove.6B.zip 8 | ``` 9 | 10 | Convert word2vec to JSONL format 11 | ``` 12 | cat ./data/glove.6B.300d.txt | python3 w2v_to_jsonl.py > ./data/glove.6B.300d.jsonl 13 | ``` 14 | 15 | Upload result to GCS: 16 | ``` 17 | gsutil cp ./data/glove.6B.300d.jsonl gs://gcp-pdp-osm-dev-bq-import/glove/ 18 | ``` 19 | 20 | Import into BQ: 21 | ``` 22 | bq load \ 23 | --source_format=NEWLINE_DELIMITED_JSON \ 24 | gcp-pdp-osm-dev:osm_clustering.w2v_glove_6B_300d \ 25 | gs://gcp-pdp-osm-dev-bq-import/glove/glove.6B.300d.jsonl \ 26 | "$(python3 w2v_generate_schema.py 300)" 27 | ``` --------------------------------------------------------------------------------
/tasks_docker_images/osm_converter_with_history_index/src/file_service.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import os 3 | 4 | 5 | def make_dir_for_file_if_not_exists(filename): 6 | if not os.path.exists(os.path.dirname(filename)): 7 | try: 8 | os.makedirs(os.path.dirname(filename)) 9 | except OSError as exc: # Guard against race condition 10 | if exc.errno != errno.EEXIST: 11 | raise 12 | 13 | 14 | def file_name_from_path(file_path): 15 | if "/" in file_path: 16 | return file_path.split("/")[-1] 17 | else: 18 | return file_path 19 | 20 | 21 | def file_name_without_ext(file_name): 22 | if "."
in file_name: 23 | return file_name.split(".")[0] 24 | else: 25 | return file_name 26 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_catering/poi_catering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_catering 5 | LAYER=( 6 | "2301:amenity=restaurant" 7 | "2302:amenity=fast_food" 8 | "2303:amenity=cafe" 9 | "2304:amenity=pub" 10 | "2305:amenity=bar" 11 | "2306:amenity=food_court" 12 | "2307:amenity=biergarten" 13 | ) 14 | 15 | for layer in "${LAYER[@]}" 16 | do 17 | CODE="${layer%%:*}" 18 | KVF="${layer##*:}" 19 | K="${KVF%%=*}" 20 | VF="${KVF##*=}" 21 | V="${VF%%>*}" 22 | F="${VF##*>}" 23 | N="${F%%-*}" 24 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 25 | common_query > "../../sql/$F.sql" 26 | done 27 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/traffic_waterway/traffic_waterway.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=traffic 5 | LAYER=( 6 | "5301:leisure=slipway" 7 | "5302:leisure=marina" 8 | "5303:man_made=pier" 9 | "5311:waterway=dam" 10 | "5321:waterway=waterfall" 11 | "5331:waterway=lock_gate" 12 | "5332:waterway=weir" 13 | ) 14 | 15 | for layer in "${LAYER[@]}" 16 | do 17 | CODE="${layer%%:*}" 18 | KVF="${layer##*:}" 19 | K="${KVF%%=*}" 20 | VF="${KVF##*=}" 21 | V="${VF%%>*}" 22 | F="${VF##*>}" 23 | N="${F%%-*}" 24 | NAME_PREFIX=waterway_ 25 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 26 | common_query > "../../sql/$F.sql" 27 | done 28 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_features/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM osgeo/gdal 2 | 3 | # update repos 4 | RUN apt-get update -y 5 | 6 | # make installation 7 | RUN apt-get install build-essential -y 8 | 9 | # gsutil installation 10 | RUN curl -sSL https://sdk.cloud.google.com | bash 11 | 12 | # perl installation 13 | RUN cpan JSON 14 | RUN cpan Text::CSV::Encoded 15 | 16 | # set env vars 17 | ENV PATH $PATH:/root/google-cloud-sdk/bin 18 | ENV DATA_DIR /osm_to_features/data/ 19 | 20 | # copy script files 21 | COPY src /osm_to_features/src 22 | # set work dir 23 | WORKDIR /osm_to_features/src 24 | 25 | # set sh files as executable 26 | RUN ["chmod", "+x", "download_osm.sh"] 27 | RUN ["chmod", "+x", "csv_to_json/csv-to-json.sh"] 28 | RUN ["chmod", "+x", "osm_to_features.sh"] 29 | 30 | # run main script 31 | CMD ./osm_to_features.sh $LAYERS -------------------------------------------------------------------------------- /examples/clustering/cities/query.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | FIRST_CITY_SQL = 'SELECT "{}" as city_name, "{}" as city_class, {} as latitude, {} as longitude, {} as radius' 4 | CITY_SQL = 'SELECT "{}", "{}", {}, {}, {}' 5 | 6 | QUERY = """ 7 | WITH cities AS ({}) 8 | SELECT 9 | city_name, 10 | city_class, 11 | ST_GEOGPOINT(longitude, latitude) as center, 12 | radius 13 | FROM cities 14 | """ 15 | 16 | if __name__ == '__main__': 17 | with open('cities.csv', newline='') as 
csv_file: 18 | reader = csv.reader(csv_file) 19 | rows = [row for row in reader] 20 | first_city = rows[1] 21 | cities_tail = rows[2:] 22 | 23 | cities_sql = ' UNION ALL\n'.join([FIRST_CITY_SQL.format(*first_city)] + [CITY_SQL.format(*city) for city in cities_tail]) 24 | print(QUERY.format(cities_sql)) 25 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/barrier/barrier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=barrier 5 | LAYER=( 6 | "5501:barrier=fence>fence-barrier" 7 | "5501:barrier=wood_fence>fence-wood_fence" 8 | "5501:barrier=wire_fence>fence-wire_fence" 9 | "5511:barrier=hedge" 10 | "5512:barrier=tree_row" 11 | "5521:barrier=wall" 12 | "5531:man_made=dyke" 13 | ) 14 | 15 | 16 | 17 | for layer in "${LAYER[@]}" 18 | do 19 | CODE="${layer%%:*}" 20 | KVF="${layer##*:}" 21 | K="${KVF%%=*}" 22 | VF="${KVF##*=}" 23 | V="${VF%%>*}" 24 | F="${VF##*>}" 25 | N="${F%%-*}" 26 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 27 | common_query > "../../sql/$F.sql" 28 | done 29 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_nodes_ways_relations/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk 2 | 3 | # update repos 4 | RUN apt-get update -y 5 | 6 | # install pyosmium dependencies 7 | RUN apt-get install build-essential cmake libboost-dev \ 8 | libexpat1-dev zlib1g-dev libbz2-dev -y 9 | # install python GCS sdk 10 | RUN pip3 install --upgrade google-cloud-storage 11 | 12 | # install pyosmium 13 | RUN pip3 install osmium 14 | # install guppy3 (memory profiler) 15 | RUN pip3 install guppy3 16 | 17 | # set env vars 18 | ENV DATA_DIR /osm_to_nodes_ways_relations/data/ 19 | 20 | # copy script files 21 | COPY src /osm_to_nodes_ways_relations/src 22 | # set work dir 23 | WORKDIR /osm_to_nodes_ways_relations/src 24 | 25 | CMD python3 pbf_parser.py $SRC_OSM_GCS_URI $NODES_WAYS_RELATIONS_DIR_GCS_URI --num_threads $NUM_THREADS 26 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/query_templates_history.sh: -------------------------------------------------------------------------------- 1 | common_query() { 2 | echo " 3 | WITH osm AS ( 4 | SELECT id, null AS way_id, all_tags, osm_timestamp, version, geometry FROM \`${BQ_DATASET_TO_EXPORT}.history_nodes\` 5 | UNION ALL 6 | SELECT id, id AS way_id, all_tags, osm_timestamp, version, geometry FROM \`${BQ_DATASET_TO_EXPORT}.history_ways\` 7 | UNION ALL 8 | SELECT id, null AS way_id, all_tags, osm_timestamp, version, geometry FROM \`${BQ_DATASET_TO_EXPORT}.history_relations\` 9 | ) 10 | SELECT 11 | $CODE AS layer_code, 12 | '$CLASS' AS layer_class, 13 | '$NAME_PREFIX$N' AS layer_name, 14 | osm.id AS osm_id, 15 | osm.way_id AS osm_way_id, 16 | osm.osm_timestamp AS osm_timestamp, 17 | osm.version AS osm_version, 18 | osm.all_tags, 19 | osm.geometry 20 | FROM osm 21 | WHERE osm.id IS NOT NULL 22 | $EXTRA_CONSTRAINTS 23 | " 24 | } 25 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/01_append_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 
PROCESSING_MODE="$1" 3 | 4 | BQ_DATASET_TO_EXPORT_WITH_COLON=$(echo $BQ_DATASET_TO_EXPORT | sed 's/\./:/') 5 | 6 | i=0 7 | mode="" 8 | for SQL in `find ../sql/ -type f -name '*.sql' | sort`; do 9 | echo $SQL 10 | if (($i > 0)); then 11 | mode="--append_table" 12 | else 13 | mode="--replace" 14 | fi 15 | 16 | cmd="cat $SQL | bq query\ 17 | --project_id ${PROJECT_ID}\ 18 | --nouse_legacy_sql\ 19 | $mode\ 20 | --range_partitioning 'layer_code,0,9999,1'\ 21 | --clustering_fields 'layer_code,geometry'\ 22 | --display_name $SQL\ 23 | --destination_table '${BQ_DATASET_TO_EXPORT_WITH_COLON}.${PROCESSING_MODE}_layers'\ 24 | --destination_schema ../schema/layers_schema.json >/dev/null" 25 | 26 | echo "$cmd" 27 | echo "$cmd" | bash 28 | 29 | ((i=i+1)) 30 | done 31 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_accommodation/poi_accommodation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_accommodation 5 | LAYER=( 6 | "2401:tourism=hotel" 7 | "2402:tourism=motel" 8 | "2403:tourism=bed_and_breakfast" 9 | "2404:tourism=guest_house" 10 | "2405:tourism=hostel" 11 | "2406:tourism=chalet" 12 | "2421:amenity=shelter" 13 | "2422:tourism=camp_site" 14 | "2423:tourism=alpine_hut" 15 | "2424:tourism=caravan_site" 16 | ) 17 | 18 | for layer in "${LAYER[@]}" 19 | do 20 | CODE="${layer%%:*}" 21 | KVF="${layer##*:}" 22 | K="${KVF%%=*}" 23 | VF="${KVF##*=}" 24 | V="${VF%%>*}" 25 | F="${VF##*>}" 26 | N="${F%%-*}" 27 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 28 | common_query > "../../sql/$F.sql" 29 | done 30 | -------------------------------------------------------------------------------- /dags/schemas/features_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "description": null, 4 | "name": "osm_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "description": null, 9 | "name": "osm_version", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "description": null, 14 | "name": "osm_way_id", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "description": "Last-modified timestamp for this object.", 19 | "name": "osm_timestamp", 20 | "type": "TIMESTAMP" 21 | }, 22 | { 23 | "description": "GEOGRAPHY-encoded point", 24 | "name": "geometry", 25 | "type": "GEOGRAPHY" 26 | }, 27 | { 28 | "description": "Unstructured key=value attributes for this object.", 29 | "fields": [ 30 | { 31 | "description": "Attribute key.", 32 | "name": "key", 33 | "type": "STRING" 34 | }, 35 | { 36 | "description": "Attribute value.", 37 | "name": "value", 38 | "type": "STRING" 39 | } 40 | ], 41 | "mode": "REPEATED", 42 | "name": "all_tags", 43 | "type": "RECORD" 44 | } 45 | ] -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/natural/natural.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=natural 5 | LAYER=( 6 | "4101:natural=spring" 7 | "4102:natural=glacier" 8 | "4111:natural=peak" 9 | "4112:natural=cliff" 10 | "4113:natural=volcano" 11 | "4121:natural=tree" 12 | "4131:natural=mine>mine-natural" 13 | "4131:historic=mine>mine-historic" 14 | "4131:landuse=mine>mine-landuse" 15 | "4131:survey_point=mine>mine-survey_point" 16 | 
"4131:industrial=mine>mine-industrial" 17 | "4132:natural=cave_entrance" 18 | "4141:natural=beach" 19 | "8300:natural=coastline" 20 | ) 21 | 22 | 23 | 24 | for layer in "${LAYER[@]}" 25 | do 26 | CODE="${layer%%:*}" 27 | KVF="${layer##*:}" 28 | K="${KVF%%=*}" 29 | VF="${KVF##*=}" 30 | V="${VF%%>*}" 31 | F="${VF##*>}" 32 | N="${F%%-*}" 33 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 34 | common_query > "../../sql/$F.sql" 35 | done 36 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_destination/poi_destination.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_destination 5 | LAYER=( 6 | "2721:tourism=attraction" 7 | "2722:tourism=museum" 8 | "2723:historic=monument" 9 | "2724:historic=memorial" 10 | "2725:tourism=artwork" 11 | "2731:historic=castle" 12 | "2732:historic=ruins" 13 | "2733:historic=archaeological_site" 14 | "2734:historic=wayside_cross" 15 | "2735:historic=wayside_shrine" 16 | "2736:historic=battlefield" 17 | "2737:historic=fort" 18 | "2741:tourism=picnic_site" 19 | "2742:tourism=viewpoint" 20 | "2743:tourism=zoo" 21 | "2744:tourism=theme_park" 22 | ) 23 | 24 | for layer in "${LAYER[@]}" 25 | do 26 | CODE="${layer%%:*}" 27 | KVF="${layer##*:}" 28 | K="${KVF%%=*}" 29 | VF="${KVF##*=}" 30 | V="${VF%%>*}" 31 | F="${VF##*>}" 32 | N="${F%%-*}" 33 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 34 | common_query > "../../sql/$F.sql" 35 | done 36 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/boundary/boundary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=boundary 5 | LAYER=( 6 | "1101:admin_level=1>admin_level1" 7 | "1102:admin_level=2>national" 8 | "1103:admin_level=3>admin_level3" 9 | "1104:admin_level=4>admin_level4" 10 | "1105:admin_level=5>admin_level5" 11 | "1106:admin_level=6>admin_level6" 12 | "1107:admin_level=7>admin_level7" 13 | "1108:admin_level=8>admin_level8" 14 | "1109:admin_level=9>admin_level9" 15 | "1110:admin_level=10>admin_level10" 16 | "1111:admin_level=11>admin_level11" 17 | ) 18 | 19 | 20 | for layer in "${LAYER[@]}" 21 | do 22 | CODE="${layer%%:*}" 23 | KVF="${layer##*:}" 24 | K="${KVF%%=*}" 25 | VF="${KVF##*=}" 26 | V="${VF%%>*}" 27 | F="${VF##*>}" 28 | N="${F%%-*}" 29 | 30 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V') 31 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'boundary' AND tags.value='administrative')" 32 | common_query > "../../sql/$F.sql" 33 | done 34 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk 2 | 3 | RUN cat /etc/os-release 4 | # update repos 5 | RUN apt-get update -y 6 | 7 | # install pyosmium dependencies 8 | RUN apt-get install build-essential cmake libboost-dev \ 9 | libexpat1-dev zlib1g-dev libbz2-dev -y 10 | # install GDAL 11 | RUN apt-get install binutils libproj-dev gdal-bin -y 12 | 13 | # install python GCS sdk 14 | 
RUN pip3 install --upgrade google-cloud-storage 15 | 16 | # install pyosmium 17 | RUN pip3 install osmium 18 | 19 | # install guppy3 (memory profiler) 20 | RUN pip3 install guppy3 21 | # install psutil 22 | RUN pip install psutil 23 | 24 | # set env vars 25 | ENV DATA_DIR /osm_converter_with_history_index/data/ 26 | 27 | # copy script files 28 | COPY src /osm_converter_with_history_index/src 29 | # set work dir 30 | WORKDIR /osm_converter_with_history_index/src 31 | 32 | # (Optional) GCP credentials setup for local runs 33 | #COPY keys /osm_converter_with_history_index/keys 34 | #ENV GOOGLE_APPLICATION_CREDENTIALS=/osm_converter_with_history_index/keys/gcloud_keys.json 35 | 36 | CMD python3 main.py $SRC_OSM_GCS_URI --index_db_and_metadata_gcs_dir $INDEX_DB_AND_METADATA_DIR_GCS_URI --converted_gcs_dir $CONVERTED_OSM_DIR_GCS_URI --num_db_shards $NUM_DB_SHARDS --num_threads $NUM_THREADS $ADDITIONAL_ARGS --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/query_templates_planet.sh: -------------------------------------------------------------------------------- 1 | common_query() { 2 | echo " 3 | WITH osm AS ( 4 | SELECT id, null AS way_id, osm_timestamp, version, all_tags FROM \`${BQ_DATASET_TO_EXPORT}.planet_nodes\` 5 | UNION ALL 6 | SELECT id, id AS way_id, osm_timestamp, version, all_tags FROM \`${BQ_DATASET_TO_EXPORT}.planet_ways\` 7 | UNION ALL 8 | SELECT id, null AS way_id, osm_timestamp, version, all_tags FROM \`${BQ_DATASET_TO_EXPORT}.planet_relations\` 9 | ) 10 | SELECT $CODE AS layer_code, '$CLASS' AS layer_class, '$NAME_PREFIX$N' AS layer_name, f.feature_type AS gdal_type, 11 | f.osm_id AS osm_id, 12 | f.osm_way_id AS osm_way_id, 13 | f.osm_timestamp, 14 | osm.version AS osm_version, 15 | osm.all_tags, 16 | f.geometry 17 | FROM \`${BQ_DATASET_TO_EXPORT}.planet_features\` AS f, osm 18 | WHERE osm.id = f.osm_id AND osm.osm_timestamp = f.osm_timestamp 19 | $EXTRA_CONSTRAINTS 20 | 21 | UNION ALL 22 | 23 | SELECT $CODE AS layer_code, '$CLASS' AS layer_class, '$NAME_PREFIX$N' AS layer_name, f.feature_type AS gdal_type, 24 | f.osm_id AS osm_id, 25 | f.osm_way_id AS osm_way_id, 26 | f.osm_timestamp, 27 | osm.version AS osm_version, 28 | osm.all_tags, 29 | f.geometry 30 | FROM \`${BQ_DATASET_TO_EXPORT}.planet_features\` AS f, osm 31 | WHERE osm.way_id = f.osm_way_id AND osm.osm_timestamp = f.osm_timestamp 32 | $EXTRA_CONSTRAINTS 33 | " 34 | } 35 | --------------------------------------------------------------------------------
/examples/clustering/tf_idf/materialize.sql: -------------------------------------------------------------------------------- 1 | -- Selects geo ID, geography, TF-IDF vector, TF-IDF features, 2 | -- lbcs category name, lbcs color, similarity with lbcs category 3 | WITH features AS (SELECT ARRAY_AGG(word ORDER BY word) as words 4 | FROM `gcp-pdp-osm-dev.words.w2v_glove_6B_300d_osm_tags`) 5 | ,similarities AS (SELECT 6 | grid.geo_id, 7 | MAX(udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec)) as max_similarity 8 | FROM 9 | `gcp-pdp-osm-dev.osm_clustering_grid_01km.vectors_tfidf` tfidf 10 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` grid USING(geo_id) 11 | CROSS JOIN `gcp-pdp-osm-dev.lbcs.lbcs_tfidf` lbcs 12 | WHERE lbcs.dimension = 'Function' 13 | AND udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) > 0 14 | GROUP BY grid.geo_id) 15 | SELECT 16 | grid.geo_id, 17 | grid.geog, 18 | grid.city_name, 19 | tfidf.tfidf_vec, 20 | features.words as tfidf_features, 21 |
lbcs.name, 22 | lbcs.color, 23 | udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) as similarity 24 | FROM 25 | `gcp-pdp-osm-dev.osm_clustering_grid_01km.vectors_tfidf` tfidf 26 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` grid USING(geo_id) 27 | CROSS JOIN features 28 | CROSS JOIN `gcp-pdp-osm-dev.lbcs.lbcs_tfidf` lbcs 29 | JOIN similarities ON similarities.geo_id = tfidf.geo_id AND similarities.max_similarity = udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) 30 | AND lbcs.dimension = 'Function' 31 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_features/src/osmconf.ini: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file for OSM import 3 | # 4 | 5 | # put here the name of keys, or key=value, for ways that are assumed to be polygons if they are closed 6 | # see http://wiki.openstreetmap.org/wiki/Map_Features 7 | closed_ways_are_polygons=aeroway,amenity,boundary,building,craft,geological,historic,landuse,leisure,military,natural,office,place,shop,sport,tourism,highway=platform,public_transport=platform 8 | 9 | # comment to avoid laundering of keys ( ':' turned into '_' ) 10 | attribute_name_laundering=yes 11 | 12 | # keys that should NOT be reported in the "other_tags" field 13 | ignore=created_by,converted_by,source,time,ele,note,openGeoDB:,fixme,FIXME 14 | 15 | [lines] 16 | # common attributes 17 | osm_id=yes 18 | osm_version=yes 19 | osm_timestamp=yes 20 | other_tags=no 21 | # create "all_tags" field 22 | all_tags=yes 23 | 24 | [multilinestrings] 25 | # common attributes 26 | osm_id=yes 27 | osm_version=yes 28 | osm_timestamp=yes 29 | other_tags=no 30 | # create "all_tags" field 31 | all_tags=yes 32 | 33 | [multipolygons] 34 | # common attributes 35 | osm_id=yes 36 | osm_version=yes 37 | osm_timestamp=yes 38 | other_tags=no 39 | # create "all_tags" field 40 | all_tags=yes 41 | 42 | [other_relations] 43 | # common attributes 44 | osm_id=yes 45 | osm_version=yes 46 | osm_timestamp=yes 47 | other_tags=no 48 | # create "all_tags" field 49 | all_tags=yes 50 | 51 | [points] 52 | # common attributes 53 | osm_id=yes 54 | osm_version=yes 55 | osm_timestamp=yes 56 | other_tags=no 57 | # create "all_tags" field 58 | all_tags=yes 59 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/gdal/osmconf.ini: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file for OSM import 3 | # 4 | 5 | # put here the name of keys, or key=value, for ways that are assumed to be polygons if they are closed 6 | # see http://wiki.openstreetmap.org/wiki/Map_Features 7 | closed_ways_are_polygons=aeroway,amenity,boundary,building,craft,geological,historic,landuse,leisure,military,natural,office,place,shop,sport,tourism,highway=platform,public_transport=platform 8 | 9 | # comment to avoid laundering of keys ( ':' turned into '_' ) 10 | attribute_name_laundering=yes 11 | 12 | # keys that should NOT be reported in the "other_tags" field 13 | ignore=created_by,converted_by,source,time,ele,note,openGeoDB:,fixme,FIXME 14 | 15 | [lines] 16 | # common attributes 17 | osm_id=yes 18 | osm_version=no 19 | osm_timestamp=no 20 | other_tags=no 21 | # create "all_tags" field 22 | all_tags=no 23 | 24 | [multilinestrings] 25 | # common attributes 26 | osm_id=yes 27 | osm_version=no 28 | osm_timestamp=no 29 | other_tags=no 30 | # create 
"all_tags" field 31 | all_tags=no 32 | 33 | [multipolygons] 34 | # common attributes 35 | osm_id=yes 36 | osm_version=no 37 | osm_timestamp=no 38 | other_tags=no 39 | # create "all_tags" field 40 | all_tags=no 41 | 42 | [other_relations] 43 | # common attributes 44 | osm_id=yes 45 | osm_version=no 46 | osm_timestamp=no 47 | other_tags=no 48 | # create "all_tags" field 49 | all_tags=no 50 | 51 | [points] 52 | # common attributes 53 | osm_id=yes 54 | osm_version=no 55 | osm_timestamp=no 56 | other_tags=no 57 | # create "all_tags" field 58 | all_tags=no 59 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_leisure/poi_leisure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_leisure 5 | LAYER=( 6 | "2201:amenity=theatre" 7 | "2202:amenity=nightclub" 8 | "2203:amenity=cinema" 9 | "2204:leisure=park" 10 | "2205:leisure=playground" 11 | "2206:leisure=dog_park" 12 | "2251:leisure=sports_centre" 13 | "2252:leisure=pitch" 14 | "2254:sport=tennis>tennis_court" 15 | "2255:leisure=golf_course" 16 | "2256:leisure=stadium" 17 | "2257:leisure=ice_rink" 18 | ) 19 | 20 | for layer in "${LAYER[@]}" 21 | do 22 | CODE="${layer%%:*}" 23 | KVF="${layer##*:}" 24 | K="${KVF%%=*}" 25 | VF="${KVF##*=}" 26 | V="${VF%%>*}" 27 | F="${VF##*>}" 28 | N="${F%%-*}" 29 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 30 | common_query > "../../sql/$F.sql" 31 | done 32 | 33 | CODE=2253 34 | N=swimming_pool 35 | F=swimming_pool 36 | EXTRA_CONSTRAINTS=" 37 | AND ( 38 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='swimming_pool') 39 | OR 40 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'leisure' AND tags.value='swimming_pool') 41 | OR 42 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'sport' AND tags.value='swimming') 43 | OR 44 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'leisure' AND tags.value='water_park') 45 | )" 46 | common_query > "../../sql/$F.sql" 47 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/traffic_calming/traffic_calming.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=traffic 5 | LAYER=( 6 | "5231:traffic_calming=hump" 7 | "5232:traffic_calming=bump" 8 | "5233:traffic_calming=table" 9 | "5234:traffic_calming=chicane" 10 | "5235:traffic_calming=cushion" 11 | ) 12 | 13 | for layer in "${LAYER[@]}" 14 | do 15 | CODE="${layer%%:*}" 16 | KVF="${layer##*:}" 17 | K="${KVF%%=*}" 18 | VF="${KVF##*=}" 19 | V="${VF%%>*}" 20 | F="${VF##*>}" 21 | N="${F%%-*}" 22 | NAME_PREFIX=calming_ 23 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 24 | common_query > "../../sql/$NAME_PREFIX$F.sql" 25 | done 26 | 27 | #5230 28 | CODE=5230 29 | N=calming 30 | F=calming 31 | NAME_PREFIX="" 32 | EXTRA_CONSTRAINTS=" 33 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming') 34 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming' AND tags.value='hump') 35 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 
'traffic_calming' AND tags.value='bump') 36 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming' AND tags.value='table') 37 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming' AND tags.value='chicane') 38 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming' AND tags.value='cushion')" 39 | common_query > "../../sql/$F.sql" 40 | -------------------------------------------------------------------------------- /dags/schemas/nodes_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type": "INTEGER", 4 | "name": "id", 5 | "description": "Object unique ID." 6 | }, 7 | { 8 | "type": "INTEGER", 9 | "name": "version", 10 | "description": "Version number for this object." 11 | }, 12 | { 13 | "type": "STRING", 14 | "name": "username", 15 | "description": "Name of user who created this version of the object." 16 | }, 17 | { 18 | "type": "INTEGER", 19 | "name": "changeset", 20 | "description": "Changeset number for this object." 21 | }, 22 | { 23 | "type": "BOOLEAN", 24 | "name": "visible", 25 | "description": "Is this version of the object visible?" 26 | }, 27 | { 28 | "type": "TIMESTAMP", 29 | "name": "osm_timestamp", 30 | "description": "Last-modified timestamp for this object." 31 | }, 32 | { 33 | "type": "GEOGRAPHY", 34 | "name": "geometry", 35 | "description": "GEOGRAPHY-encoded point" 36 | }, 37 | { 38 | "type": "RECORD", 39 | "mode": "REPEATED", 40 | "name": "all_tags", 41 | "description": "Unstructured key=value attributes for this object.", 42 | "fields": [ 43 | { 44 | "type": "STRING", 45 | "name": "key", 46 | "description": "Attribute key." 47 | }, 48 | { 49 | "type": "STRING", 50 | "name": "value", 51 | "description": "Attribute value." 52 | } 53 | ] 54 | }, 55 | { 56 | "description": null, 57 | "name": "latitude", 58 | "type": "NUMERIC" 59 | }, 60 | { 61 | "description": null, 62 | "name": "longitude", 63 | "type": "NUMERIC" 64 | } 65 | ] -------------------------------------------------------------------------------- /dags/schemas/ways_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type":"INTEGER", 4 | "name":"id", 5 | "description": "Object unique ID." 6 | }, 7 | { 8 | "type":"INTEGER", 9 | "name":"version", 10 | "description": "Version number for this object." 11 | }, 12 | { 13 | "type":"STRING", 14 | "name":"username", 15 | "description": "Name of user who created this version of the object." 16 | }, 17 | { 18 | "type":"INTEGER", 19 | "name":"changeset", 20 | "description": "Changeset number for this object." 21 | }, 22 | { 23 | "type":"BOOLEAN", 24 | "name":"visible", 25 | "description": "Is this version of the object visible?" 26 | }, 27 | { 28 | "type":"TIMESTAMP", 29 | "name":"osm_timestamp", 30 | "description": "Last-modified timestamp for this object." 
31 | }, 32 | { 33 | "type":"GEOGRAPHY", 34 | "name":"geometry", 35 | "description": "GEOGRAPHY-encoded bounding box" 36 | }, 37 | { 38 | "type":"RECORD", 39 | "mode":"REPEATED", 40 | "name":"nodes", 41 | "fields":[ 42 | { 43 | "type":"INTEGER", 44 | "name":"id", 45 | "description": "Nodes that are part of this way" 46 | } 47 | ] 48 | }, 49 | { 50 | "type":"RECORD", 51 | "mode":"REPEATED", 52 | "name":"all_tags", 53 | "description": "Unstructured key=value attributes for this object.", 54 | "fields":[ 55 | { 56 | "type":"STRING", 57 | "name":"key", 58 | "description": "Attribute key." 59 | }, 60 | { 61 | "type":"STRING", 62 | "name":"value", 63 | "description": "Attribute value." 64 | } 65 | ] 66 | } 67 | ] -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/land_use/land_use.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=land_use 5 | LAYER=( 6 | "7201:landuse=forest>forest-landuse" 7 | "7201:natural=wood>forest-natural" 8 | "7202:leisure=park>park-park" 9 | "7202:leisure=common>park-common" 10 | "7203:landuse=residential" 11 | "7204:landuse=industrial" 12 | "7206:amenity=grave_yard>cemetery-amenity" 13 | "7206:landuse=cemetery>cemetery-landuse" 14 | "7207:landuse=allotments" 15 | "7208:landuse=meadow" 16 | "7209:landuse=commercial" 17 | "7210:leisure=nature_reserve" 18 | "7211:leisure=recreation_ground>recreation_ground-leisure" 19 | "7211:landuse=recreation_ground>recreation_ground-landuse" 20 | "7212:landuse=retail" 21 | "7213:landuse=military" 22 | "7214:landuse=quarry" 23 | "7215:landuse=orchard" 24 | "7216:landuse=vineyard" 25 | "7217:landuse=scrub" 26 | "7218:landuse=grass" 27 | "7219:landuse=heath" 28 | "7220:boundary=national_park" 29 | "7221:landuse=basin" 30 | "7222:landuse=village_green" 31 | "7223:landuse=plant_nursery" 32 | "7224:landuse=brownfield" 33 | "7225:landuse=greenfield" 34 | "7226:landuse=construction" 35 | "7227:landuse=railway" 36 | "7228:landuse=farmland" 37 | "7229:landuse=farmyard" 38 | 39 | ) 40 | 41 | for layer in "${LAYER[@]}" 42 | do 43 | CODE="${layer%%:*}" 44 | KVF="${layer##*:}" 45 | K="${KVF%%=*}" 46 | VF="${KVF##*=}" 47 | V="${VF%%>*}" 48 | F="${VF##*>}" 49 | N="${F%%-*}" 50 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 51 | common_query > "../../sql/$F.sql" 52 | done 53 | -------------------------------------------------------------------------------- /examples/clustering/tf_idf/vectorize.sql: -------------------------------------------------------------------------------- 1 | WITH objects_with_terms AS (SELECT osm_id, geometry, term 2 | FROM `gcp-pdp-osm-dev.osm_cities.cities_objects` as objects 3 | JOIN UNNEST(SPLIT(CONCAT(layer_class, "_", layer_name), "_")) as term 4 | WHERE objects.city_name = 'Kyiv') 5 | , data AS ( 6 | SELECT 7 | grid.geo_id, 8 | objects.term 9 | FROM 10 | objects_with_terms AS objects, 11 | `gcp-pdp-osm-dev.osm_cities.cities_population_grid_1km` as grid 12 | WHERE ST_INTERSECTS(grid.geog, objects.geometry) 13 | ) 14 | , counts AS (SELECT 15 | geo_id, 16 | term, 17 | COUNT(term) OVER(partition by CONCAT(geo_id, term)) as term_count, 18 | COUNT(term) OVER(partition by geo_id) as terms_in_cell 19 | FROM data) 20 | , tf AS (SELECT geo_id, term, ANY_VALUE(term_count)/ANY_VALUE(terms_in_cell) as tf 21 | FROM counts 22 | GROUP BY geo_id, term) 23 | , term_in_cells AS ( 24 | SELECT 
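-- document frequency: the number of distinct grid cells that contain each term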
term, COUNT(DISTINCT geo_id) in_cells 25 | FROM data 26 | GROUP BY 1 27 | ) 28 | , total_cells AS ( 29 | SELECT COUNT(DISTINCT geo_id) total_cells 30 | FROM data 31 | ) 32 | , idf AS ( 33 | SELECT term, LOG(total_cells.total_cells/in_cells) idf 34 | FROM term_in_cells 35 | CROSS JOIN total_cells 36 | ) 37 | , tf_idf AS ( 38 | SELECT 39 | geo_id, 40 | term, 41 | tf.tf * idf.idf tfidf, 42 | CONCAT(term, ': ', CAST(tf.tf * idf.idf AS STRING)) as term_and_tfidf 43 | FROM tf 44 | JOIN idf 45 | USING(term) 46 | ORDER BY geo_id, tfidf DESC 47 | ) 48 | , features_matrix AS (SELECT geo_id, word 49 | FROM `gcp-pdp-osm-dev.words.w2v_glove_6B_300d_osm_tags` 50 | CROSS JOIN (SELECT geo_id FROM data GROUP BY geo_id) 51 | ORDER BY geo_id, word) 52 | SELECT 53 | fm.geo_id, ARRAY_AGG(fm.word ORDER BY fm.word) as words, ARRAY_AGG(IFNULL(tf_idf.tfidf, 0.0) ORDER BY fm.word) as tfidf_vec 54 | FROM features_matrix fm 55 | LEFT JOIN tf_idf ON tf_idf.term = fm.word AND tf_idf.geo_id = fm.geo_id 56 | GROUP BY geo_id 57 | ORDER BY geo_id -------------------------------------------------------------------------------- /dags/schemas/relations_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type":"INTEGER", 4 | "name":"id", 5 | "description": "Object unique ID." 6 | }, 7 | { 8 | "type":"INTEGER", 9 | "name":"version", 10 | "description": "Version number for this object." 11 | }, 12 | { 13 | "type":"STRING", 14 | "name":"username", 15 | "description": "Name of user who created this version of the object." 16 | }, 17 | { 18 | "type":"INTEGER", 19 | "name":"changeset", 20 | "description": "Changeset number for this object." 21 | }, 22 | { 23 | "type":"BOOLEAN", 24 | "name":"visible", 25 | "description": "Is this version of the object visible?" 26 | }, 27 | { 28 | "type":"TIMESTAMP", 29 | "name":"osm_timestamp", 30 | "description": "Last-modified timestamp for this object." 31 | }, 32 | { 33 | "type":"GEOGRAPHY", 34 | "name":"geometry", 35 | "description": "GEOGRAPHY-encoded bounding box" 36 | }, 37 | { 38 | "type":"RECORD", 39 | "mode":"REPEATED", 40 | "name":"members", 41 | "fields":[ 42 | { 43 | "type":"STRING", 44 | "name":"type", 45 | "description": null 46 | }, 47 | { 48 | "type":"INTEGER", 49 | "name":"id", 50 | "description": "Relations that are part of this relation" 51 | }, 52 | { 53 | "type":"STRING", 54 | "name":"role", 55 | "description": "Role of this relation, if any." 56 | } 57 | ] 58 | }, 59 | { 60 | "type":"RECORD", 61 | "mode":"REPEATED", 62 | "name":"all_tags", 63 | "description": "Unstructured key=value attributes for this object.", 64 | "fields":[ 65 | { 66 | "type":"STRING", 67 | "name":"key", 68 | "description": "Attribute key." 69 | }, 70 | { 71 | "type":"STRING", 72 | "name":"value", 73 | "description": "Attribute value." 74 | } 75 | ] 76 | } 77 | ] -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/schema/layers_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type":"INTEGER", 4 | "name":"layer_code", 5 | "description": "Geofabrik layer code. Example: 1003. Layers are hierarchical; The mask code=10xx corresponds to 'place' layers, and code=1003 corresponds to a 'village' type of place." 6 | }, 7 | { 8 | "type":"STRING", 9 | "name":"layer_class", 10 | "description": "Geofabrik layer class, a friendly name for layer_code. 
Example: 'place'" 11 | }, 12 | { 13 | "type":"STRING", 14 | "name":"layer_name", 15 | "description": "Geofabrik layer name, a friendly name for layer_code. Example: 'village'" 16 | }, 17 | { 18 | "type":"STRING", 19 | "name":"gdal_type", 20 | "description":"OpenStreetMap feature type. One of: point, line, multilinestring, multipolygon, other_relation" 21 | }, 22 | { 23 | "type":"INTEGER", 24 | "name":"osm_id", 25 | "description": "OSM Id taken from the id of this feature (node_id or relation_id) in the OSM database." 26 | }, 27 | { 28 | "type":"INTEGER", 29 | "name":"osm_way_id", 30 | "description": "OSM Way Id taken from the id of this feature (way_id) in the OSM database." 31 | }, 32 | { 33 | "type":"INTEGER", 34 | "name":"osm_version", 35 | "description": "Version number for this object." 36 | }, 37 | { 38 | "type":"TIMESTAMP", 39 | "name":"osm_timestamp", 40 | "description": "Last-modified timestamp for this object." 41 | }, 42 | { 43 | "type":"RECORD", 44 | "mode":"REPEATED", 45 | "name":"all_tags", 46 | "description": "Unstructured key=value attributes for this object.", 47 | "fields":[ 48 | { 49 | "type":"STRING", 50 | "name":"key", 51 | "description": "Attribute key." 52 | }, 53 | { 54 | "type":"STRING", 55 | "name":"value", 56 | "description": "Attribute value." 57 | } 58 | ] 59 | }, 60 | { 61 | "type":"GEOGRAPHY", 62 | "name":"geometry", 63 | "description": "GEOGRAPHY-encoded object" 64 | } 65 | ] 66 | -------------------------------------------------------------------------------- /utils/get_client_id.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import logging 4 | 5 | logging.getLogger().setLevel(logging.INFO) 6 | 7 | def get_client_id(project_id, location, composer_environment): 8 | import google.auth 9 | import google.auth.transport.requests 10 | import requests 11 | import six.moves.urllib.parse 12 | 13 | # Authenticate with Google Cloud. 14 | # See: https://cloud.google.com/docs/authentication/getting-started 15 | credentials, _ = google.auth.default( 16 | scopes=['https://www.googleapis.com/auth/cloud-platform']) 17 | authed_session = google.auth.transport.requests.AuthorizedSession( 18 | credentials) 19 | 20 | environment_url = ( 21 | 'https://composer.googleapis.com/v1beta1/projects/{}/locations/{}' 22 | '/environments/{}').format(project_id, location, composer_environment) 23 | composer_response = authed_session.request('GET', environment_url) 24 | environment_data = composer_response.json() 25 | airflow_uri = environment_data['config']['airflowUri'] 26 | 27 | # The Composer environment response does not include the IAP client ID. 28 | # Make a second, unauthenticated HTTP request to the web server to get the 29 | # redirect URI. 30 | redirect_response = requests.get(airflow_uri, allow_redirects=False) 31 | redirect_location = redirect_response.headers['location'] 32 | 33 | # Extract the client_id query parameter from the redirect. 
34 |     parsed = six.moves.urllib.parse.urlparse(redirect_location)
35 |     query_string = six.moves.urllib.parse.parse_qs(parsed.query)
36 |     return query_string['client_id'][0]
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     parser = argparse.ArgumentParser(
41 |         description=__doc__,
42 |         formatter_class=argparse.RawDescriptionHelpFormatter)
43 |     parser.add_argument('project_id', help='Your Project ID.')
44 |     parser.add_argument(
45 |         'location', help='Region of the Cloud Composer environment.')
46 |     parser.add_argument(
47 |         'composer_environment', help='Name of the Cloud Composer environment.')
48 | 
49 |     args = parser.parse_args()
50 |     logging.info(args)
51 |     client_id = get_client_id(args.project_id, args.location, args.composer_environment)
52 |     print(client_id)
53 |     sys.exit(0)
54 | 
--------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/traffic/traffic.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | source ../query_templates.sh
3 | 
4 | CLASS=traffic
5 | LAYER=(
6 |   "5201:highway=traffic_signals"
7 |   "5202:highway=mini_roundabout"
8 |   "5203:highway=stop"
9 |   "5204:highway=crossing>crossing-highway"
10 |   "5204:railway=level_crossing>crossing-railway"
11 |   "5205:highway=ford"
12 |   "5206:highway=motorway_junction"
13 |   "5207:highway=turning_circle"
14 |   "5208:highway=speed_camera"
15 |   "5209:highway=street_lamp"
16 |   "5250:amenity=fuel"
17 |   "5251:highway=services>services"
18 |   "5251:highway=service>service"
19 |   "5270:amenity=bicycle_parking"
20 | )
21 | 
22 | for layer in "${LAYER[@]}"
23 | do
24 |   CODE="${layer%%:*}"
25 |   KVF="${layer##*:}"
26 |   K="${KVF%%=*}"
27 |   VF="${KVF##*=}"
28 |   V="${VF%%>*}"
29 |   F="${VF##*>}"
30 |   N="${F%%-*}"
31 |   EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')"
32 |   common_query > "../../sql/$F.sql"
33 | done
34 | 
35 | LAYER=(
36 |   "5261:parking=surface>parking_site"
37 |   "5262:parking=multi-storey>parking_multistorey"
38 |   "5263:parking=underground>parking_underground"
39 | )
40 | for layer in "${LAYER[@]}"
41 | do
42 |   CODE="${layer%%:*}"
43 |   KVF="${layer##*:}"
44 |   K="${KVF%%=*}"
45 |   VF="${KVF##*=}"
46 |   V="${VF%%>*}"
47 |   F="${VF##*>}"
48 |   N="${F%%-*}"
49 |   EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')
50 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='parking')"
51 |   common_query > "../../sql/$F.sql"
52 | done
53 | 
54 | 
55 | CODE=5260
56 | N=parking
57 | F=parking
58 | EXTRA_CONSTRAINTS="
59 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='parking')
60 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'parking' AND tags.value='surface')
61 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'parking' AND tags.value='multi-storey')
62 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'parking' AND tags.value='underground')"
63 | common_query > "../../sql/$F.sql"
64 | 
--------------------------------------------------------------------------------
/tasks_docker_images/osm_converter_with_history_index/src/gcs_service.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | 
4 | from google.cloud import storage
5 | 
6 | 
7 | def 
parse_uri_to_bucket_and_filename(file_path):
8 |     """Splits a file URI into its bucket name and file name"""
9 |     path_parts = file_path.split("//")
10 |     if len(path_parts) >= 2:
11 |         main_part = path_parts[1]
12 |         if "/" in main_part:
13 |             divide_index = main_part.index("/")
14 |             bucket_name = main_part[:divide_index]
15 |             file_name = main_part[divide_index + 1 - len(main_part):]
16 | 
17 |             return bucket_name, file_name
18 |     return "", ""
19 | 
20 | 
21 | def from_gcs_to_local_file(src_gcs_bucket, src_gcs_name, local_file_path):
22 |     storage_client = storage.Client(os.environ['PROJECT_ID'])
23 |     # Create a bucket object for our bucket
24 |     bucket = storage_client.get_bucket(src_gcs_bucket)
25 |     # Create a blob object from the filepath
26 |     blob = bucket.blob(src_gcs_name)
27 |     # Download the file to a destination
28 |     logging.info("Downloading gs://{}/{} to {}...".format(src_gcs_bucket, src_gcs_name, local_file_path))
29 |     blob.download_to_filename(local_file_path)
30 |     logging.info("Successfully downloaded gs://{}/{} to {}".format(src_gcs_bucket, src_gcs_name, local_file_path))
31 | 
32 | 
33 | def is_gcs_blob_exists(bucket, blob_name):
34 |     storage_client = storage.Client(os.environ['PROJECT_ID'])
35 |     # Create a bucket object for our bucket
36 |     bucket = storage_client.get_bucket(bucket)
37 |     # Create a blob object from the filepath
38 |     blob = bucket.blob(blob_name)
39 |     return blob.exists()
40 | 
41 | 
42 | def upload_file_to_gcs(filename, destination_bucket_name, destination_blob_name):
43 |     """
44 |     Uploads a file to a given Cloud Storage bucket
45 |     (the destination blob is created or overwritten; nothing is returned).
46 |     """
47 |     bucket = storage.Client().bucket(destination_bucket_name)
48 |     blob = bucket.blob(destination_blob_name)
49 |     logging.info("Uploading of {} to gs://{}/{}...".format(filename, destination_bucket_name, destination_blob_name))
50 |     blob.upload_from_filename(
51 |         filename,
52 |         content_type="text/plain")
53 |     logging.info(
54 |         "Finished uploading of {} to gs://{}/{}".format(filename, destination_bucket_name, destination_blob_name))
55 | 
--------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/transport/transport.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | source ../query_templates.sh
3 | 
4 | CLASS=transport
5 | LAYER=(
6 |   "5621:highway=bus_stop>bus_stop-highway"
7 |   "5622:amenity=bus_station"
8 |   "5641:amenity=taxi"
9 |   "5652:aeroway=airfield>airfield-aeroway"
10 |   "5652:military=airfield>airfield-military"
11 |   "5655:aeroway=helipad"
12 |   "5656:aeroway=apron"
13 |   "5661:amenity=ferry_terminal"
14 |   "5671:aerialway=station>aerialway_station"
15 | )
16 | 
17 | for layer in "${LAYER[@]}"
18 | do
19 |   CODE="${layer%%:*}"
20 |   KVF="${layer##*:}"
21 |   K="${KVF%%=*}"
22 |   VF="${KVF##*=}"
23 |   V="${VF%%>*}"
24 |   F="${VF##*>}"
25 |   N="${F%%-*}"
26 |   EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')"
27 |   common_query > "../../sql/$F.sql"
28 | done
29 | 
30 | CODE=5621
31 | N=bus_stop
32 | F=bus_stop-public_transport
33 | #highway=bus_stop, or public_transport=stop_position + bus=yes
34 | EXTRA_CONSTRAINTS="
35 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'public_transport' AND tags.value='stop_position')
36 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'bus' AND tags.value='yes')
37 | AND COALESCE(osm.id,osm.way_id) = COALESCE(f.osm_id,f.osm_way_id)"
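# Note: in addition to the tag filters, this variant correlates the osm row
# with the query template's f alias via the COALESCE'd node/way ids, so the
# public_transport variant lands in its own bus_stop-public_transport.sql.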
38 | common_query > "../../sql/$F.sql" 39 | 40 | CODE=5651 41 | N=airport 42 | F=airport 43 | #amenity=airport or aeroway=aerodrome unless type=airstrip 44 | EXTRA_CONSTRAINTS=" 45 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE (tags.key = 'type' AND tags.value='airstrip')) 46 | AND ( 47 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='airport') 48 | OR 49 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'aeroway' AND tags.value='aerodrome') 50 | )" 51 | common_query > "../../sql/$F.sql" 52 | 53 | CODE=5652 54 | N=airfield 55 | F=airfield-airstrip 56 | #aeroway=airfield, military=airfield, aeroway=aeroway with type=airstrip 57 | EXTRA_CONSTRAINTS=" 58 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'aeroway' AND tags.value='aeroway') 59 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'type' AND tags.value='airstrip')" 60 | common_query > "../../sql/$F.sql" 61 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/pofw/pofw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=pofw 5 | LAYER=( 6 | "3100:religion=christian" 7 | "3200:religion=jewish" 8 | "3300:religion=muslim" 9 | "3400:religion=buddhist" 10 | "3500:religion=hindu" 11 | "3600:religion=taoist" 12 | "3700:religion=shinto" 13 | "3800:religion=sikh" 14 | ) 15 | 16 | for layer in "${LAYER[@]}" 17 | do 18 | CODE="${layer%%:*}" 19 | KVF="${layer##*:}" 20 | K="${KVF%%=*}" 21 | VF="${KVF##*=}" 22 | V="${VF%%>*}" 23 | F="${VF##*>}" 24 | N="${F%%-*}" 25 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 26 | common_query > "../../sql/$F.sql" 27 | done 28 | 29 | LAYER=( 30 | "3101:denomination=anglican>christian_anglican" 31 | "3102:denomination=catholic>christian_catholic" 32 | "3103:denomination=evangelical>christian_evangelical" 33 | "3104:denomination=lutheran>christian_lutheran" 34 | "3105:denomination=methodist>christian_methodist" 35 | "3106:denomination=orthodox>christian_orthodox" 36 | "3107:denomination=protestant>christian_protestant" 37 | "3108:denomination=baptist>christian_baptist" 38 | "3109:denomination=mormon>christian_mormon" 39 | ) 40 | for layer in "${LAYER[@]}" 41 | do 42 | CODE="${layer%%:*}" 43 | KVF="${layer##*:}" 44 | K="${KVF%%=*}" 45 | VF="${KVF##*=}" 46 | V="${VF%%>*}" 47 | F="${VF##*>}" 48 | N="${F%%-*}" 49 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V') 50 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'religion' AND tags.value='christian')" 51 | common_query > "../../sql/$F.sql" 52 | done 53 | 54 | LAYER=( 55 | "3301:denomination=sunni>muslim_sunni" 56 | "3302:denomination=shia>muslim_shia" 57 | ) 58 | for layer in "${LAYER[@]}" 59 | do 60 | CODE="${layer%%:*}" 61 | KVF="${layer##*:}" 62 | K="${KVF%%=*}" 63 | VF="${KVF##*=}" 64 | V="${VF%%>*}" 65 | F="${VF##*>}" 66 | N="${F%%-*}" 67 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V') 68 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'religion' AND tags.value='muslim')" 69 | common_query > "../../sql/$F.sql" 70 | done 71 | -------------------------------------------------------------------------------- 
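All of the layered_gis generators in this image (pofw.sh above, traffic.sh, transport.sh, and the rest) share one convention: each LAYER entry is a spec of the form "CODE:key=value[>file[-suffix]]", which the scripts split apart with bash parameter expansion ("%%pat" strips the longest matching suffix, "##pat" the longest matching prefix). As a reading aid, here is a minimal Python sketch of the same parsing; parse_layer_spec is hypothetical and not a function that exists anywhere in this repo:

def parse_layer_spec(layer):
    """Hypothetical mirror of the bash expansions in the layered_gis scripts."""
    code = layer.partition(":")[0]       # CODE="${layer%%:*}"  (before first ':')
    kvf = layer.rpartition(":")[2]       # KVF="${layer##*:}"   (after last ':')
    key = kvf.partition("=")[0]          # K="${KVF%%=*}"       (before first '=')
    vf = kvf.rpartition("=")[2]          # VF="${KVF##*=}"      (after last '=')
    value = vf.partition(">")[0]         # V="${VF%%>*}"        (before first '>')
    sql_file = vf.rpartition(">")[2]     # F="${VF##*>}"        (all of VF if no '>')
    name = sql_file.partition("-")[0]    # N="${F%%-*}"         (before first '-')
    return code, key, value, sql_file, name

# parse_layer_spec("3301:denomination=sunni>muslim_sunni")
#   -> ("3301", "denomination", "sunni", "muslim_sunni", "muslim_sunni")
# parse_layer_spec("7201:natural=wood>forest-natural")
#   -> ("7201", "natural", "wood", "forest-natural", "forest")

Each script interpolates K and V into EXTRA_CONSTRAINTS, and common_query (defined in the sourced query_templates.sh, not shown here) writes the generated SQL to ../../sql/$F.sql; CODE and N presumably populate the layer_code and layer_name columns described in src/schema/layers_schema.json.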
/tasks_docker_images/osm_to_features/utils/get_client_id.py:
--------------------------------------------------------------------------------
1 | """Get the client ID associated with a Cloud Composer environment."""
2 | 
3 | import argparse
4 | 
5 | 
6 | def get_client_id(project_id, location, composer_environment):
7 |     # [START composer_get_environment_client_id]
8 |     import google.auth
9 |     import google.auth.transport.requests
10 |     import requests
11 |     import urllib.parse
12 | 
13 |     # Authenticate with Google Cloud.
14 |     # See: https://cloud.google.com/docs/authentication/getting-started
15 |     credentials, _ = google.auth.default(
16 |         scopes=['https://www.googleapis.com/auth/cloud-platform'])
17 |     authed_session = google.auth.transport.requests.AuthorizedSession(
18 |         credentials)
19 | 
20 |     # project_id = 'YOUR_PROJECT_ID'
21 |     # location = 'us-central1'
22 |     # composer_environment = 'YOUR_COMPOSER_ENVIRONMENT_NAME'
23 | 
24 |     environment_url = (
25 |         'https://composer.googleapis.com/v1beta1/projects/{}/locations/{}'
26 |         '/environments/{}').format(project_id, location, composer_environment)
27 |     composer_response = authed_session.request('GET', environment_url)
28 |     environment_data = composer_response.json()
29 |     airflow_uri = environment_data['config']['airflowUri']
30 | 
31 |     # The Composer environment response does not include the IAP client ID.
32 |     # Make a second, unauthenticated HTTP request to the web server to get the
33 |     # redirect URI.
34 |     redirect_response = requests.get(airflow_uri, allow_redirects=False)
35 |     redirect_location = redirect_response.headers['location']
36 | 
37 |     # Extract the client_id query parameter from the redirect.
38 |     parsed = urllib.parse.urlparse(redirect_location)
39 |     query_string = urllib.parse.parse_qs(parsed.query)
40 |     print(query_string['client_id'][0])
41 |     # [END composer_get_environment_client_id]
42 | 
43 | 
44 | if __name__ == '__main__':
45 |     parser = argparse.ArgumentParser(
46 |         description=__doc__,
47 |         formatter_class=argparse.RawDescriptionHelpFormatter)
48 |     parser.add_argument('project_id', help='Your Project ID.')
49 |     parser.add_argument(
50 |         'location', help='Region of the Cloud Composer environment.')
51 |     parser.add_argument(
52 |         'composer_environment', help='Name of the Cloud Composer environment.')
53 | 
54 |     args = parser.parse_args()
55 |     get_client_id(
56 |         args.project_id, args.location, args.composer_environment)
--------------------------------------------------------------------------------
/tasks_docker_images/osm_to_features/src/csv_to_json/geojson-csv-to-json.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use strict;
3 | use warnings;
4 | use JSON;
5 | use Text::CSV::Encoded;
6 | use Encode qw( decode );
7 | 
8 | 
9 | #geometry,osm_id,osm_way_id,osm_version,osm_timestamp,all_tags
10 | 
11 | my $csv = Text::CSV::Encoded->new({ sep_char => ',',escape_char => '"',encoding_in => "iso-8859-1" });
12 | my @field_names;
13 | my $i = 1;
14 | while (my $line = <>) {
15 |     chomp $line;
16 |     $line = decode( 'iso-8859-1', $line );
17 |     if ( $i == 1 ) {
18 |         @field_names = split /,/, $line;
19 |     }
20 |     else {
21 |         if ($csv->parse($line)) {
22 |             my @fields = $csv->fields();
23 |             my $geometry = JSON::decode_json($fields[0]);
24 |             my $osm_id = $fields[1];
25 |             my $osm_way_id = $fields[2];
26 |             my $osm_version = $fields[3];
27 |             my $osm_timestamp = $fields[4];
28 |             my $all_tags = $fields[5];
29 |             $all_tags =~ s/""/\\"/g;
30 |             $all_tags =~ s/\r/\\r/gs;
31 |             $all_tags =~ 
s/\t/\\t/gs;
32 |             $all_tags =~ s/\\\\/DOUBLEBACKSLASH/g;
33 |             my @tags = ();
34 |             while ( $all_tags =~ m/\G.*?"(.*?[^\\])"=>"(.*?[^\\])"(,|$)/g ) {
35 |                 my $k = $1;
36 |                 my $v = $2;
37 |                 if ( $v =~ m/\\"$/ ) { warn "MATCHED '\\'"; $v .= '"'; }
38 |                 $k =~ s/DOUBLEBACKSLASH/\\\\/;
39 |                 $v =~ s/DOUBLEBACKSLASH/\\\\/;
40 |                 #warn "$k\t=>\t$v";
41 |                 push @tags, {"key" => $k, "value" => $v};
42 |             }
43 |             my $json_tags = '[' . join(',', map { '{"key":"' . $_->{key} . '","value":"' . $_->{value} . '"}' } @tags) . ']';
44 |             #$all_tags = '[' . join(",",(map{qq({"key":"$_","value":"$at{$_}"})} keys %at)) . ']';
45 |             eval {
46 |                 $json_tags = JSON::encode_json(JSON::decode_json($json_tags));
47 |             };
48 |             if ( $@ ) {
49 |                 print STDERR "failed to JSON encode at line $i: $@, offending data:\n";
50 |                 print STDERR "\torig: $all_tags\n";
51 |                 print STDERR "\tjson: $json_tags\n";
52 |             }
53 |             else {
54 |                 my $genc = JSON::encode_json($geometry);
55 |                 $genc =~ s/"/\\"/g;
56 |                 print sprintf(qq({"geometry":"%s","osm_id":"%s","osm_way_id":"%s","osm_version":%d,"osm_timestamp":"%s","all_tags":%s}\n),
57 |                     $genc, $osm_id, $osm_way_id, $osm_version, $osm_timestamp, $json_tags
58 |                 );
59 |             }
60 |         }
61 |         else {
62 |             print STDERR "failed to parse $i:\t$line\n";
63 |         }
64 |     }
65 |     $i++;
66 | }
67 | 
--------------------------------------------------------------------------------
/dags/utils/gcs_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | 
4 | from google.cloud import storage
5 | 
6 | 
7 | def parse_uri_to_bucket_and_filename(file_path):
8 |     """Splits a file URI into its bucket name and file name"""
9 |     path_parts = file_path.split("//")
10 |     if len(path_parts) >= 2:
11 |         main_part = path_parts[1]
12 |         if "/" in main_part:
13 |             divide_index = main_part.index("/")
14 |             bucket_name = main_part[:divide_index]
15 |             file_name = main_part[divide_index + 1 - len(main_part):]
16 | 
17 |             return bucket_name, file_name
18 |         else:
19 |             raise Exception("Wrong file_path format: {}".format(file_path))
20 |     else:
21 |         raise Exception("Wrong file_path format: {}".format(file_path))
22 | 
23 | 
24 | def from_gcs_to_local_file(src_gcs_bucket, src_gcs_name, local_file_path):
25 |     storage_client = storage.Client(os.environ['PROJECT_ID'])
26 |     # Create a bucket object for our bucket
27 |     bucket = storage_client.get_bucket(src_gcs_bucket)
28 |     # Create a blob object from the filepath
29 |     blob = bucket.blob(src_gcs_name)
30 |     # Download the file to a destination
31 |     logging.info("Downloading gs://{}/{} to {}...".format(src_gcs_bucket, src_gcs_name, local_file_path))
32 |     blob.download_to_filename(local_file_path)
33 |     logging.info("Successfully downloaded gs://{}/{} to {}".format(src_gcs_bucket, src_gcs_name, local_file_path))
34 | 
35 | 
36 | def is_gcs_blob_exists(bucket, blob_name):
37 |     storage_client = storage.Client(os.environ['PROJECT_ID'])
38 |     # Create a bucket object for our bucket
39 |     bucket = storage_client.get_bucket(bucket)
40 |     # Create a blob object from the filepath
41 |     blob = bucket.blob(blob_name)
42 |     return blob.exists()
43 | 
44 | 
45 | def upload_file_to_gcs(filename, destination_bucket_name, destination_blob_name):
46 |     """
47 |     Uploads a file to a given Cloud Storage bucket
48 |     (the destination blob is created or overwritten; nothing is returned).
49 | """ 50 | bucket = storage.Client().bucket(destination_bucket_name) 51 | blob = bucket.blob(destination_blob_name) 52 | logging.info("Uploading of {} to gs://{}/{}...".format(filename, destination_bucket_name, destination_blob_name)) 53 | blob.upload_from_filename( 54 | filename, 55 | content_type="text/plain") 56 | logging.info( 57 | "Finished uploading of {} to gs://{}/{}".format(filename, destination_bucket_name, destination_blob_name)) -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/traffic_barrier/traffic_barrier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=traffic 5 | LAYER=( 6 | "5211:barrier=gate" 7 | "5212:barrier=bollard" 8 | "5213:barrier=lift_gate" 9 | "5214:barrier=stile>stile-barrier" 10 | "5214:highway=stile>stile-highway" 11 | "5215:barrier=cycle_barrier>cycle" 12 | "5216:barrier=fence" 13 | "5217:barrier=toll_booth>toll" 14 | "5218:barrier=block" 15 | "5219:barrier=kissing_gate" 16 | "5220:barrier=cattle_grid" 17 | ) 18 | 19 | for layer in "${LAYER[@]}" 20 | do 21 | CODE="${layer%%:*}" 22 | KVF="${layer##*:}" 23 | K="${KVF%%=*}" 24 | VF="${KVF##*=}" 25 | V="${VF%%>*}" 26 | F="${VF##*>}" 27 | N="${F%%-*}" 28 | NAME_PREFIX=barrier_ 29 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 30 | common_query > "../../sql/$NAME_PREFIX$F.sql" 31 | done 32 | 33 | CODE=5210 34 | V=barrier 35 | N=barrier 36 | F=barrier 37 | NAME_PREFIX="" 38 | EXTRA_CONSTRAINTS=" 39 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K') 40 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='gate') 41 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='bollard') 42 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='lift_gate') 43 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='stile') 44 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'highway' AND tags.value='stile') 45 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='cycle_barrier') 46 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='fence') 47 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='toll_booth') 48 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='block') 49 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='kissing_gate') 50 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='cattle_grid')" 51 | common_query > "../../sql/$F.sql" 52 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_public/poi_public.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_public 5 | LAYER=( 6 | "2001:amenity=police" 7 | "2002:amenity=fire_station" 8 | "2004:amenity=post_box" 9 | "2005:amenity=post_office" 10 | "2006:amenity=telephone" 11 | 
"2007:amenity=library" 12 | "2008:amenity=townhall>town_hall" 13 | "2009:amenity=courthouse" 14 | "2010:amenity=prison" 15 | "2011:amenity=embassy" 16 | "2012:amenity=community_centre" 17 | "2013:amenity=nursing_home" 18 | "2014:amenity=arts_centre" 19 | "2015:amenity=grave_yard>graveyard-amenity" 20 | "2015:landuse=cemetery>graveyard-landuse" 21 | "2016:amenity=marketplace" 22 | "2081:amenity=university" 23 | "2082:amenity=school" 24 | "2083:amenity=kindergarten" 25 | "2084:amenity=college" 26 | "2099:amenity=public_building" 27 | ) 28 | 29 | for layer in "${LAYER[@]}" 30 | do 31 | CODE="${layer%%:*}" 32 | KVF="${layer##*:}" 33 | K="${KVF%%=*}" 34 | VF="${KVF##*=}" 35 | V="${VF%%>*}" 36 | F="${VF##*>}" 37 | N="${F%%-*}" 38 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 39 | common_query > "../../sql/$F.sql" 40 | done 41 | 42 | LAYER=( 43 | "2031:glass=yes>recycling_glass" 44 | "2032:paper=yes>recycling_paper" 45 | "2033:clothes=yes>recycling_clothes" 46 | "2034:metal=yes>recycling_metal" 47 | ) 48 | for layer in "${LAYER[@]}" 49 | do 50 | CODE="${layer%%:*}" 51 | KVF="${layer##*:}" 52 | K="${KVF%%=*}" 53 | VF="${KVF##*=}" 54 | V="${VF%%>*}" 55 | F="${VF##*>}" 56 | N="${F%%-*}" 57 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'recycling:$K' AND tags.value='$V')" 58 | common_query > "../../sql/$F.sql" 59 | done 60 | 61 | CODE=2030 62 | N=recycling 63 | F=recycling 64 | EXTRA_CONSTRAINTS=" 65 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'amenity' AND tags.value='recycling') 66 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'recycling:glass' AND tags.value='yes') 67 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'recycling:paper' AND tags.value='yes') 68 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'recycling:clothes' AND tags.value='yes') 69 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'recycling:scrap_metal' AND tags.value='yes') 70 | " 71 | common_query > "../../sql/$F.sql" 72 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_shopping/poi_shopping.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_shopping 5 | LAYER=( 6 | "2501:shop=supermarket" 7 | "2502:shop=bakery" 8 | "2503:shop=kiosk" 9 | "2504:shop=mall" 10 | "2505:shop=department_store" 11 | "2511:shop=convenience" 12 | "2512:shop=clothes" 13 | "2513:shop=florist" 14 | "2514:shop=chemist" 15 | "2515:shop=books" 16 | "2516:shop=butcher" 17 | "2517:shop=shoes" 18 | "2518:shop=alcohol>beverages-alcohol" 19 | "2518:shop=beverages>beverages-beverages" 20 | "2519:shop=optician" 21 | "2520:shop=jewelry" 22 | "2521:shop=gift" 23 | "2522:shop=sports" 24 | "2523:shop=stationery" 25 | "2524:shop=outdoor" 26 | "2525:shop=mobile_phone" 27 | "2526:shop=toys" 28 | "2527:shop=newsagent" 29 | "2528:shop=greengrocer" 30 | "2529:shop=beauty" 31 | "2530:shop=video" 32 | "2541:shop=car" 33 | "2542:shop=bicycle" 34 | "2543:shop=doityourself>doityourself-doityourself" 35 | "2543:shop=hardware>doityourself-hardware" 36 | "2544:shop=furniture" 37 | "2546:shop=computer" 38 | "2547:shop=garden_centre" 39 | "2561:shop=hairdresser" 40 | "2562:shop=car_repair" 41 | "2563:amenity=car_rental" 42 | 
"2564:amenity=car_wash" 43 | "2565:amenity=car_sharing" 44 | "2566:amenity=bicycle_rental" 45 | "2567:shop=travel_agency" 46 | "2568:shop=laundry>laundry-laundry" 47 | "2568:shop=dry_cleaning>laundry-dry_cleaning" 48 | "2591:vending=cigarettes>vending_cigarette" 49 | "2592:vending=parking_tickets>vending_parking" 50 | ) 51 | 52 | for layer in "${LAYER[@]}" 53 | do 54 | CODE="${layer%%:*}" 55 | KVF="${layer##*:}" 56 | K="${KVF%%=*}" 57 | VF="${KVF##*=}" 58 | V="${VF%%>*}" 59 | F="${VF##*>}" 60 | N="${F%%-*}" 61 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 62 | common_query > "../../sql/$F.sql" 63 | done 64 | 65 | CODE=2590 66 | N=vending_machine 67 | F=vending_machine 68 | EXTRA_CONSTRAINTS=" 69 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='vending_machine') 70 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'vending' AND tags.value='cigarettes') 71 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'vending' AND tags.value='parking_tickets')" 72 | common_query > "../../sql/$F.sql" 73 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/power/power.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=power 5 | LAYER=( 6 | "6411:source=nuclear>station_nuclear" 7 | "6412:source=solar>station_solar-solar" 8 | "6413:source=gas>station_fossil-gas" 9 | "6413:source=coal>station_fossil-coal" 10 | "6413:source=oil>station_fossil-oil" 11 | "6413:source=diesel>station_fossil-diesel" 12 | "6414:source=hydro>station_water-generator" 13 | "6415:source=wind>station_wind-generator" 14 | ) 15 | for layer in "${LAYER[@]}" 16 | do 17 | CODE="${layer%%:*}" 18 | KVF="${layer##*:}" 19 | K="${KVF%%=*}" 20 | VF="${KVF##*=}" 21 | V="${VF%%>*}" 22 | F="${VF##*>}" 23 | N="${F%%-*}" 24 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'generator:$K' AND tags.value='$V')" 25 | common_query > "../../sql/$F.sql" 26 | done 27 | 28 | LAYER=( 29 | "6204:power=pole>pole" 30 | "6401:power=tower>tower" 31 | "6412:power_source=photovoltaic>station_solar-photovoltaic" 32 | "6414:power_source=hydro>station_water-power" 33 | "6415:power_source=wind>station_wind-power" 34 | "6422:power=station>substation-station" 35 | "6422:power=sub_station>substation-sub_station" 36 | "6423:power=transformer>transformer" 37 | ) 38 | for layer in "${LAYER[@]}" 39 | do 40 | CODE="${layer%%:*}" 41 | KVF="${layer##*:}" 42 | K="${KVF%%=*}" 43 | VF="${KVF##*=}" 44 | V="${VF%%>*}" 45 | F="${VF##*>}" 46 | N="${F%%-*}" 47 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 48 | common_query > "../../sql/$F.sql" 49 | done 50 | 51 | 52 | CODE=6410 53 | N=station 54 | F=station 55 | EXTRA_CONSTRAINTS=" 56 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'power' AND tags.value='generator') 57 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE 58 | ( tags.key = 'generator:source' AND tags.value = 'nuclear' ) 59 | OR ( (tags.key = 'generator:source' AND tags.value = 'solar') OR (tags.key = 'power_source' AND tags.value = 'photovoltaic') ) 60 | OR ( tags.key = 'generator:source' AND tags.value IN ('gas','coal','oil','diesel') ) 61 | OR ( (tags.key = 
'generator:source' AND tags.value = 'hydro') OR (tags.key = 'power_source' AND tags.value = 'hydro') ) 62 | OR ( (tags.key = 'generator:source' AND tags.value = 'wind') OR (tags.key = 'power_source' AND tags.value = 'wind') ) 63 | OR ( (tags.key = 'power' AND tags.value = 'station') OR (tags.key = 'power' AND tags.value = 'sub_station') ) 64 | OR ( tags.key = 'power' AND tags.value = 'transformer' ) 65 | )" 66 | common_query > "../../sql/$F.sql" 67 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/gdal/gdal_handler.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import logging 3 | import json 4 | import time 5 | import os 6 | 7 | class GDALHandler(object): 8 | 9 | def __init__(self, script_path, config_path, work_dir): 10 | self.script_path = script_path 11 | self.config_path = config_path 12 | self.work_dir = work_dir 13 | self.type_layers = {"ways": ["lines", "multipolygons"], 14 | "relations": ["multipolygons", "other_relations", "points", "multilinestrings", "lines"]} 15 | 16 | def osm_to_geojson(self, src_osm_filename, entity_type, result_ids): 17 | def geometry_from_geojson_features(geojson_features, feature_index): 18 | return geojson_features[feature_index]["properties"]["geometry"] 19 | try: 20 | file_size = os.path.getsize(src_osm_filename) 21 | except Exception: 22 | file_size = -1 23 | logging.info("Working with {}, size: {}".format(src_osm_filename, str(file_size))) 24 | start = time.time() 25 | 26 | id_geometry_map = {} 27 | layers = self.type_layers[entity_type] 28 | for layer in layers: 29 | temp_geojson_file_name = self.work_dir + "{}.geojson".format(layer) 30 | cmd = "sh {} {} {} {} {}".format(self.script_path, self.config_path, src_osm_filename, 31 | temp_geojson_file_name, layer) 32 | process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) 33 | process.communicate() 34 | 35 | geojson_file = open(temp_geojson_file_name, "r") 36 | geojson_data = json.load(geojson_file) 37 | geojson_file.close() 38 | os.remove(temp_geojson_file_name) 39 | 40 | geojson_features = geojson_data["features"] 41 | if len(geojson_features) > 0: 42 | for index in range(len(geojson_features)): 43 | current_id = geojson_features[index]["properties"]["osm_id"] 44 | if not current_id: 45 | current_id = geojson_features[index]["properties"]["osm_way_id"] 46 | current_id = int(current_id) 47 | if current_id in result_ids: 48 | id_geometry_map[current_id] = geometry_from_geojson_features(geojson_features, index) 49 | result_ids.remove(current_id) 50 | if len(result_ids) == 0: 51 | break 52 | if len(result_ids) == 0: 53 | break 54 | os.remove(src_osm_filename) 55 | logging.info("Finish working with {}. 
Time spent: {}s".format(src_osm_filename, (time.time() - start))) 56 | return id_geometry_map 57 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_tourism/poi_tourism.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_tourism 5 | LAYER=( 6 | "2721:tourism=attraction" 7 | "2722:tourism=museum" 8 | "2723:historic=monument" 9 | "2724:historic=memorial" 10 | "2725:tourism=artwork>art" 11 | "2731:historic=castle" 12 | "2732:historic=ruins" 13 | "2733:historic=archaeological_site>archaeological" 14 | "2734:historic=wayside_cross" 15 | "2735:historic=wayside_shrine" 16 | "2736:historic=battlefield" 17 | "2737:historic=fort" 18 | "2741:tourism=picnic_site" 19 | "2742:tourism=viewpoint" 20 | "2743:tourism=zoo" 21 | "2744:tourism=theme_park" 22 | ) 23 | 24 | for layer in "${LAYER[@]}" 25 | do 26 | CODE="${layer%%:*}" 27 | KVF="${layer##*:}" 28 | K="${KVF%%=*}" 29 | VF="${KVF##*=}" 30 | V="${VF%%>*}" 31 | F="${VF##*>}" 32 | N="${F%%-*}" 33 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 34 | common_query > "../../sql/$F.sql" 35 | done 36 | 37 | CODE=2701 38 | N=tourist_info 39 | F=tourist_info 40 | #2701 41 | EXTRA_CONSTRAINTS=" 42 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tourism' AND tags.value='information') 43 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='map') 44 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='board') 45 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='guidepost')" 46 | common_query > "../../sql/$F.sql" 47 | 48 | CODE=2704 49 | N=tourist_map 50 | F=tourist_map 51 | EXTRA_CONSTRAINTS=" 52 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tourism' AND tags.value='information') 53 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='map')" 54 | common_query > "../../sql/$F.sql" 55 | 56 | CODE=2705 57 | N=tourist_board 58 | F=tourist_board 59 | EXTRA_CONSTRAINTS=" 60 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tourism' AND tags.value='information') 61 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='board')" 62 | common_query > "../../sql/$F.sql" 63 | 64 | CODE=2706 65 | N=tourist_guidepost 66 | F=tourist_guidepost 67 | EXTRA_CONSTRAINTS=" 68 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tourism' AND tags.value='information') 69 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='guidepost')" 70 | common_query > "../../sql/$F.sql" 71 | -------------------------------------------------------------------------------- /examples/clustering/tf_idf/analyze.sql: -------------------------------------------------------------------------------- 1 | -- assign tile to lbcs category 2 | WITH similarities AS (SELECT 3 | grid.geo_id, 4 | MAX(udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec)) as max_similarity 5 | FROM 6 | `gcp-pdp-osm-dev.osm_clustering_grid_01km.vectors_tfidf` tfidf 7 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` grid USING(geo_id) 8 | CROSS JOIN 
`gcp-pdp-osm-dev.lbcs.lbcs_tfidf` lbcs 9 | WHERE grid.city_name = "Madrid" 10 | AND lbcs.dimension = 'Function' 11 | AND udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) > 0 12 | GROUP BY grid.geo_id) 13 | SELECT 14 | grid.geo_id, 15 | grid.geog, 16 | lbcs.name, 17 | lbcs.color, 18 | udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) as similarity 19 | FROM 20 | `gcp-pdp-osm-dev.osm_clustering_grid_01km.vectors_tfidf` tfidf 21 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` grid USING(geo_id) 22 | CROSS JOIN `gcp-pdp-osm-dev.lbcs.lbcs_tfidf` lbcs 23 | JOIN similarities ON similarities.geo_id = tfidf.geo_id AND similarities.max_similarity = udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) 24 | WHERE grid.city_name = "Madrid" 25 | AND lbcs.dimension = 'Function' 26 | ORDER BY similarity DESC 27 | 28 | -- Selects tile terms 29 | WITH objects_with_terms AS (SELECT osm_id, geometry, term 30 | FROM `gcp-pdp-osm-dev.osm_cities.cities_objects` as objects 31 | JOIN UNNEST(SPLIT(CONCAT(layer_class, "_", layer_name), "_")) as term 32 | WHERE objects.city_name = 'Madrid') 33 | , data AS ( 34 | SELECT 35 | grid.geo_id, 36 | objects.term 37 | FROM 38 | objects_with_terms AS objects, 39 | `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` as grid 40 | WHERE ST_INTERSECTS(grid.geog, objects.geometry) 41 | ) 42 | , counts AS (SELECT 43 | geo_id, 44 | term, 45 | COUNT(term) OVER(partition by CONCAT(geo_id, term)) as term_count, 46 | COUNT(term) OVER(partition by geo_id) as terms_in_cell 47 | FROM data) 48 | , tf AS (SELECT geo_id, term, ANY_VALUE(term_count)/ANY_VALUE(terms_in_cell) as tf 49 | FROM counts 50 | GROUP BY geo_id, term) 51 | , term_in_cells AS ( 52 | SELECT term, COUNT(DISTINCT geo_id) in_cells 53 | FROM data 54 | GROUP BY 1 55 | ) 56 | , total_cells AS ( 57 | SELECT COUNT(DISTINCT geo_id) total_cells 58 | FROM data 59 | ) 60 | , idf AS ( 61 | SELECT term, LOG(total_cells.total_cells/in_cells) idf 62 | FROM term_in_cells 63 | CROSS JOIN total_cells 64 | ) 65 | , tf_idf AS ( 66 | SELECT 67 | geo_id, 68 | term, 69 | tf.tf * idf.idf tfidf, 70 | CONCAT(term, ': ', CAST(tf.tf * idf.idf AS STRING)) as term_and_tfidf 71 | FROM tf 72 | JOIN idf 73 | USING(term) 74 | ORDER BY tfidf DESC 75 | ) 76 | SELECT 77 | geo_id, 78 | ANY_VALUE(grid.geog) as geog, 79 | ARRAY_TO_STRING(ARRAY_AGG(term_and_tfidf ORDER BY tfidf DESC), ',
') as terms
80 | FROM tf_idf
81 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` as grid USING(geo_id)
82 | WHERE grid.city_name = "Madrid"
83 | GROUP BY geo_id
--------------------------------------------------------------------------------
/tasks_docker_images/osm_to_features/src/osm2geojsoncsv:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build BQ GeoJSON dataset from OSM dump file
3 | # The driver will categorize features into 5 layers:
4 | 
5 | # points : “node” features that have significant tags attached.
6 | # lines : “way” features that are recognized as non-area.
7 | # multilinestrings : “relation” features that form a multilinestring
8 | #   (type = ‘multilinestring’ or type = ‘route’).
9 | # multipolygons : “relation” features that form a multipolygon
10 | #   (type = ‘multipolygon’ or type = ‘boundary’), and “way” features that
11 | #   are recognized as area.
12 | # other_relations : “relation” features that do not belong to the multilinestrings or multipolygons layers.
13 | # Note: recent GDAL releases do not require the option "OGR_INTERLEAVED_READING=YES"
14 | # Use as:
15 | # time sh osm2geojsoncsv germany-latest.osm.pbf germany-latest lines,multipolygons
16 | set -e
17 | 
18 | # use custom GDAL configuration
19 | OGRCONFIG=osmconf.ini
20 | 
21 | 
22 | if [ "$#" -ne 3 ]
23 | then
24 |     echo "Use as: $0 INPUT_FILENAME_OSM_PBF OUTPUT_BASENAME LAYERS"
25 |     exit 1
26 | fi
27 | 
28 | # input file name
29 | OSMNAME="$1"
30 | # output file basename (without extension)
31 | NAME="$2"
32 | LAYERS="$3"
33 | 
34 | # check input file exists
35 | if [ ! -f "$OSMNAME" ]
36 | then
37 |     echo "Input file '$1' doesn't exist"
38 |     exit 1
39 | fi
40 | # check input file is readable
41 | if [ ! -r "$OSMNAME" ]
42 | then
43 |     echo "Input file '$1' is not readable"
44 |     exit 1
45 | fi
46 | if [ ! 
-s "$OSMNAME" ] 47 | then 48 | echo "Input file '$1' is empty" 49 | exit 1 50 | fi 51 | BASENAME=$(basename "$OSMNAME") 52 | if [ $(basename "$BASENAME" .pbf) = "$BASENAME" ] 53 | then 54 | echo "Input file '$1' is not PBF Format ('Protocolbuffer Binary Format') file" 55 | exit 1 56 | fi 57 | 58 | # the option below can be helpful for some hardware configurations: 59 | # --config OSM_COMPRESS_NODES YES 60 | # GDAL_CACHEMAX and OSM_MAX_TMPFILE_SIZE defined in MB 61 | # for GDAL_CACHEMAX=4000 and OSM_MAX_TMPFILE_SIZE=4000 recommended RAM=60GB 62 | for ogrtype in $(echo $LAYERS | sed "s/,/ /g") 63 | do 64 | if [ "$ogrtype" = "multipolygons" ] 65 | then 66 | osm_fields="osm_id,osm_way_id,osm_version,osm_timestamp" 67 | else 68 | osm_fields="osm_id,NULL AS osm_way_id,osm_version,osm_timestamp" 69 | fi 70 | echo "Processing ${ogrtype} with OSM fields ${osm_fields}" 71 | 72 | ogr2ogr \ 73 | -skipfailures \ 74 | -f CSV \ 75 | "${NAME}-${ogrtype}.geojson.csv" "${OSMNAME}" \ 76 | --config OSM_CONFIG_FILE "${OGRCONFIG}" \ 77 | --config OGR_INTERLEAVED_READING YES \ 78 | --config GDAL_CACHEMAX 20000 \ 79 | --config OSM_MAX_TMPFILE_SIZE 100000 \ 80 | -dialect sqlite \ 81 | -sql "select AsGeoJSON(geometry) AS geometry, ${osm_fields}, replace(all_tags,X'0A','') as all_tags from ${ogrtype} where ST_IsValid(geometry) = 1" \ 82 | --debug on \ 83 | 2>"${NAME}-${ogrtype}.debug.log" 84 | done 85 | echo "Complete" 86 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_nodes_ways_relations/src/osm_dtos.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from datetime import datetime 3 | 4 | from osmium.osm._osm import Node 5 | from osmium.osm._osm import Way 6 | from osmium.osm._osm import Relation 7 | from osmium.osm._osm import OSMObject 8 | from osmium.osm._osm import RelationMember 9 | 10 | @dataclass 11 | class OsmObjectDTO(object): 12 | id: int 13 | version: int 14 | username: str 15 | changeset: int 16 | visible: bool 17 | timestamp: int 18 | tags = [] 19 | 20 | def __init__(self, osm_entity: OSMObject): 21 | self.id = osm_entity.id 22 | self.version = osm_entity.version 23 | self.username = osm_entity.user 24 | self.changeset = osm_entity.changeset 25 | self.visible = osm_entity.visible 26 | self.timestamp = int(datetime.timestamp(osm_entity.timestamp)) 27 | self.tags = [(tag.k, tag.v) for tag in osm_entity.tags] 28 | 29 | def __dict__(self): 30 | tags_dict = [{"key": tag[0], "value": tag[1]} for tag in self.tags] 31 | return {"id": self.id, "version": self.version, "username": self.username, "changeset": self.changeset, 32 | "visible": self.visible, "osm_timestamp": self.timestamp, "all_tags": tags_dict} 33 | 34 | 35 | @dataclass 36 | class NodeDTO(OsmObjectDTO): 37 | latitude: float 38 | longitude: float 39 | 40 | def __init__(self, node_entity: Node): 41 | OsmObjectDTO.__init__(self, node_entity) 42 | self.latitude = node_entity.location.lat 43 | self.longitude = node_entity.location.lon 44 | 45 | def __dict__(self): 46 | dict_repr = super(NodeDTO, self).__dict__() 47 | dict_repr["latitude"] = self.latitude 48 | dict_repr["longitude"] = self.longitude 49 | return dict_repr 50 | 51 | 52 | @dataclass 53 | class WayDTO(OsmObjectDTO): 54 | nodes: list 55 | 56 | def __init__(self, way_entity: Way): 57 | OsmObjectDTO.__init__(self, way_entity) 58 | self.nodes = [node.ref for node in way_entity.nodes] 59 | 60 | def __dict__(self): 61 | dict_repr = super(WayDTO, 
self).__dict__() 62 | dict_repr["nodes"] = [{"id": node} for node in self.nodes] 63 | return dict_repr 64 | 65 | 66 | @dataclass 67 | class RelationDTO(OsmObjectDTO): 68 | members: list 69 | 70 | def __init__(self, relation_entity: Relation): 71 | OsmObjectDTO.__init__(self, relation_entity) 72 | self.members = [RelationMemberDTO(member) for member in iter(relation_entity.members)] 73 | 74 | def __dict__(self): 75 | dict_repr = super(RelationDTO, self).__dict__() 76 | dict_repr["members"] = [member.__dict__() for member in self.members] 77 | return dict_repr 78 | 79 | 80 | @dataclass 81 | class RelationMemberDTO(object): 82 | type: str 83 | id: int 84 | role: str 85 | 86 | def __init__(self, relation_entity: RelationMember): 87 | self.type = relation_entity.type 88 | self.id = relation_entity.ref 89 | self.role = relation_entity.role 90 | 91 | def __dict__(self): 92 | return {"type": self.type, "id": self.id, "role": self.role} -------------------------------------------------------------------------------- /examples/clustering/bq_udf/geohash.js: -------------------------------------------------------------------------------- 1 | // geohash.js 2 | // Geohash library for Javascript 3 | // (c) 2008 David Troy 4 | // Distributed under the MIT License 5 | 6 | BITS = [16, 8, 4, 2, 1]; 7 | 8 | BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz"; 9 | NEIGHBORS = { right : { even : "bc01fg45238967deuvhjyznpkmstqrwx" }, 10 | left : { even : "238967debc01fg45kmstqrwxuvhjyznp" }, 11 | top : { even : "p0r21436x8zb9dcf5h7kjnmqesgutwvy" }, 12 | bottom : { even : "14365h7k9dcfesgujnmqp0r2twvyx8zb" } }; 13 | BORDERS = { right : { even : "bcfguvyz" }, 14 | left : { even : "0145hjnp" }, 15 | top : { even : "prxz" }, 16 | bottom : { even : "028b" } }; 17 | 18 | NEIGHBORS.bottom.odd = NEIGHBORS.left.even; 19 | NEIGHBORS.top.odd = NEIGHBORS.right.even; 20 | NEIGHBORS.left.odd = NEIGHBORS.bottom.even; 21 | NEIGHBORS.right.odd = NEIGHBORS.top.even; 22 | 23 | BORDERS.bottom.odd = BORDERS.left.even; 24 | BORDERS.top.odd = BORDERS.right.even; 25 | BORDERS.left.odd = BORDERS.bottom.even; 26 | BORDERS.right.odd = BORDERS.top.even; 27 | 28 | function refine_interval(interval, cd, mask) { 29 | if (cd&mask) 30 | interval[0] = (interval[0] + interval[1])/2; 31 | else 32 | interval[1] = (interval[0] + interval[1])/2; 33 | } 34 | 35 | function calculateAdjacent(srcHash, dir) { 36 | srcHash = srcHash.toLowerCase(); 37 | var lastChr = srcHash.charAt(srcHash.length-1); 38 | var type = (srcHash.length % 2) ? 
'odd' : 'even';
39 |   var base = srcHash.substring(0,srcHash.length-1);
40 |   if (BORDERS[dir][type].indexOf(lastChr)!=-1)
41 |     base = calculateAdjacent(base, dir);
42 |   return base + BASE32[NEIGHBORS[dir][type].indexOf(lastChr)];
43 | }
44 | 
45 | function decodeGeoHash(geohash) {
46 |   var is_even = 1;
47 |   var lat = []; var lon = [];
48 |   lat[0] = -90.0;  lat[1] = 90.0;
49 |   lon[0] = -180.0; lon[1] = 180.0;
50 |   lat_err = 90.0;  lon_err = 180.0;
51 | 
52 |   for (i=0; i<geohash.length; i++) {
53 |     c = geohash.charAt(i);
54 |     cd = BASE32.indexOf(c);
55 |     for (j=0; j<5; j++) {
56 |       mask = BITS[j];
57 |       if (is_even) {
58 |         lon_err /= 2;
59 |         refine_interval(lon, cd, mask);
60 |       } else {
61 |         lat_err /= 2;
62 |         refine_interval(lat, cd, mask);
63 |       }
64 |       is_even = !is_even;
65 |     }
66 |   }
67 |   lat[2] = (lat[0] + lat[1])/2;
68 |   lon[2] = (lon[0] + lon[1])/2;
69 | 
70 |   return { latitude: lat, longitude: lon};
71 | }
72 | 
73 | function encodeGeoHash(latitude, longitude) {
74 |   var is_even=1;
75 |   var i=0;
76 |   var lat = []; var lon = [];
77 |   var bit=0;
78 |   var ch=0;
79 |   var precision = 12;
80 |   geohash = "";
81 | 
82 |   lat[0] = -90.0;  lat[1] = 90.0;
83 |   lon[0] = -180.0; lon[1] = 180.0;
84 | 
85 |   while (geohash.length < precision) {
86 |     if (is_even) {
87 |       mid = (lon[0] + lon[1]) / 2;
88 |       if (longitude > mid) {
89 |         ch |= BITS[bit];
90 |         lon[0] = mid;
91 |       } else
92 |         lon[1] = mid;
93 |     } else {
94 |       mid = (lat[0] + lat[1]) / 2;
95 |       if (latitude > mid) {
96 |         ch |= BITS[bit];
97 |         lat[0] = mid;
98 |       } else
99 |         lat[1] = mid;
100 |     }
101 | 
102 |     is_even = !is_even;
103 |     if (bit < 4)
104 |       bit++;
105 |     else {
106 |       geohash += BASE32[ch];
107 |       bit = 0;
108 |       ch = 0;
109 |     }
110 |   }
111 |   return geohash;
112 | }
113 | 
--------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/poi_miscpoi/poi_miscpoi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | source ../query_templates.sh
3 | 
4 | CLASS=poi_miscpoi
5 | LAYER=(
6 |   "2901:amenity=toilets>toilet"
7 |   "2902:amenity=bench"
8 |   "2903:amenity=drinking_water"
9 |   "2904:amenity=fountain"
10 |   "2905:amenity=hunting_stand"
11 |   "2906:amenity=waste_basket"
12 |   "2907:man_made=surveillance>camera_surveillance"
13 |   "2923:highway=emergency_access_point>emergency_access"
14 |   "2952:man_made=water_tower"
15 |   "2954:man_made=windmill"
16 |   "2955:man_made=lighthouse"
17 |   "2961:man_made=wastewater_plant"
18 |   "2962:man_made=water_well"
19 |   "2963:man_made=watermill>water_mill"
20 |   "2964:man_made=water_works"
21 | )
22 | 
23 | for layer in "${LAYER[@]}"
24 | do
25 |   CODE="${layer%%:*}"
26 |   KVF="${layer##*:}"
27 |   K="${KVF%%=*}"
28 |   VF="${KVF##*=}"
29 |   V="${VF%%>*}"
30 |   F="${VF##*>}"
31 |   N="${F%%-*}"
32 |   EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')"
33 |   common_query > "../../sql/$F.sql"
34 | done
35 | 
36 | CODE=2950
37 | N=tower
38 | F=tower
39 | EXTRA_CONSTRAINTS="
40 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='tower')
41 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tower:type' AND tags.value='communication')
42 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='water_tower')
43 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tower:type' AND tags.value='observation')
44 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='windmill')
45 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='lighthouse')"
46 | common_query > "../../sql/$F.sql"
47 | 
48 | CODE=2951
49 | N=tower_comms
50 | F=tower_comms
51 | EXTRA_CONSTRAINTS="
52 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='tower')
53 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tower:type' AND tags.value='communication')"
54 | common_query > "../../sql/$F.sql"
55 | 
56 | CODE=2953
57 | N=tower_observation
58 | F=tower_observation
59 | EXTRA_CONSTRAINTS="
60 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='tower')
61 | AND EXISTS(SELECT 1 FROM 
UNNEST(osm.all_tags) as tags WHERE tags.key = 'tower:type' AND tags.value='observation')" 62 | common_query > "../../sql/$F.sql" 63 | 64 | 65 | CODE=2921 66 | N=emergency_phone 67 | F=emergency_phone 68 | EXTRA_CONSTRAINTS=" 69 | AND ( 70 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='emergency_phone') 71 | OR 72 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'emergency' AND tags.value='phone') 73 | )" 74 | common_query > "../../sql/$F.sql" 75 | 76 | CODE=2922 77 | N=fire_hydrant 78 | F=fire_hydrant 79 | EXTRA_CONSTRAINTS=" 80 | AND ( 81 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='fire_hydrant') 82 | OR 83 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'emergency' AND tags.value='fire_hydrant') 84 | )" 85 | common_query > "../../sql/$F.sql" 86 | -------------------------------------------------------------------------------- /triggering/trigger_osm_to_big_query_dg_gcf/main.py: -------------------------------------------------------------------------------- 1 | from google.auth.transport.requests import Request 2 | from google.oauth2 import id_token 3 | import requests 4 | import os 5 | 6 | 7 | IAM_SCOPE = 'https://www.googleapis.com/auth/iam' 8 | OAUTH_TOKEN_URI = 'https://www.googleapis.com/oauth2/v4/token' 9 | 10 | 11 | def trigger_dag(data, context=None): 12 | """Makes a POST request to the Composer DAG Trigger API 13 | When called via Google Cloud Functions (GCF), 14 | data and context are Background function parameters. 15 | For more info, refer to 16 | https://cloud.google.com/functions/docs/writing/background#functions_background_parameters-python 17 | To call this function from a Python script, omit the ``context`` argument 18 | and pass in a non-null value for the ``data`` argument. 19 | """ 20 | 21 | # Fill in with your Composer info here 22 | # Navigate to your webserver's login page and get this from the URL 23 | # Or use the script found at 24 | # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/composer/rest/get_client_id.py 25 | client_id = os.getenv("COMPOSER_CLIENT_ID") 26 | # This should be part of your webserver's URL: 27 | # {tenant-project-id}.appspot.com 28 | webserver_id = os.getenv("COMPOSER_WEBSERVER_ID") 29 | # The name of the DAG you wish to trigger 30 | dag_name = os.getenv("DAG_NAME") 31 | webserver_url = ( 32 | 'https://' 33 | + webserver_id 34 | + '.appspot.com/api/experimental/dags/' 35 | + dag_name 36 | + '/dag_runs' 37 | ) 38 | # Make a POST request to IAP which then Triggers the DAG 39 | make_iap_request( 40 | webserver_url, client_id, method='POST', json={"conf": data}) 41 | 42 | 43 | # This code is copied from 44 | # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/iap/make_iap_request.py 45 | # START COPIED IAP CODE 46 | def make_iap_request(url, client_id, method='GET', **kwargs): 47 | """Makes a request to an application protected by Identity-Aware Proxy. 48 | Args: 49 | url: The Identity-Aware Proxy-protected URL to fetch. 50 | client_id: The client ID used by Identity-Aware Proxy. 51 | method: The request method to use 52 | ('GET', 'OPTIONS', 'HEAD', 'POST', 'PUT', 'PATCH', 'DELETE') 53 | **kwargs: Any of the parameters defined for the request function: 54 | https://github.com/requests/requests/blob/master/requests/api.py 55 | If no timeout is provided, it is set to 90 by default.
56 | Returns: 57 | The page body, or raises an exception if the page couldn't be retrieved. 58 | """ 59 | # Set the default timeout, if missing 60 | if 'timeout' not in kwargs: 61 | kwargs['timeout'] = 90 62 | 63 | # Obtain an OpenID Connect (OIDC) token from metadata server or using service 64 | # account. 65 | google_open_id_connect_token = id_token.fetch_id_token(Request(), client_id) 66 | 67 | # Fetch the Identity-Aware Proxy-protected URL, including an 68 | # Authorization header containing "Bearer " followed by a 69 | # Google-issued OpenID Connect token for the service account. 70 | resp = requests.request( 71 | method, url, 72 | headers={'Authorization': 'Bearer {}'.format( 73 | google_open_id_connect_token)}, **kwargs) 74 | if resp.status_code == 403: 75 | raise Exception('Service account does not have permission to ' 76 | 'access the IAP-protected application.') 77 | elif resp.status_code != 200: 78 | raise Exception( 79 | 'Bad response from application: {!r} / {!r} / {!r}'.format( 80 | resp.status_code, resp.headers, resp.text)) 81 | else: 82 | return resp.text -------------------------------------------------------------------------------- /deployment/config/generate_config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import json 4 | 5 | if __name__ == '__main__': 6 | logging.getLogger().setLevel(logging.INFO) 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('config_file', help='Config file to save parameters') 10 | 11 | parser.add_argument('--project_id', help='Your Project ID.', required=True) 12 | 13 | parser.add_argument('--zone', help='Location zone', 14 | required=True) 15 | 16 | parser.add_argument('--osm_url', help='URL of the source OSM file', required=True) 17 | parser.add_argument('--osm_md5_url', help='URL of the source OSM file\'s MD5 hash', required=True) 18 | 19 | parser.add_argument('--gcs_transfer_bucket', help='GCS bucket to make transferring source file to project\'s GCS', 20 | required=True) 21 | parser.add_argument('--transfer_index_files_gcs_uri', help='GCS URI to Storage Transfer index file', 22 | required=True) 23 | 24 | parser.add_argument('--gcs_work_bucket', help='GCS bucket to save intermediate results', required=True) 25 | 26 | parser.add_argument('--osm_to_features_image', help='osm_to_features image name', required=True) 27 | parser.add_argument('--osm_to_nodes_ways_relations_image', help='osm_to_nodes_ways_relations image name', 28 | required=True) 29 | parser.add_argument('--generate_layers_image', help='generate_layers image name', required=True) 30 | parser.add_argument('--osm_converter_with_history_index_image', 31 | help='osm_converter_with_history_index_image image name', required=True) 32 | 33 | parser.add_argument('--gke_main_cluster_name', help='Name of the main GKE cluster', 34 | required=True) 35 | parser.add_argument('--addt_sn_gke_pool', help='GKE pool name for additional operations (single node pool)', 36 | required=True) 37 | parser.add_argument('--addt_sn_pool_machine_type', 38 | help='Machine type for additional operations GKE pool (single node pool)', 39 | required=True) 40 | parser.add_argument('--addt_sn_pool_disk_size', 41 | help='Disk size for additional operations GKE pool (single node pool)', 42 | required=True) 43 | parser.add_argument('--addt_sn_pool_num_nodes', 44 | help='Number of nodes for additional operations GKE pool (single node pool)', 45 | required=True) 46 | parser.add_argument('--addt_sn_pool_max_num_treads', 
help='Maximum number of threads for addt_sn_gke_pool', 47 | required=True) 48 | 49 | parser.add_argument('--addt_mn_gke_pool', help='GKE pool name for additional operations (multiple nodes pool)', 50 | required=True) 51 | parser.add_argument('--addt_mn_pool_machine_type', 52 | help='Machine type for additional operations GKE pool (multiple nodes pool)', 53 | required=True) 54 | parser.add_argument('--addt_mn_pool_disk_size', 55 | help='Disk size for additional operations GKE pool (multiple nodes pool)', 56 | required=True) 57 | parser.add_argument('--addt_mn_pool_num_nodes', 58 | help='Number of nodes for additional operations GKE pool (multiple nodes pool)', 59 | required=True) 60 | parser.add_argument('--addt_mn_pod_requested_memory', help='addt_mn GKE POD requested memory', required=True) 61 | 62 | parser.add_argument('--bq_dataset_to_export', help='BigQuery dataset name to export results', required=True) 63 | 64 | args = parser.parse_args() 65 | args_filtered = {} 66 | for k, v in vars(args).items(): 67 | if v: 68 | print(v) 69 | args_filtered[k] = v 70 | 71 | with open(args.config_file, 'w') as fp: 72 | json.dump(args_filtered, fp, indent=4) 73 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/place/place.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=place 5 | LAYER=( 6 | "1001:place=city" 7 | "1002:place=town" 8 | "1003:place=village" 9 | "1004:place=hamlet" 10 | "1010:place=suburb" 11 | "1020:place=island" 12 | "1030:place=farm" 13 | "1031:place=isolated_dwelling>dwelling" 14 | "1040:place=region" 15 | "1041:place=county" 16 | "1050:place=locality" 17 | ) 18 | 19 | for layer in "${LAYER[@]}" 20 | do 21 | CODE="${layer%%:*}" 22 | KVF="${layer##*:}" 23 | K="${KVF%%=*}" 24 | VF="${KVF##*=}" 25 | V="${VF%%>*}" 26 | F="${VF##*>}" 27 | N="${F%%-*}" 28 | 29 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 30 | common_query > "../../sql/$F.sql" 31 | done 32 | 33 | #1005 34 | CODE=1005 35 | N=national_capital 36 | F=national_capital 37 | EXTRA_CONSTRAINTS=" 38 | AND ( 39 | ( 40 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 41 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'is_capital' AND tags.value='country') 42 | ) 43 | OR 44 | ( 45 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 46 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'admin_level' AND tags.value = '2') 47 | ) 48 | OR 49 | ( 50 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 51 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'capital' AND tags.value='yes') AND 52 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'admin_level') 53 | ) 54 | ) 55 | " 56 | common_query > "../../sql/$F.sql" 57 | 58 | CODE=1099 59 | N=named_place 60 | F=named_place 61 | EXTRA_CONSTRAINTS=" 62 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'area' AND tags.value='yes') 63 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') 64 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='town') 65 | AND NOT EXISTS(SELECT 1 FROM 
UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='village') 66 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='hamlet') 67 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='suburb') 68 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='island') 69 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='farm') 70 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='isolated_dwelling') 71 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='region') 72 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='county') 73 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='locality') 74 | AND ( 75 | ( 76 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 77 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'is_capital' AND tags.value='country') 78 | ) 79 | OR 80 | ( 81 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 82 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'admin_level' AND tags.value = '2') 83 | ) 84 | OR ( 85 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 86 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'capital' AND tags.value='yes') AND 87 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'admin_level') 88 | ) 89 | ) 90 | " 91 | common_query > "../../sql/$F.sql" 92 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/elements_transformer.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import osmium 3 | 4 | def osm_timestamp_from_osm_entity(osm_entity): 5 | return int(datetime.timestamp(osm_entity.timestamp)) 6 | 7 | def osm_obj_to_dict(osm_entity, geometry, is_simplified, with_uid, tags_to_bq, osm_timestamp): 8 | base_dict = { 9 | "id": osm_entity.id, 10 | "version": osm_entity.version, 11 | "osm_timestamp": osm_timestamp if osm_timestamp else osm_timestamp_from_osm_entity(osm_entity) 12 | } 13 | if not is_simplified: 14 | base_dict["username"] = osm_entity.user 15 | base_dict["changeset"] = osm_entity.changeset 16 | base_dict["visible"] = osm_entity.visible 17 | base_dict["geometry"] = geometry 18 | base_dict["all_tags"] = [{"key": tag.k, "value": tag.v} for tag in osm_entity.tags] \ 19 | if tags_to_bq else [(tag.k, tag.v) for tag in osm_entity.tags] 20 | if with_uid: 21 | base_dict["uid"] = osm_entity.uid 22 | return base_dict 23 | 24 | 25 | def osm_entity_node_dict(osm_node_entity, 26 | geometry=None, 27 | is_simplified=False, 28 | with_uid=False, 29 | tags_to_bq=True, 30 | osm_timestamp=None): 31 | base_dict = osm_obj_to_dict(osm_node_entity, geometry, is_simplified, with_uid, tags_to_bq, osm_timestamp) 32 | if osm_node_entity.location.valid(): 33 | base_dict.update({ 34 | "latitude": osm_node_entity.location.lat, 35 | "longitude": osm_node_entity.location.lon 36 | }) 37 | return base_dict 38 | 39 | 40 | def osm_entity_way_dict(osm_way_entity, 
geometry=None, is_simplified=False, with_uid=False, tags_to_bq=True, 41 | osm_timestamp=None): 42 | base_dict = osm_obj_to_dict(osm_way_entity, geometry, is_simplified, with_uid, tags_to_bq, osm_timestamp) 43 | base_dict["nodes"] = [node.ref for node in osm_way_entity.nodes] 44 | return base_dict 45 | 46 | 47 | def osm_entity_relation_dict(osm_relation_entity, geometry=None, is_simplified=False, with_uid=False, tags_to_bq=True, 48 | osm_timestamp=None): 49 | base_dict = osm_obj_to_dict(osm_relation_entity, geometry, is_simplified, with_uid, tags_to_bq, osm_timestamp) 50 | base_dict["members"] = [(member.type, member.ref, member.role) for member in iter(osm_relation_entity.members)] 51 | return base_dict 52 | 53 | 54 | def get_osm_obj_from_dict(obj_dict): 55 | return osmium.osm.mutable.OSMObject(id=obj_dict["id"], 56 | version=obj_dict["version"], 57 | visible=obj_dict["visible"] if "visible" in obj_dict else None, 58 | changeset=obj_dict["changeset"] if "changeset" in obj_dict else None, 59 | timestamp=datetime.fromtimestamp(obj_dict["osm_timestamp"]), 60 | uid=obj_dict["uid"] if "uid" in obj_dict else None, 61 | tags=obj_dict["all_tags"] if "all_tags" in obj_dict else None 62 | ) 63 | 64 | 65 | def get_osm_node_from_dict(node_dict): 66 | lon = node_dict["longitude"] 67 | lat = node_dict["latitude"] 68 | location_tuple = (lon, lat) if lon is not None and lat is not None else None 69 | return osmium.osm.mutable.Node(get_osm_obj_from_dict(node_dict), location_tuple) 70 | 71 | 72 | def get_osm_way_from_dict(way_dict): 73 | return osmium.osm.mutable.Way(get_osm_obj_from_dict(way_dict), way_dict["nodes"]) 74 | 75 | 76 | def get_osm_relation_from_dict(relation_dict): 77 | return osmium.osm.mutable.Relation(get_osm_obj_from_dict(relation_dict), relation_dict["members"]) 78 | 79 | 80 | def edit_osm_obj_dict_according_to_bq_schema(obj_dict): 81 | obj_dict["all_tags"] = [{"key": tag_key, "value": tag_value} for tag_key, tag_value in obj_dict["all_tags"]] 82 | return obj_dict 83 | 84 | 85 | def edit_node_dict_according_to_bq_schema(node_dict): 86 | return edit_osm_obj_dict_according_to_bq_schema(node_dict) 87 | 88 | 89 | def edit_way_dict_according_to_bq_schema(way_dict): 90 | way_dict = edit_osm_obj_dict_according_to_bq_schema(way_dict) 91 | way_dict["nodes"] = [{"id": node_id} for node_id in way_dict["nodes"]] 92 | return way_dict 93 | 94 | 95 | def edit_relation_dict_according_to_bq_schema(relation_dict): 96 | relation_dict = edit_osm_obj_dict_according_to_bq_schema(relation_dict) 97 | relation_dict["members"] = [{"type": member_type, "id": member_ref, "role": member_role} 98 | for member_type, member_ref, member_role in relation_dict["members"]] 99 | return relation_dict 100 | 101 | 102 | def is_node_dict_with_location(node_dict): 103 | return node_dict["longitude"] is not None and node_dict["latitude"] is not None 104 | 105 | 106 | def get_way_nodes(way_dict): 107 | return way_dict["nodes"] 108 | 109 | 110 | def get_relation_members(relation_dict): 111 | return relation_dict["members"] 112 | -------------------------------------------------------------------------------- /examples/clustering/cities/README.md: -------------------------------------------------------------------------------- 1 | # Cities 2 | 3 | ## List of Cities 4 | 5 | The `cities.csv` file contains a list of cities from the [Globalization and World Cities Research Network](https://en.wikipedia.org/wiki/Globalization_and_World_Cities_Research_Network) wiki page.
6 | Each city row contains the lat/long and radius of a manually defined circle that approximately covers the city's infrastructure and agglomeration. 7 | 8 | The `query.py` script can be used to transform the CSV into SQL with the cities data. 9 | In our example, the result of the query is saved into the `osm_cities.cities` table using the BigQuery console. 10 | 11 | ## OSM Objects within cities 12 | 13 | Query to select all objects within a city's circle area. 14 | 15 | ``` 16 | SELECT 17 | cities.city_name, 18 | planet.* 19 | FROM 20 | `bigquery-public-data.geo_openstreetmap.planet_layers` as planet, `gcp-pdp-osm-dev.osm_cities.cities` as cities 21 | WHERE ST_DWITHIN(cities.center, planet.geometry, cities.radius) 22 | ``` 23 | 24 | Result is saved in `osm_cities.cities_objects` in order to reduce scanning overhead in the next stages of analysis. 25 | 26 | ## Population grid within cities 27 | 28 | ### 1km resolution 29 | 30 | Query to select [Worldpop](https://www.worldpop.org/) population grid cells within a city's circle area. 31 | ``` 32 | SELECT 33 | cities.city_name, 34 | grid.* 35 | FROM `bigquery-public-data.worldpop.population_grid_1km` AS grid, 36 | gcp-pdp-osm-dev.osm_cities.cities AS cities 37 | WHERE last_updated = '2020-01-01' 38 | AND ST_DWITHIN(cities.center, grid.geog, cities.radius) 39 | ``` 40 | 41 | Result is saved in `osm_cities.cities_population_grid_1km` in order to reduce scanning overhead in the next stages of analysis. 42 | 43 | ### 0.5km resolution 44 | 45 | Query to divide the 1km resolution grid into quadrants (a Python sketch of the subdivision arithmetic follows this section) 46 | ``` 47 | WITH divided_grid AS (SELECT 48 | long1 + x*(long2 - long1)/2 as long1, 49 | lat1 + y*(lat2 - lat1)/2 as lat1, 50 | long1 + (x + 1)*(long2 - long1)/2 as long2, 51 | lat1 + (y + 1)*(lat2 - lat1)/2 as lat2, 52 | city_name, 53 | country_name, 54 | geo_id, 55 | population, 56 | alpha_3_code, 57 | last_updated 58 | FROM ( 59 | WITH quadrants AS 60 | (SELECT 0 as x, 0 as y UNION ALL 61 | SELECT 1, 0 UNION ALL 62 | SELECT 0, 1 UNION ALL 63 | SELECT 1, 1) 64 | SELECT 65 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][0][0]') AS FLOAT64) as long1, 66 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][0][1]') AS FLOAT64) as lat1, 67 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][2][0]') AS FLOAT64) as long2, 68 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][2][1]') AS FLOAT64) as lat2, 69 | quadrants.x, 70 | quadrants.y, 71 | city_name, 72 | country_name, 73 | CONCAT(geo_id,x,y) as geo_id, 74 | population/4 as population, 75 | alpha_3_code, 76 | last_updated 77 | FROM `osm_cities.cities_population_grid_1km` 78 | CROSS JOIN quadrants 79 | )) 80 | SELECT 81 | city_name, 82 | country_name, 83 | geo_id, 84 | population, 85 | (long1 + long2) / 2 as longitude_centroid, 86 | (lat1 + lat2) / 2 as latitude_centroid, 87 | alpha_3_code, 88 | ST_MAKEPOLYGON(ST_MAKELINE([ 89 | ST_MAKELINE(ST_GEOGPOINT(long1, lat1), ST_GEOGPOINT(long1, lat2)), 90 | ST_MAKELINE(ST_GEOGPOINT(long1, lat2), ST_GEOGPOINT(long2, lat2)), 91 | ST_MAKELINE(ST_GEOGPOINT(long2, lat2), ST_GEOGPOINT(long2, lat1)), 92 | ST_MAKELINE(ST_GEOGPOINT(long2, lat1), ST_GEOGPOINT(long1, lat1)) 93 | ])) as geog, 94 | last_updated 95 | FROM divided_grid 96 | ``` 97 | Result is saved in `osm_cities.cities_population_grid_05km`.
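For reference, the quadrant arithmetic used by the query above can be expressed in a few lines of Python. This is a minimal illustrative sketch only (the `split_cell` helper is hypothetical and not part of this repository):

```
# Hypothetical sketch of the grid-subdivision arithmetic from the SQL above.
# A cell with corners (long1, lat1) and (long2, lat2) is split into four
# quadrants; each quadrant inherits a quarter of the parent's population.
def split_cell(long1, lat1, long2, lat2, population):
    for x in (0, 1):
        for y in (0, 1):
            yield (
                long1 + x * (long2 - long1) / 2,        # quadrant long1
                lat1 + y * (lat2 - lat1) / 2,           # quadrant lat1
                long1 + (x + 1) * (long2 - long1) / 2,  # quadrant long2
                lat1 + (y + 1) * (lat2 - lat1) / 2,     # quadrant lat2
                population / 4,
            )

# One 1km cell becomes four 0.5km cells:
for quadrant in split_cell(0.00, 0.00, 0.01, 0.01, 400):
    print(quadrant)
```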
98 | 99 | ### 0.25km resolution 100 | 101 | Query to divide the 0.5km resolution grid in the same way 102 | ``` 103 | WITH divided_grid AS (SELECT 104 | long1 + x*(long2 - long1)/2 as long1, 105 | lat1 + y*(lat2 - lat1)/2 as lat1, 106 | long1 + (x + 1)*(long2 - long1)/2 as long2, 107 | lat1 + (y + 1)*(lat2 - lat1)/2 as lat2, 108 | city_name, 109 | country_name, 110 | geo_id, 111 | population, 112 | alpha_3_code, 113 | last_updated 114 | FROM ( 115 | WITH quadrants AS 116 | (SELECT 0 as x, 0 as y UNION ALL 117 | SELECT 1, 0 UNION ALL 118 | SELECT 0, 1 UNION ALL 119 | SELECT 1, 1) 120 | SELECT 121 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][0][0]') AS FLOAT64) as long1, 122 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][0][1]') AS FLOAT64) as lat1, 123 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][2][0]') AS FLOAT64) as long2, 124 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][2][1]') AS FLOAT64) as lat2, 125 | quadrants.x, 126 | quadrants.y, 127 | city_name, 128 | country_name, 129 | CONCAT(geo_id,x,y) as geo_id, 130 | population/4 as population, 131 | alpha_3_code, 132 | last_updated 133 | FROM `osm_cities.cities_population_grid_05km` 134 | CROSS JOIN quadrants 135 | )) 136 | SELECT 137 | city_name, 138 | country_name, 139 | geo_id, 140 | population, 141 | (long1 + long2) / 2 as longitude_centroid, 142 | (lat1 + lat2) / 2 as latitude_centroid, 143 | alpha_3_code, 144 | ST_MAKEPOLYGON(ST_MAKELINE([ 145 | ST_MAKELINE(ST_GEOGPOINT(long1, lat1), ST_GEOGPOINT(long1, lat2)), 146 | ST_MAKELINE(ST_GEOGPOINT(long1, lat2), ST_GEOGPOINT(long2, lat2)), 147 | ST_MAKELINE(ST_GEOGPOINT(long2, lat2), ST_GEOGPOINT(long2, lat1)), 148 | ST_MAKELINE(ST_GEOGPOINT(long2, lat1), ST_GEOGPOINT(long1, lat1)) 149 | ])) as geog, 150 | last_updated 151 | FROM divided_grid 152 | ``` 153 | Result is saved in `osm_cities.cities_population_grid_025km`.
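The queries above were materialized manually from the BigQuery console, but the same materialization can be scripted. A minimal sketch with the `google-cloud-bigquery` client, assuming a placeholder project name and a local file holding one of the queries:

```
# Sketch: run one of the grid-division queries and write the result to a
# destination table instead of saving it manually from the BigQuery console.
# "your-project" and "divide_grid.sql" are placeholders.
from google.cloud import bigquery

client = bigquery.Client(project="your-project")
job_config = bigquery.QueryJobConfig(
    destination="your-project.osm_cities.cities_population_grid_025km",
    write_disposition="WRITE_TRUNCATE",  # overwrite the table on re-runs
)

with open("divide_grid.sql") as sql_file:
    sql = sql_file.read()

client.query(sql, job_config=job_config).result()  # blocks until the table is written
```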
154 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/parser.py: -------------------------------------------------------------------------------- 1 | import osmium 2 | import logging 3 | import time 4 | 5 | import elements_transformer 6 | import elements_processing 7 | 8 | from xml.sax import handler 9 | import psutil 10 | 11 | 12 | def to_mb(bytes_num): 13 | return int(bytes_num / (1024 * 1024)) 14 | 15 | 16 | class OsmParser(osmium.SimpleHandler): 17 | 18 | def __init__(self, processing_counter, logging_range_count, pool_size=1, pool_index=0): 19 | osmium.SimpleHandler.__init__(self) 20 | 21 | self.processing_counter = processing_counter 22 | self.last_log_time = time.time() 23 | self.logging_range_count = logging_range_count 24 | self.current_entity_type = "" 25 | self.pool_index = pool_index 26 | self.pool_size = pool_size 27 | 28 | def current_pool_index(self): 29 | return self.processing_counter[self.current_entity_type] % self.pool_size 30 | 31 | def is_item_index_for_current_pool_index(self): 32 | return self.current_pool_index() == self.pool_index 33 | 34 | def log_processing(self): 35 | self.processing_counter[self.current_entity_type] = self.processing_counter[self.current_entity_type] + 1 36 | if self.processing_counter[self.current_entity_type] % self.logging_range_count == 0: 37 | virtual_memory = psutil.virtual_memory() 38 | logging.info(self.current_entity_type + " ({}/{}) ".format(self.pool_index + 1, self.pool_size) 39 | + str(self.processing_counter[self.current_entity_type]) 40 | + " " + str(time.time() - self.last_log_time) 41 | + " Memory: usage {}, free {} MB, used {} MB" 42 | .format(virtual_memory.percent, to_mb(virtual_memory.free), to_mb(virtual_memory.used))) 43 | self.last_log_time = time.time() 44 | 45 | def node(self, node): 46 | self.current_entity_type = "nodes" 47 | self.log_processing() 48 | 49 | def way(self, way): 50 | self.current_entity_type = "ways" 51 | self.log_processing() 52 | 53 | def relation(self, relation): 54 | self.current_entity_type = "relations" 55 | self.log_processing() 56 | 57 | 58 | class IndexCreatorWithXmlParser(handler.ContentHandler): 59 | 60 | def __init__(self, osm_indexer_map, 61 | processing_counter, num_shards, 62 | is_id_hash_partitioned_shards, 63 | pool_size=1, pool_index=0, 64 | batch_size_to_commit=1000000, 65 | logging_range_count=1000000): 66 | handler.ContentHandler.__init__(self) 67 | self.processing_counter = processing_counter 68 | self.last_log_time = time.time() 69 | self.logging_range_count = logging_range_count 70 | self.current_entity_type = "" 71 | self.pool_index = pool_index 72 | self.pool_size = pool_size 73 | 74 | self.xml_hierarchy = [] 75 | self.current_obj = {} 76 | 77 | self.xml_entity_map = {"node": "nodes", "way": "ways", "relation": "relations"} 78 | 79 | self.osm_indexer_map = osm_indexer_map 80 | self.num_shards = num_shards 81 | self.batch_size_to_commit = batch_size_to_commit 82 | self.is_id_hash_partitioned_shards = is_id_hash_partitioned_shards 83 | 84 | self.current_indexer = None 85 | 86 | def log_processing(self): 87 | self.processing_counter[self.current_entity_type] = self.processing_counter[self.current_entity_type] + 1 88 | if self.processing_counter[self.current_entity_type] % self.logging_range_count == 0: 89 | logging.info(self.current_entity_type + " ({}/{}) ".format(self.pool_index + 1, self.pool_size) 90 | + str(self.processing_counter[self.current_entity_type]) 91 | + " " + 
str(time.time() - self.last_log_time)) 92 | self.last_log_time = time.time() 93 | 94 | def startDocument(self): 95 | pass 96 | 97 | def get_current_xml_hierarchy_level(self): 98 | return self.xml_hierarchy[len(self.xml_hierarchy) - 1] 99 | 100 | def process_element(self, name, attributes): 101 | if name == "node": 102 | self.current_obj = elements_transformer.osm_entity_node_dict(attributes, 103 | is_simplified=True, 104 | is_xml_attributes=True) 105 | 106 | def startElement(self, name, attributes): 107 | self.xml_hierarchy.append(name) 108 | 109 | if name in self.xml_entity_map: 110 | self.current_entity_type = self.xml_entity_map[name] 111 | self.log_processing() 112 | 113 | if not self.is_id_hash_partitioned_shards: 114 | batch_index = self.processing_counter[self.current_entity_type] % self.num_shards 115 | else: 116 | obj_id = attributes["id"] 117 | batch_index = elements_processing.get_uniformly_shard_index_from_id(obj_id, self.num_shards) 118 | if batch_index in self.osm_indexer_map: 119 | self.process_element(name, attributes) 120 | self.current_indexer = self.osm_indexer_map[batch_index] 121 | else: 122 | self.current_indexer = None 123 | 124 | def endElement(self, name, *args): 125 | if name == "node" and self.current_indexer: 126 | self.on_node_element(self.current_obj) 127 | self.current_indexer = None 128 | del self.xml_hierarchy[-1] 129 | 130 | def characters(self, data): 131 | pass 132 | 133 | def on_node_element(self, node_dict): 134 | pass 135 | 136 | def on_way_element(self, way_dict): 137 | pass 138 | 139 | def on_relation_element(self, relation_dict): 140 | pass 141 | -------------------------------------------------------------------------------- /deployment/create_full.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 1. Read input parameters 4 | OSM_URL="$1" 5 | OSM_MD5_URL="$2" 6 | REGION_LOCATION="$3" 7 | ZONE="$4" 8 | SUFFIX="$5" 9 | 10 | BASE_COMPOSER_CLUSTER_MACHINE_TYPE="$6" 11 | BASE_COMPOSER_CLUSTER_NODES="$7" 12 | 13 | ADDT_SN_CORES="$8" 14 | ADDT_SN_DISK_SIZE="$9" 15 | 16 | ADDT_MN_CORES="${10}" 17 | ADDT_MN_DISK_SIZE="${11}" 18 | ADDT_MN_NODES="${12}" 19 | 20 | MODE="${13}" 21 | 22 | # 2. Print all parameters 23 | for PARAM in "$@"; do 24 | echo "$PARAM" 25 | done 26 | 27 | # 3. Retrieve PROJECT_ID 28 | PROJECT_ID=`gcloud config get-value project` 29 | 30 | # 4. Create GCS buckets 31 | TRANSFER_BUCKET_NAME=${PROJECT_ID}-transfer-${SUFFIX} 32 | gsutil mb gs://${TRANSFER_BUCKET_NAME}/ 33 | 34 | WORK_BUCKET_NAME=${PROJECT_ID}-work-${SUFFIX} 35 | gsutil mb gs://${WORK_BUCKET_NAME}/ 36 | 37 | # 5. Create BigQuery dataset 38 | BQ_DATASET_SHORT=osm_to_bq_${SUFFIX} 39 | BQ_DATASET=${PROJECT_ID}.${BQ_DATASET_SHORT} 40 | bq mk ${PROJECT_ID}:${BQ_DATASET_SHORT} 41 | #TODO temp 42 | #BQ_DATASET=bigquery-public-data.geo_openstreetmap 43 | 44 | # 6.
Build and push to Container Registry Docker containers 45 | IMAGE_HOSTNAME=gcr.io 46 | 47 | GENERATE_LAYERS_IMAGE=$IMAGE_HOSTNAME/$PROJECT_ID/generate_layers_${SUFFIX} 48 | docker build -t $GENERATE_LAYERS_IMAGE tasks_docker_images/generate_layers/ 49 | docker push $GENERATE_LAYERS_IMAGE 50 | 51 | if [ "$MODE" = "planet" ] 52 | then 53 | OSM_TO_FEATURES_IMAGE=$IMAGE_HOSTNAME/$PROJECT_ID/osm_to_features_${SUFFIX} 54 | docker build -t $OSM_TO_FEATURES_IMAGE tasks_docker_images/osm_to_features/ 55 | docker push $OSM_TO_FEATURES_IMAGE 56 | 57 | OSM_TO_NODES_WAYS_RELATIONS_IMAGE=$IMAGE_HOSTNAME/$PROJECT_ID/osm_to_nodes_ways_relations_${SUFFIX} 58 | docker build -t $OSM_TO_NODES_WAYS_RELATIONS_IMAGE tasks_docker_images/osm_to_nodes_ways_relations/ 59 | docker push $OSM_TO_NODES_WAYS_RELATIONS_IMAGE 60 | else 61 | OSM_CONVERTER_WITH_HISTORY_INDEX_IMAGE=$IMAGE_HOSTNAME/$PROJECT_ID/osm_converter_with_history_index_${SUFFIX} 62 | docker build -t $OSM_CONVERTER_WITH_HISTORY_INDEX_IMAGE tasks_docker_images/osm_converter_with_history_index/ 63 | docker push $OSM_CONVERTER_WITH_HISTORY_INDEX_IMAGE 64 | fi 65 | 66 | # 7. Create Cloud Composer environment 67 | COMPOSER_ENV_NAME=osm-to-bq-${SUFFIX} 68 | gcloud composer environments create $COMPOSER_ENV_NAME \ 69 | --location $REGION_LOCATION \ 70 | --zone $ZONE \ 71 | --node-count $BASE_COMPOSER_CLUSTER_NODES \ 72 | --machine-type $BASE_COMPOSER_CLUSTER_MACHINE_TYPE \ 73 | --airflow-configs=broker_transport_options-visibility_timeout=2592000 74 | 75 | # 8. Retrieve Cloud Composer environment's params 76 | GKE_CLUSTER_FULL_NAME=$(gcloud composer environments describe $COMPOSER_ENV_NAME \ 77 | --location $REGION_LOCATION --format json | jq -r '.config.gkeCluster') 78 | GKE_CLUSTER_NAME=$(echo $GKE_CLUSTER_FULL_NAME | awk -F/ '{print $6}') 79 | 80 | # 9. Define additional Kubernetes clusters parameters 81 | ADDT_SN_POOL_NUM_CORES=$ADDT_SN_CORES 82 | ADDT_SN_POOL_DISK_SIZE=$ADDT_SN_DISK_SIZE 83 | ADDT_SN_POOL_NAME=osm-addt-sn-pool-${SUFFIX} 84 | ADDT_SN_POOL_MACHINE_TYPE=n1-highmem-$ADDT_SN_POOL_NUM_CORES 85 | ADDT_SN_POOL_NUM_NODES=1 86 | ADDT_SN_POOL_MAX_NUM_TREADS=$((ADDT_SN_POOL_NUM_CORES/4)) 87 | 88 | 89 | ADDT_MN_POOL_NUM_CORES=$ADDT_MN_CORES 90 | ADDT_MN_POOL_DISK_SIZE=$ADDT_MN_DISK_SIZE 91 | ADDT_MN_POOL_NAME=osm-addt-mn-pool-${SUFFIX} 92 | ADDT_MN_POOL_MACHINE_TYPE=n1-highmem-$ADDT_MN_POOL_NUM_CORES 93 | ADDT_MN_POOL_NUM_NODES=$ADDT_MN_NODES 94 | ADDT_MN_POD_REQUESTED_MEMORY=$((ADDT_MN_POOL_NUM_CORES*4))G 95 | 96 | # 10. 
Build config file with Cloud Composer env vars 97 | CONFIG_FILE=deployment/config/config_${SUFFIX}.json 98 | python3 deployment/config/generate_config.py $CONFIG_FILE \ 99 | --project_id=$PROJECT_ID \ 100 | --zone=$ZONE \ 101 | --osm_url=$OSM_URL \ 102 | --osm_md5_url=$OSM_MD5_URL \ 103 | --gcs_transfer_bucket=$TRANSFER_BUCKET_NAME \ 104 | --gcs_work_bucket=$WORK_BUCKET_NAME \ 105 | --transfer_index_files_gcs_uri=gs://$WORK_BUCKET_NAME/gsc_transfer_index/ \ 106 | --osm_to_features_image=$OSM_TO_FEATURES_IMAGE \ 107 | --osm_to_nodes_ways_relations_image=$OSM_TO_NODES_WAYS_RELATIONS_IMAGE \ 108 | --generate_layers_image=$GENERATE_LAYERS_IMAGE \ 109 | --osm_converter_with_history_index_image=$OSM_CONVERTER_WITH_HISTORY_INDEX_IMAGE \ 110 | --gke_main_cluster_name=$GKE_CLUSTER_NAME \ 111 | --addt_sn_gke_pool=$ADDT_SN_POOL_NAME \ 112 | --addt_sn_pool_machine_type=$ADDT_SN_POOL_MACHINE_TYPE \ 113 | --addt_sn_pool_disk_size=$ADDT_SN_POOL_DISK_SIZE \ 114 | --addt_sn_pool_num_nodes=$ADDT_SN_POOL_NUM_NODES \ 115 | --addt_sn_pool_max_num_treads=$ADDT_SN_POOL_MAX_NUM_TREADS \ 116 | --addt_mn_gke_pool=$ADDT_MN_POOL_NAME \ 117 | --addt_mn_pool_machine_type=$ADDT_MN_POOL_MACHINE_TYPE \ 118 | --addt_mn_pool_disk_size=$ADDT_MN_POOL_DISK_SIZE \ 119 | --addt_mn_pool_num_nodes=$ADDT_MN_POOL_NUM_NODES \ 120 | --addt_mn_pod_requested_memory=$ADDT_MN_POD_REQUESTED_MEMORY \ 121 | --bq_dataset_to_export=$BQ_DATASET 122 | 123 | # 11. Deploy Cloud Composer env vars 124 | deployment/config/set_env_vars_from_config.sh $CONFIG_FILE $COMPOSER_ENV_NAME $REGION_LOCATION 125 | 126 | # 12. Create Cloud Function for triggering main DAG 127 | COMPOSER_CLIENT_ID=$(python3 utils/get_client_id.py $PROJECT_ID $REGION_LOCATION $COMPOSER_ENV_NAME) 128 | COMPOSER_WEBSERVER_ID=$(gcloud composer environments describe $COMPOSER_ENV_NAME \ 129 | --location $REGION_LOCATION --format json | \ 130 | jq -r '.config.airflowUri' | \ 131 | awk -F/ '{print $3}' | \ 132 | cut -d '.' -f1) 133 | DAG_NAME=osm_to_big_query_${MODE} 134 | 135 | TRIGGER_FUNCTION_NAME=trigger_osm_to_big_query_dg_gcf_${SUFFIX} 136 | gcloud functions deploy $TRIGGER_FUNCTION_NAME \ 137 | --source triggering/trigger_osm_to_big_query_dg_gcf \ 138 | --entry-point trigger_dag \ 139 | --runtime python37 \ 140 | --trigger-resource $TRANSFER_BUCKET_NAME \ 141 | --trigger-event google.storage.object.finalize \ 142 | --set-env-vars COMPOSER_CLIENT_ID=$COMPOSER_CLIENT_ID,COMPOSER_WEBSERVER_ID=$COMPOSER_WEBSERVER_ID,DAG_NAME=$DAG_NAME 143 | 144 | # 13.
Deploy DAG files and their dependencies 145 | if [ "$MODE" = "planet" ] 146 | then 147 | DAGS_PATH='dags/osm_to_big_query_planet.py dags/transfer_src_file.py dags/*/' 148 | else 149 | DAGS_PATH='dags/osm_to_big_query_history.py dags/transfer_src_file.py dags/*/' 150 | fi 151 | for DAG_ELEMENT in $DAGS_PATH; do 152 | deployment/upload_dags_files.sh $DAG_ELEMENT $COMPOSER_ENV_NAME $REGION_LOCATION 153 | done 154 | -------------------------------------------------------------------------------- /dags/transfer_src_file.py: -------------------------------------------------------------------------------- 1 | import airflow 2 | import os 3 | import logging 4 | import datetime 5 | import json 6 | import base64 7 | import binascii 8 | import time 9 | 10 | import googleapiclient.discovery 11 | 12 | from urllib import request 13 | from airflow.operators import python_operator 14 | from google.cloud import storage 15 | 16 | from utils import gcs_utils 17 | 18 | year_start = datetime.datetime(2020, 1, 1) 19 | 20 | OSM_TRANSFER_INDEX_FILE_NAME_BASE = "osm_transfer_index" 21 | OSM_TRANSFER_INDEX_FILE_NAME_EXT = ".tsv" 22 | OSM_TRANSFER_INDEX_FILE_NAME = OSM_TRANSFER_INDEX_FILE_NAME_BASE + OSM_TRANSFER_INDEX_FILE_NAME_EXT 23 | 24 | project_id = os.environ.get('PROJECT_ID') 25 | osm_url = os.environ.get('OSM_URL') 26 | osm_md5_url = os.environ.get('OSM_MD5_URL') 27 | transfer_index_files_dir_gcs_uri = os.environ.get('TRANSFER_INDEX_FILES_GCS_URI') 28 | gcs_transfer_bucket = os.environ.get('GCS_TRANSFER_BUCKET') 29 | 30 | default_args = { 31 | 'retries': 1, 32 | 'retry_delay': datetime.timedelta(minutes=1), 33 | } 34 | 35 | with airflow.DAG( 36 | 'transferring_src_osm_file', 37 | catchup=False, 38 | default_args=default_args, 39 | start_date=year_start, 40 | schedule_interval="@weekly") as dag: 41 | 42 | def transfer_to_gcs(): 43 | md5_file_lines = read_file_lines_from_url(osm_md5_url) 44 | logging.info(md5_file_lines) 45 | 46 | md5_hex = get_md5_hash_from_md5_file_lines(md5_file_lines) 47 | logging.info(md5_hex) 48 | 49 | base64_md5_file_hash = md5_hex_to_base64(md5_hex) 50 | 51 | content_length = get_content_length_from_url(osm_url) 52 | logging.info(content_length) 53 | 54 | osm_transfer_index_file_name = create_transfer_index_tsv(OSM_TRANSFER_INDEX_FILE_NAME, 55 | osm_url, 56 | content_length, 57 | base64_md5_file_hash) 58 | index_gcs_bucket, index_gcs_dir = gcs_utils.parse_uri_to_bucket_and_filename(transfer_index_files_dir_gcs_uri) 59 | list_url = upload_file_to_gcs_as_public(osm_transfer_index_file_name, 60 | index_gcs_bucket, 61 | index_gcs_dir) 62 | 63 | job_dict = create_transfer_job_dict(project_id, list_url, gcs_transfer_bucket) 64 | execute_transfer_job(job_dict) 65 | 66 | 67 | def read_file_lines_from_url(url): 68 | logging.info(url) 69 | 70 | request.urlcleanup() 71 | data = request.urlopen(url) 72 | return [byte_str_to_str(line) for line in data] 73 | 74 | 75 | def byte_str_to_str(byte_str): 76 | return byte_str.decode("utf-8") 77 | 78 | 79 | def get_md5_hash_from_md5_file_lines(lines): 80 | first_line = lines[0] 81 | return first_line.split()[0] 82 | 83 | 84 | def get_content_length_from_url(url): 85 | data = request.urlopen(url) 86 | meta = data.info() 87 | return meta.get(name="Content-Length") 88 | 89 | 90 | def md5_hex_to_base64(md5_hex): 91 | return byte_str_to_str(to_base64(from_hex_to_binary(md5_hex))) 92 | 93 | 94 | def from_hex_to_binary(hex): 95 | return binascii.unhexlify(hex) 96 | 97 | 98 | def to_base64(byte_str): 99 | return base64.b64encode(byte_str) 100 | 101 | 102 | def
create_transfer_index_tsv(osm_transfer_index_file_name, url, content_length, md5_hash): 103 | header_line = "TsvHttpData-1.0" 104 | lines = [header_line, "\t".join([url, content_length, md5_hash])] 105 | lines = [line + "\n" for line in lines] 106 | with open(osm_transfer_index_file_name, "w") as osm_transfer_index_file: 107 | osm_transfer_index_file.writelines(lines) 108 | return osm_transfer_index_file_name 109 | 110 | 111 | def upload_file_to_gcs_as_public(osm_transfer_index_file_name, gcs_data_bucket, osm_transfer_index_gcs_dir): 112 | client = storage.Client() 113 | 114 | osm_transfer_index_gcs_name = osm_transfer_index_gcs_dir \ 115 | + add_timestamped_suffix(OSM_TRANSFER_INDEX_FILE_NAME_BASE) \ 116 | + OSM_TRANSFER_INDEX_FILE_NAME_EXT 117 | bucket = client.get_bucket(gcs_data_bucket) 118 | dest_blob = bucket.blob(osm_transfer_index_gcs_name) 119 | dest_blob.upload_from_filename(osm_transfer_index_file_name) 120 | 121 | dest_blob.make_public() 122 | 123 | return dest_blob.public_url 124 | 125 | 126 | def add_timestamped_suffix(name): 127 | return name + "_" + str(time.time()).split(".")[0] 128 | 129 | 130 | def bucket_name_and_file_name_from_gcs_uri(gcs_uri): 131 | gcs_uri_without_gs_part = gcs_uri.split("//")[-1] 132 | uri_parts = gcs_uri_without_gs_part.split("/") 133 | 134 | return uri_parts[0], "/".join(uri_parts[1:]) 135 | 136 | 137 | def create_transfer_job_dict(project_id, list_url, transfer_bucket): 138 | now_datetime = datetime.datetime.now() 139 | transfer_datetime = now_datetime + datetime.timedelta(minutes=3) 140 | 141 | job_description = "transfer--{}".format(transfer_datetime.strftime("%Y-%m-%d--%H-%M-%S")) 142 | job_name = "transferJobs/{}".format(job_description) 143 | overwrite_objects_already_existing_in_sink = True 144 | 145 | transfer_date = { 146 | "day": transfer_datetime.day, 147 | "month": transfer_datetime.month, 148 | "year": transfer_datetime.year 149 | } 150 | transfer_time = { 151 | "hours": transfer_datetime.hour, 152 | "minutes": transfer_datetime.minute, 153 | "seconds": transfer_datetime.second 154 | } 155 | status = "ENABLED" 156 | transfer_job = { 157 | "name": job_name, 158 | "description": job_description, 159 | "transferSpec": { 160 | "httpDataSource": { 161 | "listUrl": list_url 162 | }, 163 | "gcsDataSink": { 164 | "bucketName": transfer_bucket 165 | }, 166 | "transferOptions": { 167 | "overwriteObjectsAlreadyExistingInSink": 168 | overwrite_objects_already_existing_in_sink 169 | } 170 | }, 171 | "projectId": project_id, 172 | "schedule": { 173 | "scheduleEndDate": transfer_date, 174 | "scheduleStartDate": transfer_date, 175 | "startTimeOfDay": transfer_time 176 | }, 177 | "status": status 178 | } 179 | return transfer_job 180 | 181 | 182 | def execute_transfer_job(job_dict): 183 | storage_transfer = googleapiclient.discovery.build('storagetransfer', 'v1') 184 | logging.info('Requesting transferJob: {}'.format( 185 | job_dict)) 186 | result = storage_transfer.transferJobs().create(body=job_dict).execute() 187 | logging.info('Returned transferJob: {}'.format( 188 | json.dumps(result, indent=4))) 189 | 190 | 191 | transferring_to_gcs = python_operator.PythonOperator( 192 | task_id='transferring_to_gcs', 193 | python_callable=transfer_to_gcs) 194 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/osm_index.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import logging 4 | import time 
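# Editor's note on the design below: each OSM element version is stored in SQLite
# keyed by (id, version) together with its epoch timestamp, so a lookup such as
# "the latest version of element X strictly before timestamp T" becomes a single
# indexed ORDER BY ... DESC query. Shard databases built in parallel are merged
# into one file via SQLite's ATTACH (see merge_dbs at the bottom of this file).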
5 | 6 | 7 | class OsmIndex(object): 8 | def __init__(self): 9 | pass 10 | 11 | def create(self): 12 | pass 13 | 14 | def save(self): 15 | pass 16 | 17 | def close(self): 18 | pass 19 | 20 | 21 | class SQLiteOsmIndex(OsmIndex): 22 | 23 | def __init__(self, db_file_path): 24 | super().__init__() 25 | self.db_file_path = db_file_path 26 | self.sqlite3_connection = sqlite3.connect(db_file_path) 27 | self.osm_index_db_cursor = self.sqlite3_connection.cursor() 28 | self.query_time = 0 29 | self.query_counter = 0 30 | 31 | self.tables_and_fields = {"nodes": { 32 | "id": "INT", 33 | "version": "INT", 34 | "osm_timestamp": "INT", 35 | "longitude": "REAL", 36 | "latitude": "REAL", 37 | }, "ways": { 38 | "id": "INT", 39 | "version": "INT", 40 | "osm_timestamp": "INT", 41 | "nodes": "TEXT" 42 | }, "relations": { 43 | "id": "INT", 44 | "version": "INT", 45 | "osm_timestamp": "INT", 46 | "members": "TEXT" 47 | }} 48 | 49 | self.nodes_fields_list = list(self.tables_and_fields["nodes"].keys()) 50 | self.ways_fields_list = list(self.tables_and_fields["ways"].keys()) 51 | self.relations_fields_list = list(self.tables_and_fields["relations"].keys()) 52 | 53 | self.nodes_fields_str = ",".join(self.nodes_fields_list) 54 | self.ways_fields_str = ",".join(self.ways_fields_list) 55 | self.relations_fields_str = ",".join(self.relations_fields_list) 56 | 57 | def get_db_file_path(self): 58 | return self.db_file_path 59 | 60 | def get_query_time(self): 61 | return self.query_time 62 | 63 | def reset_query_time(self): 64 | self.query_time = 0 65 | 66 | def get_query_counter(self): 67 | return self.query_counter 68 | 69 | def reset_query_counter(self): 70 | self.query_counter = 0 71 | 72 | def create(self): 73 | self.init_all_tables() 74 | 75 | def save(self): 76 | self.sqlite3_connection.commit() 77 | 78 | def close(self): 79 | self.save() 80 | self.sqlite3_connection.close() 81 | 82 | def init_all_tables(self): 83 | for table, fields_dicts in self.tables_and_fields.items(): 84 | fields = ["{} {}".format(field_name, field_type) for field_name, field_type in fields_dicts.items()] 85 | self.osm_index_db_cursor.execute('CREATE TABLE {} ({})'.format(table, ",".join(fields))) 86 | self.osm_index_db_cursor.execute('CREATE INDEX idx_{}_id_version ON {} (id, version)'.format(table, table)) 87 | self.save() 88 | 89 | def execute_query(self, query, values=None): 90 | query_start_timestamp = time.time() 91 | if values: 92 | self.osm_index_db_cursor.execute(query, values) 93 | else: 94 | self.osm_index_db_cursor.execute(query) 95 | self.query_time += (time.time() - query_start_timestamp) 96 | self.query_counter += 1 97 | 98 | def add_values_to_sqlite_table(self, table_name, values): 99 | placeholders = ",".join(["?"] * len(values)) 100 | query = "INSERT INTO {} VALUES ({})".format(table_name, placeholders) 101 | self.execute_query(query, values) 102 | 103 | def get_id_version_timestamp_all_tags_from_osm_obj(self, osm_obj): 104 | return str(osm_obj["id"]), str(osm_obj["version"]), str(osm_obj["osm_timestamp"]) 105 | 106 | def add_node_to_index(self, node_dict): 107 | osm_id, ver, timestamp = self.get_id_version_timestamp_all_tags_from_osm_obj(node_dict) 108 | lon = node_dict["longitude"] if "longitude" in node_dict else None 109 | lat = node_dict["latitude"] if "latitude" in node_dict else None 110 | self.add_values_to_sqlite_table("nodes", [osm_id, ver, timestamp, lon, lat]) 111 | 112 | def add_way_to_index(self, way_dict): 113 | osm_id, ver, timestamp = self.get_id_version_timestamp_all_tags_from_osm_obj(way_dict) 114 | node_ids
= json.dumps(way_dict["nodes"]) 115 | self.add_values_to_sqlite_table("ways", [osm_id, ver, timestamp, node_ids]) 116 | 117 | def add_relation_to_index(self, relation_dict): 118 | osm_id, ver, timestamp = self.get_id_version_timestamp_all_tags_from_osm_obj(relation_dict) 119 | members = json.dumps(relation_dict["members"]) 120 | self.add_values_to_sqlite_table("relations", [osm_id, ver, timestamp, members]) 121 | 122 | def get_row_from_index_by_timestamp(self, table_name, id, timestamp, fields_str=None): 123 | query = "SELECT {} FROM {} WHERE id={} AND osm_timestamp<{} ORDER BY osm_timestamp DESC" \ 124 | .format(fields_str if fields_str else "*", table_name, id, timestamp) 125 | self.execute_query(query) 126 | return self.osm_index_db_cursor.fetchone() 127 | 128 | def get_node_from_index_by_timestamp(self, node_id, timestamp): 129 | node_data = self.get_row_from_index_by_timestamp("nodes", node_id, timestamp) 130 | if not node_data: 131 | return 132 | 133 | node_dict = {field: node_data[index] for index, field in enumerate(self.nodes_fields_list)} 134 | return node_dict 135 | 136 | def get_way_from_index_by_timestamp(self, way_id, timestamp): 137 | way_data = self.get_row_from_index_by_timestamp("ways", way_id, timestamp) 138 | if not way_data: 139 | return 140 | 141 | way_dict = {} 142 | for index, field in enumerate(self.ways_fields_list): 143 | if field == "nodes": 144 | way_dict["nodes"] = json.loads(way_data[index]) 145 | else: 146 | way_dict[field] = way_data[index] 147 | return way_dict 148 | 149 | def get_relation_from_index_by_timestamp(self, relation_id, timestamp): 150 | relation_data = self.get_row_from_index_by_timestamp("relations", relation_id, timestamp) 151 | if not relation_data: 152 | return 153 | 154 | relation_dict = {} 155 | for index, field in enumerate(self.relations_fields_list): 156 | if field == "members": 157 | relation_dict["members"] = json.loads(relation_data[index]) 158 | else: 159 | relation_dict[field] = relation_data[index] 160 | return relation_dict 161 | 162 | def merge_identical_db(self, db_file_to_merge): 163 | tables = list(self.tables_and_fields.keys()) 164 | db_to_merge_temp_name = "dbToMerge" 165 | self.execute_query("ATTACH '{}' as {}".format(db_file_to_merge, db_to_merge_temp_name)) 166 | for table in tables: 167 | self.execute_query("INSERT into {} SELECT * FROM {}.{}" 168 | .format(table, db_to_merge_temp_name, table)) 169 | self.sqlite3_connection.commit() 170 | self.execute_query("DETACH {}".format(db_to_merge_temp_name)) 171 | 172 | 173 | def merge_dbs(new_db_file, db_paths, db_exists): 174 | new_db = SQLiteOsmIndex(new_db_file) 175 | if not db_exists: 176 | new_db.create() 177 | for db_path in db_paths: 178 | logging.info("Merging {} into {}".format(db_path, new_db_file)) 179 | new_db.merge_identical_db(db_path) 180 | new_db.close() 181 | return new_db_file 182 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_nodes_ways_relations/src/pbf_parser.py: -------------------------------------------------------------------------------- 1 | import osmium 2 | import logging 3 | import json 4 | import argparse 5 | import os 6 | import errno 7 | import time 8 | import threading 9 | import multiprocessing 10 | 11 | 12 | from datetime import datetime 13 | from google.cloud import storage 14 | 15 | 16 | def osm_entity_to_dict(osm_entity): 17 | all_tags = [{"key": tag.k, "value": tag.v} for tag in osm_entity.tags] 18 | return {"id": osm_entity.id, "all_tags": all_tags}
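# Editor's note on the helpers below: each function maps a pyosmium entity to a
# plain dict whose keys mirror the BigQuery table schemas for nodes, ways and
# relations, so the handler can emit every element as one JSON line (JSONL)
# that a BigQuery load job can ingest directly.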
19 | 20 | 21 | def osm_entity_to_dict_full(osm_entity): 22 | base_dict = osm_entity_to_dict(osm_entity) 23 | base_dict.update({ 24 | "version": osm_entity.version, 25 | "username": osm_entity.user, 26 | "changeset": osm_entity.changeset, 27 | "visible": osm_entity.visible, 28 | "osm_timestamp": int(datetime.timestamp(osm_entity.timestamp)), 29 | }) 30 | return base_dict 31 | 32 | 33 | def osm_entity_node_dict(osm_node_entity): 34 | base_dict = osm_entity_to_dict_full(osm_node_entity) 35 | if osm_node_entity.location.valid(): 36 | base_dict["latitude"] = osm_node_entity.location.lat 37 | base_dict["longitude"] = osm_node_entity.location.lon 38 | else: 39 | base_dict["latitude"] = None 40 | base_dict["longitude"] = None 41 | return base_dict 42 | 43 | 44 | def osm_entity_way_dict(osm_way_entity): 45 | base_dict = osm_entity_to_dict_full(osm_way_entity) 46 | base_dict["nodes"] = [{"id": node.ref} for node in osm_way_entity.nodes] 47 | return base_dict 48 | 49 | 50 | def osm_entity_relation_dict(osm_relation_entity): 51 | base_dict = osm_entity_to_dict_full(osm_relation_entity) 52 | base_dict["members"] = [{"type": member.type, "id": member.ref, "role": member.role} 53 | for member in iter(osm_relation_entity.members)] 54 | return base_dict 55 | 56 | 57 | class CustomHandler(osmium.SimpleHandler): 58 | 59 | def __init__(self, files_dict, pool_size, pool_index): 60 | osmium.SimpleHandler.__init__(self) 61 | self.entities_out_files_dict = files_dict 62 | self.processing_counter = 0 63 | 64 | self.last_log_time = time.time() 65 | self.pool_size = pool_size 66 | self.pool_index = pool_index 67 | self._lock = threading.Lock() 68 | self.geo_json_factory = osmium.geom.GeoJSONFactory() 69 | 70 | def log_processing(self, entity_type): 71 | if self.processing_counter % 1000000 == 0: 72 | logging.info(entity_type + " (pool_index {}) ".format(str(self.pool_index)) + str(self.processing_counter) 73 | + " " + str(time.time() - self.last_log_time)) 74 | self.last_log_time = time.time() 75 | 76 | def node(self, node): 77 | self.processing_counter = self.processing_counter + 1 78 | 79 | self.log_processing("nodes") 80 | if self.processing_counter % self.pool_size == self.pool_index: 81 | node_dict = osm_entity_node_dict(node) 82 | self.write_to_dict("nodes", node_dict) 83 | 84 | def way(self, way): 85 | self.processing_counter = self.processing_counter + 1 86 | 87 | self.log_processing("ways") 88 | if self.processing_counter % self.pool_size == self.pool_index: 89 | way_dict = osm_entity_way_dict(way) 90 | self.write_to_dict("ways", way_dict) 91 | 92 | def relation(self, relation): 93 | self.processing_counter = self.processing_counter + 1 94 | 95 | self.log_processing("relations") 96 | if self.processing_counter % self.pool_size == self.pool_index: 97 | relation_dict = osm_entity_relation_dict(relation) 98 | self.write_to_dict("relations", relation_dict) 99 | 100 | def process_as_base_osm_entity(self, osm_entity, entity_type): 101 | self.processing_counter = self.processing_counter + 1 102 | 103 | self.log_processing(entity_type) 104 | if self.processing_counter % self.pool_size == self.pool_index: 105 | node_dict = osm_entity_to_dict_full(osm_entity) 106 | self.write_to_dict(entity_type, node_dict) 107 | 108 | def write_to_dict(self, entity_type, entity_dict): 109 | with self._lock: 110 | self.entities_out_files_dict[entity_type].write(json.dumps(entity_dict) + "\n") 111 | 112 | 113 | def make_dir_for_file_if_not_exists(filename): 114 | if not os.path.exists(os.path.dirname(filename)): 115 | try: 116 |
os.makedirs(os.path.dirname(filename)) 117 | except OSError as exc: # Guard against race condition 118 | if exc.errno != errno.EEXIST: 119 | raise 120 | 121 | 122 | def from_gcs_to_local_file(src_gcs_bucket, src_gcs_name, local_file_path): 123 | storage_client = storage.Client(os.environ['PROJECT_ID']) 124 | # Create a bucket object for our bucket 125 | bucket = storage_client.get_bucket(src_gcs_bucket) 126 | # Create a blob object from the filepath 127 | blob = bucket.blob(src_gcs_name) 128 | # Download the file to a destination 129 | logging.info("Downloading gs://{}/{} to {}...".format(src_gcs_bucket, src_gcs_name, local_file_path)) 130 | blob.download_to_filename(local_file_path) 131 | logging.info("Successfully downloaded gs://{}/{} to {}".format(src_gcs_bucket, src_gcs_name, local_file_path)) 132 | 133 | 134 | def upload_file_to_gcs(filename, destination_bucket_name, destination_blob_name): 135 | """ 136 | Uploads a file to a given Cloud Storage bucket and returns the public url 137 | to the new object. 138 | """ 139 | bucket = storage.Client().bucket(destination_bucket_name) 140 | blob = bucket.blob(destination_blob_name) 141 | logging.info("Uploading of {} to gs://{}/{}...".format(filename, destination_bucket_name, destination_blob_name)) 142 | blob.upload_from_filename( 143 | filename, 144 | content_type="text/plain") 145 | logging.info( 146 | "Finished uploading of {} to gs://{}/{}".format(filename, destination_bucket_name, destination_blob_name)) 147 | 148 | 149 | def parse_uri_to_bucket_and_filename(file_path): 150 | """Divides file uri to bucket name and file name""" 151 | path_parts = file_path.split("//") 152 | if len(path_parts) >= 2: 153 | main_part = path_parts[1] 154 | if "/" in main_part: 155 | divide_index = main_part.index("/") 156 | bucket_name = main_part[:divide_index] 157 | file_name = main_part[divide_index + 1 - len(main_part):] 158 | 159 | return bucket_name, file_name 160 | return "", "" 161 | 162 | 163 | def process_pbf(pool_index): 164 | simple_handler = CustomHandler(entities_out_files_dict, pool_size, pool_index) 165 | simple_handler.apply_file(dest_local_path) 166 | 167 | 168 | def run_pbf_processing_in_parallel(pool_size): 169 | pool = multiprocessing.Pool(pool_size) 170 | for pool_index in range(pool_size): 171 | pool.apply_async(process_pbf, [pool_index]) 172 | pool.close() 173 | pool.join() 174 | 175 | 176 | if __name__ == "__main__": 177 | logging.getLogger().setLevel(logging.INFO) 178 | 179 | parser = argparse.ArgumentParser() 180 | parser.add_argument("src_pbf_file_uri", help="The source PBF file to be converted") 181 | parser.add_argument("dest_gcs_dir", help="URI of GCS dir to save result files") 182 | parser.add_argument("--num_threads", help="Number of parallel threads for processing", default="3") 183 | 184 | args = parser.parse_args() 185 | 186 | src_bucket, src_name = parse_uri_to_bucket_and_filename(args.src_pbf_file_uri) 187 | 188 | data_dir = os.environ['DATA_DIR'] 189 | dest_local_path = data_dir + "planet.osm.pbf" 190 | make_dir_for_file_if_not_exists(dest_local_path) 191 | from_gcs_to_local_file(src_bucket, src_name, dest_local_path) 192 | 193 | entities = ["nodes", "ways", "relations"] 194 | 195 | entities_out_files_dict = {} 196 | results_local_paths = [] 197 | for entity in entities: 198 | path = data_dir + "{}.jsonl".format(entity) 199 | results_local_paths.append(path) 200 | 201 | make_dir_for_file_if_not_exists(path) 202 | entities_out_files_dict[entity] = open(path, "w") 203 | 204 | logging.info("Creating {} 
files".format(str(results_local_paths))) 205 | 206 | pool_size = int(args.num_threads) 207 | run_pbf_processing_in_parallel(pool_size) 208 | 209 | for entity, out_file in entities_out_files_dict.items(): 210 | out_file.close() 211 | 212 | dest_bucket, dest_dir_name = parse_uri_to_bucket_and_filename(args.dest_gcs_dir) 213 | for path in results_local_paths: 214 | dest_file_gcs_name = dest_dir_name + path.split("/")[-1] 215 | upload_file_to_gcs(path, dest_bucket, dest_file_gcs_name) 216 | -------------------------------------------------------------------------------- /examples/clustering/colors/vectorize.py: -------------------------------------------------------------------------------- 1 | import re 2 | import textract 3 | from pathlib import Path 4 | import numpy as np 5 | import json 6 | 7 | import nltk 8 | from nltk.corpus import stopwords 9 | from nltk.stem import WordNetLemmatizer 10 | 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | 13 | nltk.download('punkt') 14 | 15 | nltk.download('stopwords') 16 | stop_words = set(stopwords.words('english')).union({'etc', 'note', 'also', 'occur'}) 17 | 18 | nltk.download('wordnet') 19 | lemmatizer = WordNetLemmatizer() 20 | 21 | embeddings_dict = {} 22 | 23 | with open("../data/glove.6B.300d.txt", 'r', encoding="utf-8") as f: 24 | for line in f: 25 | values = line.split() 26 | word = values[0] 27 | vector = np.asarray(values[1:], "float32") 28 | embeddings_dict[word] = vector 29 | 30 | rows = [ 31 | {'dimension': 'Activity', 'code': 1000, 'color': 'FF00FF', 'name': 'Residential activities'}, 32 | {'dimension': 'Activity', 'code': 2000, 'color': 'FF0000', 'name': 'Shopping, business, or trade activities'}, 33 | {'dimension': 'Activity', 'code': 3000, 'color': 'A0F020', 34 | 'name': 'Industrial, manufacturing, and waste- related activities'}, 35 | {'dimension': 'Activity', 'code': 4000, 'color': '00FF00', 36 | 'name': 'Social, institutional, or infrastructure- related activities'}, 37 | {'dimension': 'Activity', 'code': 5000, 'color': 'BEBEBE', 'name': 'Travel or movement activities'}, 38 | {'dimension': 'Activity', 'code': 6000, 'color': '2F4F4F', 'name': 'Mass assembly of people'}, 39 | {'dimension': 'Activity', 'code': 7000, 'color': '9090EE', 'name': 'Leisure activities'}, 40 | {'dimension': 'Activity', 'code': 8000, 'color': '22228B', 'name': 'Natural resources-related activities'}, 41 | {'dimension': 'Activity', 'code': 9000, 'color': 'FFFFFF', 'name': 'No human activity or unclassifiable activity'}, 42 | 43 | {'dimension': 'Function', 'code': 1000, 'color': 'FF00FF', 'name': 'Residence or accommodation functions'}, 44 | {'dimension': 'Function', 'code': 2000, 'color': 'FF0000', 'name': 'General sales or services'}, 45 | {'dimension': 'Function', 'code': 3000, 'color': 'A0F020', 'name': 'Manufacturing and wholesale trade'}, 46 | {'dimension': 'Function', 'code': 4000, 'color': 'BEBEBE', 47 | 'name': 'Transportation, communication, information, and utilities'}, 48 | {'dimension': 'Function', 'code': 5000, 'color': '9090EE', 'name': 'Arts, entertainment, and recreation'}, 49 | {'dimension': 'Function', 'code': 6000, 'color': '00FF00', 50 | 'name': 'Education, public admin., health care, andother inst.'}, 51 | {'dimension': 'Function', 'code': 7000, 'color': '008B8B', 'name': 'Construction-related businesses'}, 52 | {'dimension': 'Function', 'code': 8000, 'color': '558B00', 'name': 'Mining and extraction establishments'}, 53 | {'dimension': 'Function', 'code': 9000, 'color': '22228B', 'name': 'Agriculture, forestry, fishing 
and hunting'}, 54 | 55 | {'dimension': 'Ownership', 'code': 1000, 'color': 'F5DCF5', 'name': 'No constraints--private ownership'}, 56 | {'dimension': 'Ownership', 'code': 2000, 'color': '00FF00', 57 | 'name': 'Some constraints--easements or other use restrictions'}, 58 | {'dimension': 'Ownership', 'code': 3000, 'color': '008B00', 59 | 'name': 'Limited restrictions--leased and other tenancy restrictions'}, 60 | {'dimension': 'Ownership', 'code': 4000, 'color': '9090EE', 61 | 'name': 'Public restrictions--local, state, and federal ownership'}, 62 | {'dimension': 'Ownership', 'code': 5000, 'color': '000064', 63 | 'name': 'Other public use restrictions--regional, special districts, etc'}, 64 | {'dimension': 'Ownership', 'code': 6000, 'color': '6B238E', 'name': 'Nonprofit ownership restrictions'}, 65 | {'dimension': 'Ownership', 'code': 7000, 'color': 'BEBEBE', 'name': 'Joint ownership character--public entities'}, 66 | {'dimension': 'Ownership', 'code': 8000, 'color': '000000', 67 | 'name': 'Joint ownership character--public, private, nonprofit, etc.'}, 68 | {'dimension': 'Ownership', 'code': 9000, 'color': 'FFFFFF', 'name': 'Not applicable to this dimension'}, 69 | 70 | {'dimension': 'Site', 'code': 1000, 'color': '9090EE', 'name': 'Site in natural state'}, 71 | {'dimension': 'Site', 'code': 2000, 'color': 'F5DCF5', 'name': 'Developing site'}, 72 | {'dimension': 'Site', 'code': 3000, 'color': 'CD9EB7', 'name': 'Developed site -- crops, grazing, forestry, etc.'}, 73 | {'dimension': 'Site', 'code': 4000, 'color': '8B667E', 'name': 'Developed site -- no buildings and no structures'}, 74 | {'dimension': 'Site', 'code': 5000, 'color': '8B2B00', 'name': 'Developed site -- nonbuilding structures'}, 75 | {'dimension': 'Site', 'code': 6000, 'color': '8B2323', 'name': 'Developed site -- with buildings'}, 76 | {'dimension': 'Site', 'code': 7000, 'color': '22228B', 'name': 'Developed site -- with parks'}, 77 | {'dimension': 'Site', 'code': 8000, 'color': 'D3D3D3', 'name': 'Not applicable to this dimension'}, 78 | {'dimension': 'Site', 'code': 9000, 'color': 'FFFFFF', 'name': 'Unclassifiable site development character'}, 79 | 80 | {'dimension': 'Structure', 'code': 1000, 'color': 'FF00FF', 'name': 'Residential buildings'}, 81 | {'dimension': 'Structure', 'code': 2000, 'color': 'FF0000', 'name': 'Commercial buildings and other specialized structures'}, 82 | {'dimension': 'Structure', 'code': 3000, 'color': 'A0F020', 'name': 'Public assembly structures'}, 83 | {'dimension': 'Structure', 'code': 4000, 'color': '00FF00', 'name': 'Institutional or community facilities'}, 84 | {'dimension': 'Structure', 'code': 5000, 'color': 'BEBEBE', 'name': 'Transportation-related facilities'}, 85 | {'dimension': 'Structure', 'code': 6000, 'color': '858585', 'name': 'Utility and other nonbuilding structures'}, 86 | {'dimension': 'Structure', 'code': 7000, 'color': 'FFCBC0', 'name': 'Specialized military structures'}, 87 | {'dimension': 'Structure', 'code': 8000, 'color': '22228B', 'name': 'Sheds, farm buildings, or agricultural facilities'}, 88 | {'dimension': 'Structure', 'code': 9000, 'color': 'FFFFFF', 'name': 'No structure'} 89 | ] 90 | 91 | 92 | def tokenize(text): 93 | tokens = nltk.word_tokenize(text.lower().replace('-', ' ')) 94 | filtered = [t for t in tokens if t not in stop_words and t.isalpha()] 95 | 96 | return [lemmatizer.lemmatize(t) for t in filtered] 97 | 98 | 99 | def mean_vector(tokens): 100 | count = 0 101 | sum_vector = np.zeros(300) 102 | for token in tokens: 103 | if token not in embeddings_dict: 104 | 
continue 105 | sum_vector += embeddings_dict[token] 106 | count += 1 107 | 108 | return sum_vector / count if count else sum_vector  # guard: no token had an embedding 109 | 110 | 111 | def vectorize(): 112 | try: 113 | text = Path('../data/LBCS.txt').read_text() 114 | except FileNotFoundError: 115 | text = textract.process('../data/LBCS.pdf') 116 | text = text.decode('utf-8') 117 | Path('../data/LBCS.txt').write_text(text) 118 | 119 | # split the text into parts, one per dimension's details 120 | dimensions_details = re.split(r'\w Dimension with Detail', text)[1:] 121 | # remove the trailing text from the last part 122 | dimensions_details[-1] = re.split('LBCS Top Level Codes for all Dimensions', dimensions_details[-1])[0] 123 | 124 | corpus = [] 125 | vectors = [] 126 | for dimension in dimensions_details: 127 | # split by class codes 128 | codes = re.split('1000|2000|3000|4000|5000|6000|7000|8000|9000|9999', dimension)[1:10] 129 | 130 | for code in codes: 131 | tokenized = tokenize(code) 132 | corpus.append(' '.join(tokenized)) 133 | # vec = mean_vector(tokenized) 134 | # vectors.append(vec) 135 | 136 | vectorizer = TfidfVectorizer() 137 | X = vectorizer.fit_transform(corpus) 138 | features = vectorizer.get_feature_names()  # renamed to get_feature_names_out() in newer scikit-learn 139 | 140 | top_k = 10 141 | for tf_idf_vector in X: 142 | tf_idf_arr = tf_idf_vector[0].toarray().reshape(-1) 143 | 144 | # for top-k implementation 145 | # max_indexes = np.argpartition(tf_idf_arr, -top_k)[-top_k:] 146 | 147 | # just plain tf-idf 148 | weights = tf_idf_arr / sum(tf_idf_arr) 149 | # softmax 150 | # weights = np.exp(tf_idf_arr) / np.sum(np.exp(tf_idf_arr)) 151 | # like softmax but with tanh 152 | # weights = np.tanh(tf_idf_arr) / np.sum(np.tanh(tf_idf_arr)) 153 | 154 | sum_vector = np.zeros(300) 155 | for i, feature in enumerate(features): 156 | if feature not in embeddings_dict: 157 | continue 158 | sum_vector += weights[i] * embeddings_dict[feature] 159 | vectors.append(sum_vector) 160 | 161 | assert len(rows) == len(vectors) 162 | for (row, vec) in zip(rows, vectors): 163 | for i in range(len(vec)): 164 | row['f{}'.format(i+1)] = vec[i] 165 | print(json.dumps(row)) 166 | 167 | 168 | if __name__ == '__main__': 169 | vectorize() 170 | -------------------------------------------------------------------------------- /dags/utils/metadata_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import time 4 | 5 | from utils import gcs_utils 6 | 7 | OSM_ENTITIES = ["nodes", "ways", "relations"] 8 | 9 | 10 | def file_name_without_ext(file_name): 11 | if "." 
in file_name: 12 | return file_name.split(".")[0] 13 | else: 14 | return file_name 15 | 16 | 17 | def file_name_from_path(file_path): 18 | if "/" in file_path: 19 | return file_path.split("/")[-1] 20 | else: 21 | return file_path 22 | 23 | 24 | def get_index_metadata_file_path(src_osm_name, num_db_shards): 25 | return file_name_without_ext(src_osm_name) + "_{}_index_shards.metadata.txt".format(num_db_shards) 26 | 27 | 28 | def get_result_shard_metadata_file_path(src_osm_name, entity_type, index, num_results_shards): 29 | return file_name_without_ext(src_osm_name) + "_{}_{}_{}.metadata.txt".format(entity_type, index + 1, 30 | num_results_shards) 31 | 32 | 33 | def download_and_read_metadata_file(gcs_bucket, gcs_dir_name, src_osm_name, num_db_shards, num_results_shards): 34 | src_osm_file_name = file_name_from_path(src_osm_name) 35 | 36 | index_metadata_file_path = get_index_metadata_file_path(src_osm_file_name, num_db_shards) 37 | index_metadata_blob_name = gcs_dir_name + index_metadata_file_path 38 | if gcs_utils.is_gcs_blob_exists(gcs_bucket, index_metadata_blob_name): 39 | gcs_utils.from_gcs_to_local_file(gcs_bucket, index_metadata_blob_name, index_metadata_file_path) 40 | 41 | shards_metadata_files = {} 42 | for entity in OSM_ENTITIES: 43 | shards_metadata_files_by_entity = {} 44 | for index in range(num_results_shards): 45 | result_shard_metadata_file_path = get_result_shard_metadata_file_path(src_osm_file_name, entity, index, 46 | num_results_shards) 47 | result_shard_metadata_blob_name = gcs_dir_name + result_shard_metadata_file_path 48 | if gcs_utils.is_gcs_blob_exists(gcs_bucket, result_shard_metadata_blob_name): 49 | gcs_utils.from_gcs_to_local_file(gcs_bucket, result_shard_metadata_blob_name, 50 | result_shard_metadata_file_path) 51 | shards_metadata_files_by_entity[str(index)] = result_shard_metadata_file_path 52 | shards_metadata_files[entity] = shards_metadata_files_by_entity 53 | return ProcessingMetadata(index_metadata_file_path, shards_metadata_files) 54 | 55 | 56 | def save_and_upload_metadata_to_gcs(metadata, 57 | dest_bucket, 58 | dest_dir_name, 59 | save_only_shard_by_entity_and_index=None, 60 | only_db_metadata=False): 61 | files_to_save = metadata.save_to_json_files(save_only_shard_by_entity_and_index, only_db_metadata) 62 | 63 | for file_to_save in files_to_save: 64 | timestamps_file_name = file_name_from_path(file_to_save) 65 | timestamps_file_blob_name = dest_dir_name + timestamps_file_name 66 | gcs_utils.upload_file_to_gcs(file_to_save, dest_bucket, timestamps_file_blob_name) 67 | 68 | 69 | class ProcessingMetadata(object): 70 | 71 | def __init__(self, index_metadata_file_path, shards_metadata_files): 72 | self.index_metadata_file_path = index_metadata_file_path 73 | self.shards_metadata_files = shards_metadata_files 74 | try: 75 | with open(index_metadata_file_path, "r") as f: 76 | metadata_json = json.load(f) 77 | self.elements_counter = MetadataCounter(metadata_json["elements_counter"]) 78 | self.index_db_timestamps = FileTimestamps(metadata_json["index_db"]) 79 | except Exception as e: 80 | logging.info(str(e)) 81 | self.elements_counter = MetadataCounter() 82 | self.index_db_timestamps = FileTimestamps() 83 | 84 | self.shards_timestamps = {} 85 | for entity, shards_metadata_files_by_entity in shards_metadata_files.items(): 86 | shards_timestamps_by_entity = {} 87 | for index_str, shards_metadata_file in shards_metadata_files_by_entity.items(): 88 | try: 89 | with open(shards_metadata_file, "r") as f: 90 | metadata_json = json.load(f) 91 | 
shards_timestamps_by_entity[index_str] = FileTimestamps(metadata_json) 92 | except Exception as e: 93 | logging.info(str(e)) 94 | shards_timestamps_by_entity[index_str] = FileTimestamps() 95 | self.shards_timestamps[entity] = shards_timestamps_by_entity 96 | 97 | def update_db_max_timestamp(self, db_max_timestamp): 98 | self.index_db_timestamps.update_max_timestamp(db_max_timestamp) 99 | 100 | def update_db_last_updated(self, db_last_updated): 101 | self.index_db_timestamps.update_last_updated(db_last_updated) 102 | 103 | def update_processing_counter(self, counter_dict): 104 | self.elements_counter.update(counter_dict) 105 | 106 | def get_min_history_results_last_updated_timestamp(self): 107 | return min([min([shard_timestamps.last_updated for shard_index_str, shard_timestamps in 108 | shards_timestamps_by_entity.items()]) for entity, shards_timestamps_by_entity in 109 | self.shards_timestamps.items()]) 110 | 111 | def get_history_results_max_timestamps(self): 112 | last_elements_timestamps = {} 113 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 114 | last_elements_timestamps[entity] = {shard_index_str: shard_timestamps.max_timestamp for 115 | shard_index_str, shard_timestamps in 116 | shards_timestamps_by_entity.items()} 117 | return last_elements_timestamps 118 | 119 | def update_history_result_timestamps(self, entity_type, shard_index): 120 | self.shards_timestamps[entity_type][str(shard_index)].update_max_timestamp( 121 | self.index_db_timestamps.max_timestamp) 122 | self.shards_timestamps[entity_type][str(shard_index)].update_last_updated( 123 | int(time.time())) 124 | 125 | def save_to_json_files(self, specific_history_results_shards_to_save=None, only_db_metadata=False): 126 | files_to_save = [] 127 | if not specific_history_results_shards_to_save: 128 | self.save_db_metadata(files_to_save) 129 | if not only_db_metadata: 130 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 131 | for shard_index_str, shard_timestamps in shards_timestamps_by_entity.items(): 132 | if not specific_history_results_shards_to_save or ( 133 | entity == specific_history_results_shards_to_save[0] and int(shard_index_str) == 134 | int(specific_history_results_shards_to_save[1])): 135 | shard_file = self.shards_metadata_files[entity][shard_index_str] 136 | with open(shard_file, "w") as f: 137 | json.dump(shard_timestamps.to_dict(), f) 138 | files_to_save.append(shard_file) 139 | return files_to_save 140 | 141 | def save_db_metadata(self, files_to_save): 142 | with open(self.index_metadata_file_path, "w") as f: 143 | json.dump({"elements_counter": self.elements_counter.to_dict(), 144 | "index_db": self.index_db_timestamps.to_dict()}, f) 145 | files_to_save.append(self.index_metadata_file_path) 146 | return files_to_save 147 | 148 | def to_dict(self): 149 | history_results = {} 150 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 151 | history_results[entity] = {shard_index_str: shard_timestamps.to_dict() for shard_index_str, shard_timestamps 152 | in 153 | shards_timestamps_by_entity.items()} 154 | return {"elements_counter": self.elements_counter.to_dict(), 155 | "index_db": self.index_db_timestamps.to_dict(), 156 | "history_results": history_results} 157 | 158 | 159 | class MetadataCounter(object): 160 | 161 | def __init__(self, counter_dict=None): 162 | if counter_dict: 163 | self.counter = {entity: counter_dict[entity] for entity in OSM_ENTITIES} 164 | else: 165 | self.counter = {entity: 0 for entity in OSM_ENTITIES} 166 | 
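# Usage sketch (illustrative, values hypothetical): update() performs a wholesale
# replacement of the counter dict rather than incrementing it, so callers
# accumulate counts themselves and push the finished dict in one call:
#
#   mc = MetadataCounter()                                # {"nodes": 0, "ways": 0, "relations": 0}
#   mc.update({"nodes": 10, "ways": 2, "relations": 1})   # replaces the whole dict
#   assert mc.to_dict() == {"nodes": 10, "ways": 2, "relations": 1}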
167 | def update(self, counter): 168 | self.counter = counter 169 | 170 | def to_dict(self): 171 | return self.counter 172 | 173 | 174 | class FileTimestamps(object): 175 | 176 | def __init__(self, timestamps_dict=None): 177 | if timestamps_dict: 178 | self.max_timestamp = timestamps_dict["max_timestamp"] 179 | self.last_updated = timestamps_dict["last_updated"] 180 | else: 181 | self.max_timestamp = 0 182 | self.last_updated = 0 183 | 184 | def update_max_timestamp(self, max_timestamp): 185 | self.max_timestamp = max_timestamp 186 | 187 | def update_last_updated(self, last_updated): 188 | self.last_updated = last_updated 189 | 190 | def to_dict(self): 191 | return {"max_timestamp": self.max_timestamp, "last_updated": self.last_updated} 192 | -------------------------------------------------------------------------------- /examples/clustering/cities/cities.csv: -------------------------------------------------------------------------------- 1 | Name,Class,Latitude,Longitude,Radius 2 | London,Alpha++,51.497084,-0.133168,27700 3 | New York City,Alpha++,40.712728,-74.006015,25000 4 | Beijing,Alpha+,39.906217,116.391276,30000 5 | Dubai,Alpha+,25.20474,55.270707,45000 6 | Hong Kong,Alpha+,22.311106,114.183065,5350 7 | Paris,Alpha+,48.856697,2.351462,10000 8 | Shanghai,Alpha+,31.269942,121.543961,60000 9 | Singapore,Alpha+,1.399896,103.800868,27150 10 | Sydney,Alpha+,-33.834302,151.002299,31800 11 | Tokyo,Alpha+,35.606673,139.679901,50200 12 | Bangkok,Alpha,13.748917,100.526046,27700 13 | Brussels,Alpha,50.864711,4.382204,10000 14 | Buenos Aires,Alpha,-34.6409,-58.529138,36260 15 | Chicago,Alpha,41.853047,-87.677981,20600 16 | Frankfurt,Alpha,50.110644,8.682092,8200 17 | Guangzhou,Alpha,23.130196,113.259294,15000 18 | Istanbul,Alpha,41.086265,28.984227,36400 19 | Jakarta,Alpha,-6.220461,106.827184,30000 20 | Kuala Lumpur,Alpha,3.154439,101.672264,13500 21 | Los Angeles,Alpha,34.053691,-118.242767,40000 22 | Madrid,Alpha,40.416705,-3.703582,18300 23 | Melbourne,Alpha,-37.814218,144.963161,35700 24 | Mexico City,Alpha,19.431334,-99.085799,24500 25 | Miami,Alpha,26.136567,-80.210505,87000 26 | Milan,Alpha,45.4668,9.1905,12000 27 | Moscow,Alpha,55.739994,37.614064,20000 28 | Mumbai,Alpha,19.109836,72.87991,23500 29 | São Paulo,Alpha,-23.571414,-46.611434,33200 30 | Seoul,Alpha,37.554146,126.939154,23000 31 | Taipei,Alpha,25.031295,121.509434,12800 32 | Toronto,Alpha,43.629414,-79.57349,40600 33 | Warsaw,Alpha,52.225226,21.019777,15000 34 | Zürich,Alpha,47.376774,8.531428,8500 35 | Amsterdam,Alpha−,52.371185,4.87738,11000 36 | Barcelona,Alpha−,41.367435,2.14035,11500 37 | Bogotá,Alpha−,4.638176,-74.104699,15700 38 | Budapest,Alpha−,47.486316,19.102289,13200 39 | Dublin,Alpha−,53.349764,-6.260273,10000 40 | Houston,Alpha−,29.806496,-95.366273,41000 41 | Johannesburg,Alpha−,-26.205,28.049722,27100 42 | Lisbon,Alpha−,38.776836,-9.180555,10000 43 | Luxembourg City,Alpha−,49.622753,6.119711,7600 44 | Manila,Alpha−,14.589404,121.00706,30000 45 | Montreal,Alpha−,45.523191,-73.607643,20000 46 | Munich,Alpha−,48.137108,11.575382,12700 47 | New Delhi,Alpha−,28.590348,77.239942,35000 48 | Prague,Alpha−,50.087238,14.444594,15000 49 | Riyadh,Alpha−,24.658759,46.718812,26200 50 | Rome,Alpha−,41.891275,12.493922,11000 51 | San Francisco,Alpha−,37.59455,-122.07066,51200 52 | Santiago,Alpha−,-33.471731,-70.655087,18800 53 | Shenzhen,Alpha−,22.658862,113.985722,25500 54 | Stockholm,Alpha−,59.325117,18.071093,8600 55 | Vienna,Alpha−,48.206523,16.401343,13150 56 | "Washington, D.C.",Alpha−,38.892663,-77.021701,7500 57 | 
Athens,Beta+,38.007191,23.736531,13000 58 | Atlanta,Beta+,33.749099,-84.390185,10000 59 | Auckland,Beta+,-36.850995,174.790646,18700 60 | Bangalore,Beta+,12.970415,77.598166,16600 61 | Boston,Beta+,42.366592,-71.085065,18300 62 | Bucharest,Beta+,44.43957,26.102027,11000 63 | Cairo,Beta+,30.048819,31.243666,42700 64 | Chengdu,Beta+,30.66242,104.063322,26700 65 | Copenhagen,Beta+,55.665856,12.48629,15500 66 | Dallas,Beta+,32.800537,-96.975442,50800 67 | Doha,Beta+,25.262347,51.486612,17500 68 | Düsseldorf,Beta+,51.225402,6.776314,10000 69 | Hamburg,Beta+,53.550341,10.000654,15000 70 | Hangzhou,Beta+,30.179977,120.306829,41200 71 | Hanoi,Beta+,20.984588,105.860798,33000 72 | Ho Chi Minh City,Beta+,10.854796,106.713898,36800 73 | Lima,Beta+,-12.064102,-77.038591,23700 74 | Perth,Beta+,-31.952712,115.86048,30000 75 | Tel Aviv,Beta+,32.039458,34.835288,18700 76 | Vancouver,Beta+,49.152955,-122.54212,55100 77 | Abu Dhabi,Beta,24.478113,54.638329,35700 78 | Beirut,Beta,33.867536,35.544112,12200 79 | Berlin,Beta,52.521147,13.390237,23400 80 | Brisbane,Beta,-27.509236,153.01936,40300 81 | Calgary,Beta,51.023207,-114.062588,20500 82 | Cape Town,Beta,-33.946082,18.596636,21900 83 | Caracas,Beta,10.472662,-66.884744,17400 84 | Casablanca,Beta,33.541338,-7.496539,23700 85 | Chennai,Beta,13.049404,80.228901,18600 86 | Denver,Beta,39.729745,-104.947775,36200 87 | Karachi,Beta,24.891087,67.097051,21600 88 | Kyiv,Beta,50.422067,30.51054,15600 89 | Kuwait City,Beta,29.159419,47.920013,37000 90 | Lagos,Beta,6.643991,3.369447,31700 91 | Manama,Beta,26.179761,50.571947,16900 92 | Minneapolis,Beta,44.926914,-93.235285,36100 93 | Montevideo,Beta,-34.803526,-56.10077,21300 94 | Nairobi,Beta,-1.283439,36.836267,24700 95 | Nanjing,Beta,32.095898,118.809465,31700 96 | Oslo,Beta,59.87104,10.762376,19900 97 | Philadelphia,Beta,39.93799,-75.156659,32400 98 | Rio de Janeiro,Beta,-22.860427,-43.285597,51200 99 | Sofia,Beta,42.684239,23.333859,10800 100 | Tianjin,Beta,39.051002,117.382345,44000 101 | Wuhan,Beta,30.561002,114.315305,26600 102 | Zagreb,Beta,45.804714,15.970193,10000 103 | Almaty,Beta−,43.271979,76.90453,11500 104 | Antwerp,Beta−,51.270318,4.34782,13700 105 | Belgrade,Beta−,44.814647,20.426678,9100 106 | Birmingham,Beta−,52.49245,-1.882436,5400 107 | Bratislava,Beta−,48.148836,17.137814,7300 108 | Changsha,Beta−,28.207032,112.987807,17400 109 | Chongqing,Beta−,29.570495,106.547904,20800 110 | Dalian,Beta−,38.999537,121.717205,21600 111 | Dhaka,Beta−,23.800602,90.409042,11800 112 | Edinburgh,Beta−,55.941777,-3.206826,7300 113 | Geneva,Beta−,46.216487,6.136297,6800 114 | George Town,Beta−,5.410574,100.312495,5800 115 | Helsinki,Beta−,60.213146,24.904762,16800 116 | Jeddah,Beta−,21.559918,39.181155,31300 117 | Jinan,Beta−,36.675486,117.02887,14100 118 | Kampala,Beta−,0.322524,32.59784,14400 119 | Lyon,Beta−,45.747499,4.878689,12500 120 | Manchester,Beta−,53.48285,-2.232186,14800 121 | Monterrey,Beta−,25.723503,-100.307021,22200 122 | Nicosia,Beta−,35.159334,33.355705,10100 123 | Panama City,Beta−,9.041305,-79.463458,13800 124 | Port Louis,Beta−,-20.226562,57.506832,12400 125 | Qingdao,Beta−,36.126453,120.306022,26500 126 | Quito,Beta−,-0.208148,-78.491728,19000 127 | San José,Beta−,9.955537,-84.118714,17100 128 | San Juan,Beta−,18.400626,-66.096104,15500 129 | San Salvador,Beta−,13.714335,-89.182499,14900 130 | Seattle,Beta−,47.598014,-122.200917,69500 131 | Shenyang,Beta−,41.784356,123.454416,24400 132 | Stuttgart,Beta−,48.795404,9.196154,9100 133 | Suzhou,Beta−,31.283432,120.635324,25400 134 | 
Tunis,Beta−,36.789929,10.217612,16700 135 | Valencia,Beta−,39.469108,-0.407841,16600 136 | Xiamen,Beta−,24.491775,118.059237,18200 137 | Accra,Gamma+,5.631676,-0.212893,37600 138 | Adelaide,Gamma+,-34.915843,138.549128,34400 139 | Cleveland,Gamma+,41.41456,-81.652182,28700 140 | Colombo,Gamma+,6.912506,79.901911,19700 141 | Dar es Salaam,Gamma+,-6.842749,39.235055,16000 142 | Detroit,Gamma+,42.406132,-83.125588,39300 143 | Glasgow,Gamma+,55.857124,-4.23721,17200 144 | Guatemala City,Gamma+,14.569075,-90.545258,17400 145 | Guayaquil,Gamma+,-2.151989,-79.900603,15600 146 | Harare,Gamma+,-17.831774,31.045,16100 147 | Hyderabad,Gamma+,25.389097,68.321651,9200 148 | Lahore,Gamma+,31.525877,74.312124,19600 149 | Muscat,Gamma+,23.570524,58.381711,16100 150 | Osaka,Gamma+,34.723985,135.212998,67300 151 | Pune,Gamma+,18.593798,73.844187,21100 152 | Riga,Gamma+,56.950147,24.136084,11600 153 | Rotterdam,Gamma+,51.869146,4.486787,18400 154 | Xi'an,Gamma+,34.30514,108.898319,23900 155 | Zhengzhou,Gamma+,34.758051,113.651027,19729 156 | Ahmedabad,Gamma,23.042155,72.595156,11700 157 | Algiers,Gamma,36.77151,3.11203,14700 158 | Amman,Gamma,32.006863,35.996048,20400 159 | Ankara,Gamma,39.94147,32.771183,19600 160 | Asunción,Gamma,-25.334873,-57.454647,23200 161 | Austin,Gamma,30.402545,-97.695763,36200 162 | Baku,Gamma,40.41852,49.880112,12200 163 | Baltimore,Gamma,39.301992,-76.651303,25700 164 | Belfast,Gamma,54.591213,-5.956015,12500 165 | Bilbao,Gamma,43.289994,-2.977223,13400 166 | Bristol,Gamma,51.478047,-2.596283,9900 167 | Charlotte,Gamma,35.217061,-80.798446,18500 168 | Guadalajara,Gamma,20.635282,-103.367083,18000 169 | Hefei,Gamma,31.834972,117.283132,20200 170 | Islamabad,Gamma,33.613946,73.084453,15200 171 | Kolkata,Gamma,22.588522,88.365255,16100 172 | Kunming,Gamma,24.966022,102.755536,20000 173 | La Paz,Gamma,-16.509127,-68.172818,7800 174 | Ljubljana,Gamma,46.070454,14.518177,8100 175 | Luanda,Gamma,-8.954777,13.278967,27100 176 | Lusaka,Gamma,-15.392474,28.359294,16100 177 | Phoenix,Gamma,33.48492,-112.028775,51000 178 | Porto,Gamma,41.463435,-7.853176,16100 179 | Saint Petersburg,Gamma,59.942837,30.288084,17400 180 | San Diego,Gamma,32.766346,-117.078526,24575 181 | San Jose,Gamma,37.366646,-121.954236,23300 182 | Santo Domingo,Gamma,18.491268,-69.906053,18200 183 | St. 
Louis,Gamma,38.712435,-90.383864,49400 184 | Taiyuan,Gamma,37.810584,112.591887,23400 185 | Tallinn,Gamma,59.424525,24.795728,15800 186 | Tampa,Gamma,27.987801,-82.397335,22300 187 | Tbilisi,Gamma,41.719537,44.831698,16100 188 | Tegucigalpa,Gamma,14.073792,-87.202263,7900 189 | Turin,Gamma,45.071154,7.662587,12800 190 | Vilnius,Gamma,54.689817,25.272954,7600 191 | Wellington,Gamma,-41.276648,174.782394,8700 192 | Belo Horizonte,Gamma−,-19.899461,-44.027494,20800 193 | Cologne,Gamma−,50.934034,6.962721,15800 194 | Curitiba,Gamma−,-25.503295,-49.212235,25300 195 | Durban,Gamma−,-29.845191,30.96253,22200 196 | Fuzhou,Gamma−,27.951573,116.359736,8400 197 | Johor Bahru,Gamma−,1.578175,103.699511,16800 198 | Maputo,Gamma−,-25.817914,32.591492,29700 199 | Medellín,Gamma−,6.249858,-75.577845,14400 200 | Milwaukee,Gamma−,43.04401,-88.014503,24800 201 | Minsk,Gamma−,53.890378,27.57764,12500 202 | Nantes,Gamma−,47.217697,-1.566498,13600 203 | Nashville,Gamma−,36.188744,-86.638442,35800 204 | Orlando,Gamma−,28.510731,-81.380408,38600 205 | Ottawa,Gamma−,45.386109,-75.721147,25700 206 | Penang,Gamma−,5.422595,100.508378,36100 207 | Phnom Penh,Gamma−,11.550778,104.8888,13200 208 | Poznań,Gamma−,52.410371,16.929402,13000 209 | Sacramento,Gamma−,38.62148,-121.310583,29900 210 | San Antonio,Gamma−,29.429281,-98.484132,29200 211 | Tirana,Gamma−,41.338097,19.777162,10100 212 | Wrocław,Gamma−,51.117738,17.037314,8700 213 | Yangon,Gamma−,16.901835,96.1555,16600 -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/elements_processing.py: -------------------------------------------------------------------------------- 1 | import json 2 | import hashlib 3 | 4 | 5 | def generate_complex_id(obj_dict): 6 | return "{}_{}".format(obj_dict["id"], obj_dict["version"]) 7 | 8 | 9 | def get_uniformly_shard_index_from_id(id, num_shards): 10 | return int(hashlib.md5(str(id).encode("utf-8")).hexdigest(), 16) % num_shards 11 | 12 | 13 | class IdManager(object): 14 | 15 | def __init__(self): 16 | self.relation_id_map = {} 17 | self.way_id_map = {} 18 | self.node_id_map = {} 19 | 20 | self.id_counter = 1 21 | 22 | def replace_ids_in_way_and_its_dependencies(self, way_dict, way_nodes_dicts): 23 | self.replace_ids_in_obj_list([way_dict], self.way_id_map) 24 | local_id_map = self.replace_ids_in_obj_list(way_nodes_dicts, self.node_id_map) 25 | way_dict["nodes"] = [local_id_map[node_id] if node_id in local_id_map else node_id 26 | for node_id in way_dict["nodes"]] 27 | 28 | def replace_ids_in_relation_and_its_dependencies(self, relation_dict, relation_nodes_dicts, 29 | relation_ways_dicts, relation_relations_dicts): 30 | self.replace_ids_in_obj_list([relation_dict], self.relation_id_map) 31 | nodes_local_id_map = self.replace_ids_in_obj_list(relation_nodes_dicts, self.node_id_map) 32 | ways_local_id_map = self.replace_ids_in_obj_list(relation_ways_dicts, self.way_id_map) 33 | relations_local_id_map = self.replace_ids_in_obj_list(relation_relations_dicts, self.relation_id_map) 34 | 35 | for relation_way_dict in relation_ways_dicts: 36 | self.replace_ids_in_way_nodes(relation_way_dict, nodes_local_id_map) 37 | for relation_relation_dict in relation_relations_dicts: 38 | self.replace_ids_in_relation_members(relation_relation_dict, nodes_local_id_map, 39 | ways_local_id_map, relations_local_id_map) 40 | self.replace_ids_in_relation_members(relation_dict, nodes_local_id_map, 41 | ways_local_id_map, relations_local_id_map) 42 | 43 | def 
replace_ids_in_way_nodes(self, way_dict, id_map): 44 | way_dict["nodes"] = [id_map[node_id] if node_id in id_map else node_id for node_id in way_dict["nodes"]] 45 | 46 | def replace_ids_in_relation_members(self, relation_dict, nodes_id_map, ways_id_map, relations_id_map): 47 | for index in range(len(relation_dict["members"])): 48 | member = relation_dict["members"][index] 49 | member_type, member_id, member_role = member 50 | if member_type == "n": 51 | relation_dict["members"][index] = (member_type, 52 | nodes_id_map[member_id] if member_id in nodes_id_map else member_id, member_role) 53 | elif member_type == "w": 54 | relation_dict["members"][index] = (member_type, 55 | ways_id_map[member_id] if member_id in ways_id_map else member_id, member_role) 56 | elif member_type == "r": 57 | relation_dict["members"][index] = (member_type, 58 | relations_id_map[member_id] if member_id in relations_id_map else member_id, member_role) 59 | 60 | def replace_ids_in_obj_list(self, osm_obj_dicts, id_map): 61 | local_id_map = {} 62 | for osm_obj_dict in osm_obj_dicts: 63 | osm_obj_complex_id = generate_complex_id(osm_obj_dict) 64 | if osm_obj_complex_id in id_map: 65 | local_id_map[osm_obj_dict["id"]] = id_map[osm_obj_complex_id] 66 | osm_obj_dict["id"] = id_map[osm_obj_complex_id] 67 | else: 68 | id_map[osm_obj_complex_id] = self.id_counter 69 | local_id_map[osm_obj_dict["id"]] = self.id_counter 70 | osm_obj_dict["id"] = self.id_counter 71 | self.id_counter = self.id_counter + 1 72 | return local_id_map 73 | 74 | def get_simplified_id_and_original_id_maps(self): 75 | result_relations_ids_map = {simple_id: int(complex_id.split("_")[0]) 76 | for complex_id, simple_id in self.relation_id_map.items()} 77 | result_ways_ids_map = {simple_id: int(complex_id.split("_")[0]) 78 | for complex_id, simple_id in self.way_id_map.items()} 79 | result_nodes_ids_map = {simple_id: int(complex_id.split("_")[0]) 80 | for complex_id, simple_id in self.node_id_map.items()} 81 | return result_nodes_ids_map, result_ways_ids_map, result_relations_ids_map 82 | 83 | def reset(self): 84 | self.relation_id_map.clear() 85 | self.way_id_map.clear() 86 | self.node_id_map.clear() 87 | self.id_counter = 1 88 | 89 | 90 | class BatchManager(object): 91 | 92 | def __init__(self, gdal_batch_size, entities_number): 93 | self.entities_number = entities_number 94 | 95 | self.nodes_batch = {} 96 | self.ways_batch = {} 97 | self.all_relations_batch = {} 98 | self.main_relation_batch = {} 99 | self.ways_batch_counter = 0 100 | self.gdal_batch_size = gdal_batch_size 101 | 102 | self.id_manager = IdManager() 103 | 104 | def add_osm_dicts_to_batches(self, node_dicts_list=(), way_dicts_list=(), relation_dicts_list=(),  # immutable defaults avoid the shared mutable default argument pitfall 105 | main_relation_dict=None): 106 | for node_dict in node_dicts_list: 107 | self.nodes_batch[node_dict["id"]] = node_dict 108 | for way_dict in way_dicts_list: 109 | self.ways_batch[way_dict["id"]] = way_dict 110 | for relation_dict in relation_dicts_list: 111 | self.all_relations_batch[relation_dict["id"]] = relation_dict 112 | if main_relation_dict is not None: 113 | self.all_relations_batch[main_relation_dict["id"]] = main_relation_dict 114 | self.main_relation_batch[main_relation_dict["id"]] = main_relation_dict 115 | self.ways_batch_counter = self.ways_batch_counter + 1 116 | 117 | def sorted_obj_batch_values(self, obj_batch): 118 | return sorted(list(obj_batch.values()), key=lambda obj: obj["id"]) 119 | 120 | def get_batches_values_sorted_lists(self): 121 | return self.sorted_obj_batch_values(self.nodes_batch), \ 122 | self.sorted_obj_batch_values(self.ways_batch), \ 123 | self.sorted_obj_batch_values(self.all_relations_batch) 124 | 
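# Usage sketch (illustrative; ids and sizes are hypothetical): BatchManager
# delegates id simplification to IdManager, which maps each versioned element
# "<id>_<version>" to a small sequential id so GDAL can work on compact .osm
# batches, then inverts the mapping to restore the original OSM ids:
#
#   bm = BatchManager(gdal_batch_size=1000, entities_number=10)
#   way = {"id": 123456789, "version": 2, "nodes": []}
#   bm.replace_ids_in_way_and_its_dependencies(way, [])
#   assert way["id"] == 1  # first simplified id handed out
#   _, ways_map, _ = bm.get_simplified_id_and_original_id_maps()
#   assert ways_map[1] == 123456789  # original id recovered from "123456789_2"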
125 | def get_ways_simplified_ids(self): 126 | return list(self.ways_batch.keys()) 127 | 128 | def get_main_relations_simplified_ids(self): 129 | return list(self.main_relation_batch.keys()) 130 | 131 | def get_simplified_id_and_original_id_maps(self): 132 | return self.id_manager.get_simplified_id_and_original_id_maps() 133 | 134 | def replace_ids_in_way_and_its_dependencies(self, way_dict, way_nodes_dicts): 135 | self.id_manager.replace_ids_in_way_and_its_dependencies(way_dict, way_nodes_dicts) 136 | 137 | def replace_ids_in_relation_and_its_dependencies(self, relation_dict, relation_nodes_dicts, 138 | relation_ways_dicts, relation_relations_dicts): 139 | self.id_manager.replace_ids_in_relation_and_its_dependencies(relation_dict, relation_nodes_dicts, 140 | relation_ways_dicts, relation_relations_dicts) 141 | 142 | def restore_ways_ids_and_add_geometry(self, id_geometry_map, result_func): 143 | result_nodes_ids_map, result_ways_ids_map, _ = self.id_manager.get_simplified_id_and_original_id_maps() 144 | 145 | for way_dict_id, way_dict in self.ways_batch.items(): 146 | if way_dict["id"] in id_geometry_map: 147 | way_dict["geometry"] = json.dumps(id_geometry_map[way_dict["id"]]) 148 | way_dict["id"] = result_ways_ids_map[way_dict["id"]] 149 | way_dict["nodes"] = [result_nodes_ids_map[node_id] if node_id in result_nodes_ids_map else node_id for 150 | node_id in way_dict["nodes"]] 151 | result_func(way_dict) 152 | 153 | def restore_relations_ids_and_add_geometry(self, id_geometry_map, result_func): 154 | result_nodes_ids_map, result_ways_ids_map, result_relations_ids_map = \ 155 | self.id_manager.get_simplified_id_and_original_id_maps() 156 | 157 | for relation_dict_id, relation_dict in self.main_relation_batch.items(): 158 | if relation_dict["id"] in id_geometry_map: 159 | relation_dict["geometry"] = json.dumps(id_geometry_map[relation_dict["id"]]) 160 | relation_dict["id"] = result_relations_ids_map[relation_dict["id"]] 161 | self.id_manager.replace_ids_in_relation_members(relation_dict, result_nodes_ids_map, 162 | result_ways_ids_map, result_relations_ids_map) 163 | result_func(relation_dict) 164 | 165 | def generate_batch_osm_file_name(self, work_dir, current_entity_type, current_index, pool_size): 166 | batch_end = current_index 167 | batch_start = batch_end - (self.get_batch_limit_for_current_entity(current_entity_type)*pool_size) 168 | return work_dir + '{}_{}_{}.osm'.format(current_entity_type, batch_start, batch_end) 169 | 170 | def is_full(self, entity_type): 171 | return self.ways_batch_counter >= self.get_batch_limit_for_current_entity(entity_type) 172 | 173 | def get_batch_limit_for_current_entity(self, entity_type): 174 | return self.gdal_batch_size if entity_type != "relations" else self.gdal_batch_size // 2  # floor division keeps limits and batch file names integral 175 | 176 | def reset(self): 177 | self.ways_batch_counter = 0 178 | self.nodes_batch.clear() 179 | self.ways_batch.clear() 180 | self.all_relations_batch.clear() 181 | self.main_relation_batch.clear() 182 | self.id_manager.reset() 183 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/cache_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import datetime 4 | import time 5 | 6 | import gcs_service 7 | import file_service 8 | 9 | OSM_ENTITIES = ["nodes", "ways", "relations"] 10 | 11 | 12 | 
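# This module largely duplicates dags/utils/metadata_manager.py, presumably so the
# converter image can read and write the same *.metadata.txt files without depending
# on the Airflow utils package (an inference from the two files, not documented).
# Freshness-check sketch with hypothetical values:
#
#   last_updated = int(time.time()) - 2 * 24 * 3600          # written two days ago
#   is_file_fresh(last_updated, data_freshness_exp_days=7)   # -> True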
def create_processing_counter(): 13 | return {key: 0 for key in OSM_ENTITIES} 14 | 15 | 16 | def is_file_fresh(last_updated_timestamp, data_freshness_exp_days): 17 | db_freshness = datetime.timedelta(seconds=int(time.time()) - last_updated_timestamp) 18 | logging.info("Freshness: {}".format(db_freshness)) 19 | return db_freshness < datetime.timedelta(days=data_freshness_exp_days) 20 | 21 | 22 | def download_db_if_exists(dbs_file_paths, 23 | dest_bucket, 24 | dest_dir_name): 25 | db_gcs_and_local_paths = [] 26 | for db_file_path in dbs_file_paths: 27 | db_name = file_service.file_name_from_path(db_file_path) 28 | db_blob_name = dest_dir_name + db_name 29 | 30 | if not gcs_service.is_gcs_blob_exists(dest_bucket, db_blob_name): 31 | return False 32 | else: 33 | db_gcs_and_local_paths.append((db_blob_name, db_file_path)) 34 | 35 | for db_blob_name, db_file_path in db_gcs_and_local_paths: 36 | gcs_service.from_gcs_to_local_file(dest_bucket, db_blob_name, db_file_path) 37 | return True 38 | 39 | 40 | def get_index_metadata_file_path(src_osm_name, num_db_shards): 41 | return file_service.file_name_without_ext(src_osm_name) + "_{}_index_shards.metadata.txt".format(num_db_shards) 42 | 43 | 44 | def get_result_shard_metadata_file_path(src_osm_name, entity_type, index, num_results_shards): 45 | return file_service.file_name_without_ext(src_osm_name) + "_{}_{}_{}.metadata.txt".format(entity_type, index + 1, 46 | num_results_shards) 47 | 48 | 49 | def download_and_read_metadata_file(gcs_bucket, gcs_dir_name, src_osm_name, num_db_shards, num_results_shards): 50 | src_osm_file_name = file_service.file_name_from_path(src_osm_name) 51 | 52 | index_metadata_file_path = get_index_metadata_file_path(src_osm_file_name, num_db_shards) 53 | index_metadata_blob_name = gcs_dir_name + index_metadata_file_path 54 | if gcs_service.is_gcs_blob_exists(gcs_bucket, index_metadata_blob_name): 55 | gcs_service.from_gcs_to_local_file(gcs_bucket, index_metadata_blob_name, index_metadata_file_path) 56 | 57 | shards_metadata_files = {} 58 | for entity in OSM_ENTITIES: 59 | shards_metadata_files_by_entity = {} 60 | for index in range(num_results_shards): 61 | result_shard_metadata_file_path = get_result_shard_metadata_file_path(src_osm_file_name, entity, index, 62 | num_results_shards) 63 | result_shard_metadata_blob_name = gcs_dir_name + result_shard_metadata_file_path 64 | if gcs_service.is_gcs_blob_exists(gcs_bucket, result_shard_metadata_blob_name): 65 | gcs_service.from_gcs_to_local_file(gcs_bucket, result_shard_metadata_blob_name, 66 | result_shard_metadata_file_path) 67 | shards_metadata_files_by_entity[str(index)] = result_shard_metadata_file_path 68 | shards_metadata_files[entity] = shards_metadata_files_by_entity 69 | return ProcessingMetadata(index_metadata_file_path, shards_metadata_files) 70 | 71 | 72 | def save_and_upload_metadata_to_gcs(metadata, 73 | dest_bucket, 74 | dest_dir_name, 75 | save_only_shard_by_entity_and_index=None, 76 | only_db_metadata=False): 77 | files_to_save = metadata.save_to_json_files(save_only_shard_by_entity_and_index, only_db_metadata) 78 | 79 | for file_to_save in files_to_save: 80 | timestamps_file_name = file_service.file_name_from_path(file_to_save) 81 | timestamps_file_blob_name = dest_dir_name + timestamps_file_name 82 | gcs_service.upload_file_to_gcs(file_to_save, dest_bucket, timestamps_file_blob_name) 83 | 84 | 85 | class ProcessingMetadata(object): 86 | 87 | def __init__(self, index_metadata_file_path, shards_metadata_files): 88 | self.index_metadata_file_path = 
index_metadata_file_path 89 | self.shards_metadata_files = shards_metadata_files 90 | try: 91 | with open(index_metadata_file_path, "r") as f: 92 | metadata_json = json.load(f) 93 | self.elements_counter = MetadataCounter(metadata_json["elements_counter"]) 94 | self.index_db_timestamps = FileTimestamps(metadata_json["index_db"]) 95 | except Exception as e: 96 | logging.info(str(e)) 97 | self.elements_counter = MetadataCounter() 98 | self.index_db_timestamps = FileTimestamps() 99 | 100 | self.shards_timestamps = {} 101 | for entity, shards_metadata_files_by_entity in shards_metadata_files.items(): 102 | shards_timestamps_by_entity = {} 103 | for index_str, shards_metadata_file in shards_metadata_files_by_entity.items(): 104 | try: 105 | with open(shards_metadata_file, "r") as f: 106 | metadata_json = json.load(f) 107 | shards_timestamps_by_entity[index_str] = FileTimestamps(metadata_json) 108 | except Exception as e: 109 | logging.info(str(e)) 110 | shards_timestamps_by_entity[index_str] = FileTimestamps() 111 | self.shards_timestamps[entity] = shards_timestamps_by_entity 112 | 113 | def update_db_max_timestamp(self, db_max_timestamp): 114 | self.index_db_timestamps.update_max_timestamp(db_max_timestamp) 115 | 116 | def update_db_last_updated(self, db_last_updated): 117 | self.index_db_timestamps.update_last_updated(db_last_updated) 118 | 119 | def update_processing_counter(self, counter_dict): 120 | self.elements_counter.update(counter_dict) 121 | 122 | def get_min_history_results_last_updated_timestamp(self): 123 | return min([min([shard_timestamps.last_updated for shard_index_str, shard_timestamps in 124 | shards_timestamps_by_entity.items()]) for entity, shards_timestamps_by_entity in 125 | self.shards_timestamps.items()]) 126 | 127 | def get_history_results_max_timestamps(self): 128 | last_elements_timestamps = {} 129 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 130 | last_elements_timestamps[entity] = {shard_index_str: shard_timestamps.max_timestamp for 131 | shard_index_str, shard_timestamps in 132 | shards_timestamps_by_entity.items()} 133 | return last_elements_timestamps 134 | 135 | def update_history_result_timestamps(self, entity_type, shard_index): 136 | self.shards_timestamps[entity_type][str(shard_index)].update_max_timestamp( 137 | self.index_db_timestamps.max_timestamp) 138 | self.shards_timestamps[entity_type][str(shard_index)].update_last_updated( 139 | int(time.time())) 140 | 141 | def save_to_json_files(self, specific_history_results_shards_to_save=None, only_db_metadata=False): 142 | files_to_save = [] 143 | if not specific_history_results_shards_to_save: 144 | self.save_db_metadata(files_to_save) 145 | if not only_db_metadata: 146 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 147 | for shard_index_str, shard_timestamps in shards_timestamps_by_entity.items(): 148 | if not specific_history_results_shards_to_save or ( 149 | entity == specific_history_results_shards_to_save[0] and int(shard_index_str) == 150 | int(specific_history_results_shards_to_save[1])): 151 | shard_file = self.shards_metadata_files[entity][shard_index_str] 152 | with open(shard_file, "w") as f: 153 | json.dump(shard_timestamps.to_dict(), f) 154 | files_to_save.append(shard_file) 155 | return files_to_save 156 | 157 | def save_db_metadata(self, files_to_save): 158 | with open(self.index_metadata_file_path, "w") as f: 159 | json.dump({"elements_counter": self.elements_counter.to_dict(), 160 | "index_db": 
self.index_db_timestamps.to_dict()}, f) 161 | files_to_save.append(self.index_metadata_file_path) 162 | return files_to_save 163 | 164 | def to_dict(self): 165 | history_results = {} 166 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 167 | history_results[entity] = {shard_index_str: shard_timestamps.to_dict() for shard_index_str, shard_timestamps 168 | in 169 | shards_timestamps_by_entity.items()} 170 | return {"elements_counter": self.elements_counter.to_dict(), 171 | "index_db": self.index_db_timestamps.to_dict(), 172 | "history_results": history_results} 173 | 174 | 175 | class MetadataCounter(object): 176 | 177 | def __init__(self, counter_dict=None): 178 | if counter_dict: 179 | self.counter = {entity: counter_dict[entity] for entity in OSM_ENTITIES} 180 | else: 181 | self.counter = {entity: 0 for entity in OSM_ENTITIES} 182 | 183 | def update(self, counter): 184 | self.counter = counter 185 | 186 | def to_dict(self): 187 | return self.counter 188 | 189 | 190 | class FileTimestamps(object): 191 | 192 | def __init__(self, timestamps_dict=None): 193 | if timestamps_dict: 194 | self.max_timestamp = timestamps_dict["max_timestamp"] 195 | self.last_updated = timestamps_dict["last_updated"] 196 | else: 197 | self.max_timestamp = 0 198 | self.last_updated = 0 199 | 200 | def update_max_timestamp(self, max_timestamp): 201 | self.max_timestamp = max_timestamp 202 | 203 | def update_last_updated(self, last_updated): 204 | self.last_updated = last_updated 205 | 206 | def to_dict(self): 207 | return {"max_timestamp": self.max_timestamp, "last_updated": self.last_updated} 208 | --------------------------------------------------------------------------------