├── dags
│   ├── utils
│   │   ├── __init__.py
│   │   ├── bq_utils.py
│   │   ├── gcs_utils.py
│   │   └── metadata_manager.py
│   ├── sql
│   │   ├── create_features_part_format.sql
│   │   ├── join_relations_geometries.sql
│   │   ├── join_nodes_geometries.sql
│   │   └── join_ways_geometries.sql
│   ├── schemas
│   │   ├── simple_table_schema.json
│   │   ├── features_table_schema.json
│   │   ├── nodes_table_schema.json
│   │   ├── ways_table_schema.json
│   │   └── relations_table_schema.json
│   └── transfer_src_file.py
├── deployment
│   ├── config
│   │   ├── .gitignore
│   │   ├── set_env_vars.sh
│   │   ├── set_env_vars_from_config.sh
│   │   └── generate_config.py
│   ├── delete_dag.sh
│   ├── create_composer_env.sh
│   ├── upload_dags_files.sh
│   └── create_full.sh
├── examples
│   └── clustering
│       ├── .gitignore
│       ├── requirements.txt
│       ├── colors
│       │   ├── README.md
│       │   └── vectorize.py
│       ├── bq_udf
│       │   ├── geohash.sql
│       │   ├── metrics.js
│       │   ├── metrics.sql
│       │   └── geohash.js
│       ├── words
│       │   ├── w2v_generate_schema.py
│       │   ├── w2v_to_jsonl.py
│       │   └── README.md
│       ├── tf_idf
│       │   ├── create_model.py
│       │   ├── materialize.sql
│       │   ├── vectorize.sql
│       │   └── analyze.sql
│       └── cities
│           ├── query.py
│           ├── README.md
│           └── cities.csv
├── tasks_docker_images
│   ├── generate_layers
│   │   ├── keys
│   │   │   └── .gitignore
│   │   ├── README.md
│   │   ├── Dockerfile
│   │   └── src
│   │       ├── layered_gis
│   │       │   ├── 00_generate_queries.sh
│   │       │   ├── building
│   │       │   │   └── building.sh
│   │       │   ├── aeroway
│   │       │   │   └── aeroway.sh
│   │       │   ├── poi_money
│   │       │   │   └── poi_money.sh
│   │       │   ├── cycle_route_segment
│   │       │   │   └── cycle_route_segment.sh
│   │       │   ├── powerline
│   │       │   │   └── powerline.sh
│   │       │   ├── waterway
│   │       │   │   └── waterways.sh
│   │       │   ├── poi_health
│   │       │   │   └── poi_health.sh
│   │       │   ├── route
│   │       │   │   └── route.sh
│   │       │   ├── poi_catering
│   │       │   │   └── poi_catering.sh
│   │       │   ├── traffic_waterway
│   │       │   │   └── traffic_waterway.sh
│   │       │   ├── barrier
│   │       │   │   └── barrier.sh
│   │       │   ├── query_templates_history.sh
│   │       │   ├── 01_append_table.sh
│   │       │   ├── poi_accommodation
│   │       │   │   └── poi_accommodation.sh
│   │       │   ├── natural
│   │       │   │   └── natural.sh
│   │       │   ├── poi_destination
│   │       │   │   └── poi_destination.sh
│   │       │   ├── boundary
│   │       │   │   └── boundary.sh
│   │       │   ├── query_templates_planet.sh
│   │       │   ├── poi_leisure
│   │       │   │   └── poi_leisure.sh
│   │       │   ├── traffic_calming
│   │       │   │   └── traffic_calming.sh
│   │       │   ├── land_use
│   │       │   │   └── land_use.sh
│   │       │   ├── traffic
│   │       │   │   └── traffic.sh
│   │       │   ├── transport
│   │       │   │   └── transport.sh
│   │       │   ├── pofw
│   │       │   │   └── pofw.sh
│   │       │   ├── traffic_barrier
│   │       │   │   └── traffic_barrier.sh
│   │       │   ├── poi_public
│   │       │   │   └── poi_public.sh
│   │       │   ├── poi_shopping
│   │       │   │   └── poi_shopping.sh
│   │       │   ├── power
│   │       │   │   └── power.sh
│   │       │   ├── poi_tourism
│   │       │   │   └── poi_tourism.sh
│   │       │   ├── poi_miscpoi
│   │       │   │   └── poi_miscpoi.sh
│   │       │   └── place
│   │       │       └── place.sh
│   │       ├── run.sh
│   │       └── schema
│   │           └── layers_schema.json
│   ├── osm_to_features
│   │   ├── keys
│   │   │   └── .gitignore
│   │   ├── src
│   │   │   ├── download_osm.sh
│   │   │   ├── csv_to_json
│   │   │   │   ├── csv-to-json.sh
│   │   │   │   └── geojson-csv-to-json.pl
│   │   │   ├── osm_to_features.sh
│   │   │   ├── osmconf.ini
│   │   │   └── osm2geojsoncsv
│   │   ├── Dockerfile
│   │   └── utils
│   │       └── get_client_id.py
│   ├── osm_converter_with_history_index
│   │   ├── src
│   │   │   ├── gdal
│   │   │   │   ├── __init__.py
│   │   │   │   ├── run_ogr.sh
│   │   │   │   ├── osmconf.ini
│   │   │   │   └── gdal_handler.py
│   │   │   ├── file_service.py
│   │   │   ├── gcs_service.py
│   │   │   ├── elements_transformer.py
│   │   │   ├── parser.py
│   │   │   ├── osm_index.py
│   │   │   ├── elements_processing.py
│   │   │   └── cache_manager.py
│   │   ├── keys
│   │   │   └── .gitignore
│   │   └── Dockerfile
│   └── osm_to_nodes_ways_relations
│       ├── keys
│       │   └── .gitignore
│       ├── Dockerfile
│       └── src
│           ├── osm_dtos.py
│           └── pbf_parser.py
├── .gitignore
├── docs
│   ├── graph.png
│   └── OSM_Planet_file_processing.png
├── triggering
│   └── trigger_osm_to_big_query_dg_gcf
│       ├── requirements.txt
│       └── main.py
├── .gcloudignore
└── utils
    └── get_client_id.py
/dags/utils/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/deployment/config/.gitignore: -------------------------------------------------------------------------------- 1 | config* --------------------------------------------------------------------------------
/examples/clustering/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | !data/.gitkeep --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/keys/.gitignore: -------------------------------------------------------------------------------- 1 | gcloud_keys.json --------------------------------------------------------------------------------
/tasks_docker_images/osm_to_features/keys/.gitignore: -------------------------------------------------------------------------------- 1 | gcloud_keys.json --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | venv/ 3 | test_data/ 4 | deployment/create_full_* --------------------------------------------------------------------------------
/tasks_docker_images/osm_converter_with_history_index/src/gdal/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/README.md: -------------------------------------------------------------------------------- 1 | # generate layers --------------------------------------------------------------------------------
/tasks_docker_images/osm_to_nodes_ways_relations/keys/.gitignore: -------------------------------------------------------------------------------- 1 | gcloud_keys.json --------------------------------------------------------------------------------
/tasks_docker_images/osm_converter_with_history_index/keys/.gitignore: -------------------------------------------------------------------------------- 1 | gcloud_keys.json --------------------------------------------------------------------------------
/docs/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcp-pdp/geo-openstreetmap/HEAD/docs/graph.png --------------------------------------------------------------------------------
/examples/clustering/requirements.txt: -------------------------------------------------------------------------------- 1 | textract 2 | nltk 3 | numpy 4 | scipy 5 | matplotlib 6 | sklearn 7 | --------------------------------------------------------------------------------
/triggering/trigger_osm_to_big_query_dg_gcf/requirements.txt: -------------------------------------------------------------------------------- 1 | requests_toolbelt==0.9.1 2 | google-auth==1.15.0 --------------------------------------------------------------------------------
/docs/OSM_Planet_file_processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcp-pdp/geo-openstreetmap/HEAD/docs/OSM_Planet_file_processing.png --------------------------------------------------------------------------------
/dags/utils/bq_utils.py:
-------------------------------------------------------------------------------- 1 | def union_queries(queries): 2 | union_all_sql = "\nUNION ALL\n" 3 | return union_all_sql.join(queries) 4 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_features/src/download_osm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | OSM_GCS_PATH="$1" 5 | OSM_DEST_PATH="$2" 6 | 7 | gsutil cp ${OSM_GCS_PATH} ${OSM_DEST_PATH} -------------------------------------------------------------------------------- /dags/sql/create_features_part_format.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | '{}' AS feature_type, 3 | osm_id, 4 | osm_way_id, 5 | osm_version, 6 | osm_timestamp, 7 | all_tags, 8 | geometry 9 | FROM 10 | {} -------------------------------------------------------------------------------- /deployment/delete_dag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DAG_NAME="$1" 3 | COMPOSER_ENV_NAME="$2" 4 | REGION_LOCATION="$3" 5 | 6 | gcloud composer environments storage dags delete \ 7 | --environment $COMPOSER_ENV_NAME --location $REGION_LOCATION \ 8 | $DAG_NAME 9 | -------------------------------------------------------------------------------- /deployment/config/set_env_vars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OSM_ENV_VARS_STR="$1" 3 | COMPOSER_ENV_NAME="$2" 4 | REGION_LOCATION="$3" 5 | 6 | gcloud composer environments update $COMPOSER_ENV_NAME \ 7 | --location $REGION_LOCATION \ 8 | --update-env-variables=$OSM_ENV_VARS_STR -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk 2 | 3 | # copy script files 4 | COPY src /generate_layers/src 5 | # set work dir 6 | WORKDIR /generate_layers/src 7 | 8 | RUN ["chmod", "+x", "run.sh"] 9 | 10 | CMD ./run.sh $MODE 11 | 12 | -------------------------------------------------------------------------------- /deployment/create_composer_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | COMPOSER_ENV_NAME="$1" 3 | REGION_LOCATION="$2" 4 | DISK_SIZE="$3" 5 | MACHINE_TYPE="$4" 6 | 7 | gcloud composer environments create $COMPOSER_ENV_NAME \ 8 | --location $REGION_LOCATION \ 9 | --disk-size $DISK_SIZE \ 10 | --machine-type $MACHINE_TYPE 11 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/00_generate_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir "../sql" 4 | FOLDERS="$(find . -mindepth 1 -type d)" 5 | 6 | for FOLDER in $FOLDERS; do 7 | cd $FOLDER || exit 8 | FILE="$(find *.sh)" 9 | echo "running " $FOLDER/$FILE 10 | bash $FILE 11 | cd .. 12 | done -------------------------------------------------------------------------------- /deployment/upload_dags_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DAG_ELEMENT_PATH="$1" 3 | COMPOSER_ENV_NAME="$2" 4 | REGION_LOCATION="$3" 5 | 6 | echo "Uploading $DAG_ELEMENT_PATH ..." 
7 | gcloud composer environments storage dags import \ 8 | --environment $COMPOSER_ENV_NAME --location $REGION_LOCATION \ 9 | --source $DAG_ELEMENT_PATH 10 | --------------------------------------------------------------------------------
/examples/clustering/colors/README.md: -------------------------------------------------------------------------------- 1 | Download document with coloring scheme 2 | ```shell script 3 | wget -P ../data/ https://planning-org-uploaded-media.s3.amazonaws.com/document/LBCS.pdf 4 | ``` 5 | 6 | Convert document sections to vectors 7 | ```shell script 8 | python vectorize.py > colors.jsonl 9 | ``` 10 | 11 | Import colors.jsonl into BigQuery. 12 | --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/building/building.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CODE=1500 5 | CLASS=building 6 | K=building 7 | V=building 8 | N=building 9 | F=building 10 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K')" 11 | common_query > "../../sql/$F.sql" 12 | --------------------------------------------------------------------------------
/tasks_docker_images/osm_to_features/src/csv_to_json/csv-to-json.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | CSV_FILES_PATH="$1" 4 | 5 | for csv_file in $(ls -d ${CSV_FILES_PATH}*.geojson.csv); 6 | do 7 | echo ${csv_file} 8 | cat ${csv_file} \ 9 | | perl csv_to_json/geojson-csv-to-json.pl \ 10 | 2> ${csv_file}.errors.jsonl > ${csv_file}.jsonl 11 | done 12 | --------------------------------------------------------------------------------
/examples/clustering/bq_udf/geohash.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION udfs.decodeGeoHash(geohash STRING) 2 | RETURNS STRUCT<latitude ARRAY<FLOAT64>, longitude ARRAY<FLOAT64>> 3 | LANGUAGE js 4 | OPTIONS ( 5 | library=["gs://gcp-pdp-osm-dev-bq-udf/gis/geohash.js"] 6 | ) 7 | AS 8 | """ 9 | return decodeGeoHash(geohash); 10 | """; 11 | SELECT udfs.decodeGeoHash('0000'); --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODE="$1" 3 | cd layered_gis 4 | 5 | if [ $MODE == "planet" ] 6 | then 7 | cp query_templates_planet.sh query_templates.sh 8 | else 9 | cp query_templates_history.sh query_templates.sh 10 | fi 11 | 12 | echo "running ./00_generate_queries.sh" 13 | ./00_generate_queries.sh 14 | echo "running ./01_append_table.sh" 15 | ./01_append_table.sh $MODE 16 | 17 | --------------------------------------------------------------------------------
/examples/clustering/words/w2v_generate_schema.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | NUM_COLUMNS = 300 4 | # Template for resulting schema word:STRING,f1:FLOAT,f2...
5 | SCHEMA = 'word:STRING,{}' 6 | if __name__ == '__main__': 7 | num_columns = NUM_COLUMNS 8 | if len(sys.argv) > 1: 9 | num_columns = int(sys.argv[1]) 10 | 11 | print(SCHEMA.format(','.join(['f{}:FLOAT'.format(x) for x in range(1, num_columns + 1)]))) 12 | -------------------------------------------------------------------------------- /examples/clustering/words/w2v_to_jsonl.py: -------------------------------------------------------------------------------- 1 | import fileinput 2 | import json 3 | 4 | if __name__ == '__main__': 5 | num_columns = None 6 | for line in fileinput.input(): 7 | columns = line.strip().split(' ') 8 | 9 | if num_columns is None: 10 | num_columns = len(columns) - 1 11 | 12 | result = {'word': columns[0]} 13 | for i in range(num_columns): 14 | result['f{}'.format(i+1)] = columns[i+1] 15 | print(json.dumps(result)) 16 | -------------------------------------------------------------------------------- /dags/sql/join_relations_geometries.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | osmium_table.id, 3 | osmium_table.version, 4 | osmium_table.username, 5 | osmium_table.changeset, 6 | osmium_table.visible, 7 | osmium_table.osm_timestamp, 8 | gdal_table.geometry, 9 | osmium_table.all_tags, 10 | osmium_table.members, 11 | FROM 12 | {}.planet_relations AS osmium_table 13 | LEFT JOIN 14 | {}.planet_features AS gdal_table 15 | ON 16 | osmium_table.id = gdal_table.osm_id AND osmium_table.osm_timestamp = gdal_table.osm_timestamp -------------------------------------------------------------------------------- /examples/clustering/bq_udf/metrics.js: -------------------------------------------------------------------------------- 1 | function euclideanDistances(a, b) { 2 | var sum = 0; 3 | var n; 4 | for (n = 0; n < a.length; n++) { 5 | sum += Math.pow(a[n] - b[n], 2); 6 | } 7 | return Math.sqrt(sum); 8 | } 9 | 10 | function cosineSimilarity(a, b) { 11 | var p = 0; 12 | var p2 = 0; 13 | var q2 = 0; 14 | var n; 15 | for (var n = 0; n < a.length; n++) { 16 | p += a[n] * b[n]; 17 | p2 += a[n] * a[n]; 18 | q2 += b[n] * b[n]; 19 | } 20 | return p / (Math.sqrt(p2) * Math.sqrt(q2)); 21 | } 22 | -------------------------------------------------------------------------------- /dags/sql/join_nodes_geometries.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | osmium_table.id, 3 | osmium_table.version, 4 | osmium_table.username, 5 | osmium_table.changeset, 6 | osmium_table.visible, 7 | osmium_table.osm_timestamp, 8 | gdal_table.geometry, 9 | osmium_table.all_tags, 10 | osmium_table.latitude, 11 | osmium_table.longitude 12 | FROM 13 | {}.planet_nodes AS osmium_table 14 | LEFT JOIN 15 | {}.planet_features AS gdal_table 16 | ON 17 | osmium_table.id = gdal_table.osm_id AND osmium_table.osm_timestamp = gdal_table.osm_timestamp -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_features/src/osm_to_features.sh: -------------------------------------------------------------------------------- 1 | LAYERS="$1" 2 | 3 | SRC_FILE_NAME=$(basename $SRC_OSM_GCS_URI) 4 | LOCAL_FILE_NAME=${DATA_DIR}${SRC_FILE_NAME} 5 | CSV_FILE_PREFIX=feature 6 | JSONL_EXT=.jsonl 7 | 8 | echo 'Source GCS URI: '$SRC_OSM_GCS_URI 9 | gsutil cp $SRC_OSM_GCS_URI $LOCAL_FILE_NAME 10 | echo $SRC_OSM_GCS_URI' copied to '$LOCAL_FILE_NAME 11 | 12 | ./osm2geojsoncsv $LOCAL_FILE_NAME ${DATA_DIR}${CSV_FILE_PREFIX} $LAYERS 13 | ./csv_to_json/csv-to-json.sh ${DATA_DIR} 14 | gsutil 
cp ${DATA_DIR}*${JSONL_EXT} ${FEATURES_DIR_GCS_URI} -------------------------------------------------------------------------------- /dags/sql/join_ways_geometries.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | osmium_table.id, 3 | osmium_table.version, 4 | osmium_table.username, 5 | osmium_table.changeset, 6 | osmium_table.visible, 7 | osmium_table.osm_timestamp, 8 | gdal_table.geometry, 9 | osmium_table.all_tags, 10 | osmium_table.nodes, 11 | FROM 12 | {}.planet_ways AS osmium_table 13 | LEFT JOIN 14 | {}.planet_features AS gdal_table 15 | ON 16 | (osmium_table.id = gdal_table.osm_id OR osmium_table.id = gdal_table.osm_way_id) AND osmium_table.osm_timestamp = gdal_table.osm_timestamp -------------------------------------------------------------------------------- /examples/clustering/tf_idf/create_model.py: -------------------------------------------------------------------------------- 1 | CREATE_MODEL_STATEMENT = """ 2 | CREATE OR REPLACE MODEL 3 | osm_clustering_grid_1km.kmeans_tfidf_clusters_10 4 | TRANSFORM( 5 | {} 6 | ) 7 | OPTIONS(model_type='kmeans', num_clusters=10, max_iterations=50, EARLY_STOP=TRUE, MIN_REL_PROGRESS=0.001) AS 8 | SELECT 9 | tfidf_vec 10 | FROM 11 | osm_clustering_grid_1km.vectors_tfidf 12 | """ 13 | DIMENSIONALITY = 339 14 | create_model = CREATE_MODEL_STATEMENT.format( 15 | ', '.join(['tfidf_vec[OFFSET({})] as f{}'.format(i, i + 1) for i in range(DIMENSIONALITY)])) 16 | print(create_model) 17 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/aeroway/aeroway.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ../query_templates.sh 3 | 4 | CLASS=aeroway 5 | LAYER=( 6 | "6701:aeroway=runway" 7 | "6702:aeroway=taxiway" 8 | ) 9 | 10 | for layer in "${LAYER[@]}" 11 | do 12 | CODE="${layer%%:*}" 13 | KVF="${layer##*:}" 14 | K="${KVF%%=*}" 15 | VF="${KVF##*=}" 16 | V="${VF%%>*}" 17 | F="${VF##*>}" 18 | N="${F%%-*}" 19 | 20 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 21 | common_query > "../../sql/$F.sql" 22 | done 23 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_money/poi_money.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_money 5 | LAYER=( 6 | "2601:amenity=bank" 7 | "2602:amenity=atm" 8 | ) 9 | 10 | 11 | for layer in "${LAYER[@]}" 12 | do 13 | CODE="${layer%%:*}" 14 | KVF="${layer##*:}" 15 | K="${KVF%%=*}" 16 | VF="${KVF##*=}" 17 | V="${VF%%>*}" 18 | F="${VF##*>}" 19 | N="${F%%-*}" 20 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 21 | common_query > "../../sql/$F.sql" 22 | done 23 | -------------------------------------------------------------------------------- /.gcloudignore: -------------------------------------------------------------------------------- 1 | # This file specifies files that are *not* uploaded to Google Cloud Platform 2 | # using gcloud. It follows the same syntax as .gitignore, with the addition of 3 | # "#!include" directives (which insert the entries of the given .gitignore-style 4 | # file at that point). 
5 | # 6 | # For more information, run: 7 | # $ gcloud topic gcloudignore 8 | # 9 | .gcloudignore 10 | # If you would like to upload your .git directory, .gitignore file or files 11 | # from your .gitignore file, remove the corresponding line 12 | # below: 13 | .git 14 | .gitignore 15 | 16 | node_modules 17 | #!include:.gitignore 18 | -------------------------------------------------------------------------------- /dags/schemas/simple_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type": "INTEGER", 4 | "name": "id", 5 | "description": "Object unique ID." 6 | }, 7 | { 8 | "type": "RECORD", 9 | "mode": "REPEATED", 10 | "name": "all_tags", 11 | "description": "Unstructured key=value attributes for this object.", 12 | "fields": [ 13 | { 14 | "type": "STRING", 15 | "name": "key", 16 | "description": "Attribute key." 17 | }, 18 | { 19 | "type": "STRING", 20 | "name": "value", 21 | "description": "Attribute value." 22 | } 23 | ] 24 | } 25 | ] -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/gdal/run_ogr.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | OGRCONFIG="$1" 4 | SRC_FILE="$2" 5 | DEST_FILE="$3" 6 | OGR_TYPE="$4" 7 | 8 | if [ "$OGR_TYPE" = "multipolygons" ] 9 | then 10 | osm_fields="osm_id, osm_way_id" 11 | else 12 | osm_fields="osm_id, NULL as osm_way_id" 13 | fi 14 | ogr2ogr \ 15 | -skipfailures \ 16 | -f GeoJSON \ 17 | $DEST_FILE $SRC_FILE \ 18 | --config OSM_CONFIG_FILE $OGRCONFIG \ 19 | -dialect sqlite \ 20 | -sql "select $osm_fields, AsGeoJSON(geometry) AS geometry, geometry from ${OGR_TYPE} where ST_IsValid(geometry) = 1" \ 21 | --debug on \ 22 | 2> /dev/null -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/cycle_route_segment/cycle_route_segment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=cycle_route_segment 5 | LAYER=( 6 | "9102:rcn=yes>regional_cycle_network" 7 | "9102:lcn=yes>local_cycle_network" 8 | ) 9 | 10 | for layer in "${LAYER[@]}" 11 | do 12 | CODE="${layer%%:*}" 13 | KVF="${layer##*:}" 14 | K="${KVF%%=*}" 15 | VF="${KVF##*=}" 16 | V="${VF%%>*}" 17 | F="${VF##*>}" 18 | N="${F%%-*}" 19 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 20 | common_query > "../../sql/$F.sql" 21 | done 22 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/powerline/powerline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=powerline 5 | LAYER=( 6 | "6600:power=line" 7 | "6601:power=minor_line" 8 | "6611:power=cable" 9 | "6611:power=minor_cable" 10 | ) 11 | 12 | for layer in "${LAYER[@]}" 13 | do 14 | CODE="${layer%%:*}" 15 | KVF="${layer##*:}" 16 | K="${KVF%%=*}" 17 | VF="${KVF##*=}" 18 | V="${VF%%>*}" 19 | F="${VF##*>}" 20 | N="${F%%-*}" 21 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 22 | common_query > "../../sql/$F.sql" 23 | done 24 | -------------------------------------------------------------------------------- 
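Note: the looping layered_gis/*.sh generators (aeroway.sh, poi_money.sh, cycle_route_segment.sh, powerline.sh, and the rest below) all parse their LAYER entries with the same chain of bash parameter expansions. A minimal standalone sketch of that parsing, using a hypothetical sample spec of the form `CODE:key=value>file-name` (when the `>file-name` part is absent, F and N simply fall back to the tag value):

```shell script
#!/bin/bash
# Hypothetical demo spec; real specs live in the LAYER arrays of the scripts.
layer="5501:barrier=wood_fence>fence-wood_fence"

CODE="${layer%%:*}"  # "5501"                                - everything before the first ':'
KVF="${layer##*:}"   # "barrier=wood_fence>fence-wood_fence" - everything after the ':'
K="${KVF%%=*}"       # "barrier"                             - OSM tag key (before '=')
VF="${KVF##*=}"      # "wood_fence>fence-wood_fence"         - everything after the '='
V="${VF%%>*}"        # "wood_fence"                          - tag value (before '>', or all of VF)
F="${VF##*>}"        # "fence-wood_fence"                    - output .sql file name (or V if no '>')
N="${F%%-*}"         # "fence"                               - layer_name (before the first '-')

echo "code=$CODE key=$K value=$V file=$F name=$N"
```

Each script then substitutes K and V into EXTRA_CONSTRAINTS and redirects common_query (defined by whichever query_templates_*.sh run.sh copied into place) into ../../sql/$F.sql.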
/tasks_docker_images/generate_layers/src/layered_gis/waterway/waterways.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=waterway 5 | LAYER=( 6 | "8101:waterway=river" 7 | "8102:waterway=stream" 8 | "8103:waterway=canal" 9 | "8104:waterway=drain" 10 | ) 11 | 12 | for layer in "${LAYER[@]}" 13 | do 14 | CODE="${layer%%:*}" 15 | KVF="${layer##*:}" 16 | K="${KVF%%=*}" 17 | VF="${KVF##*=}" 18 | V="${VF%%>*}" 19 | F="${VF##*>}" 20 | N="${F%%-*}" 21 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 22 | common_query > "../../sql/$F.sql" 23 | done 24 | --------------------------------------------------------------------------------
/deployment/config/set_env_vars_from_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CONFIG_FILE="$1" 3 | COMPOSER_ENV_NAME="$2" 4 | REGION_LOCATION="$3" 5 | 6 | declare -A VARS_ARRAY 7 | while IFS="=" read -r key value 8 | do 9 | VARS_ARRAY[$key]="$value" 10 | done < <(jq -r "to_entries|map(\"\(.key)=\(.value|tostring)\")|.[]" $CONFIG_FILE) 11 | 12 | OSM_ENV_VARS_STR='' 13 | for key in "${!VARS_ARRAY[@]}" 14 | do 15 | OSM_ENV_VARS_STR="${OSM_ENV_VARS_STR}${key^^}=${VARS_ARRAY[$key]}," 16 | done 17 | 18 | OSM_ENV_VARS_STR=${OSM_ENV_VARS_STR::-1} 19 | echo $OSM_ENV_VARS_STR 20 | 21 | gcloud composer environments update $COMPOSER_ENV_NAME \ 22 | --location $REGION_LOCATION \ 23 | --update-env-variables=$OSM_ENV_VARS_STR --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/poi_health/poi_health.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_health 5 | LAYER=( 6 | "2101:amenity=pharmacy" 7 | "2110:amenity=hospital" 8 | "2120:amenity=doctors" 9 | "2121:amenity=dentist" 10 | "2129:amenity=veterinary" 11 | ) 12 | 13 | for layer in "${LAYER[@]}" 14 | do 15 | CODE="${layer%%:*}" 16 | KVF="${layer##*:}" 17 | K="${KVF%%=*}" 18 | VF="${KVF##*=}" 19 | V="${VF%%>*}" 20 | F="${VF##*>}" 21 | N="${F%%-*}" 22 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 23 | common_query > "../../sql/$F.sql" 24 | done 25 | --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/route/route.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=route 5 | LAYER=( 6 | "9001:route=bicycle" 7 | "9002:route=mtb" 8 | "9003:route=hiking" 9 | "9004:route=horse" 10 | "9005:route=nordic_walking" 11 | "9006:route=running" 12 | ) 13 | 14 | for layer in "${LAYER[@]}" 15 | do 16 | CODE="${layer%%:*}" 17 | KVF="${layer##*:}" 18 | K="${KVF%%=*}" 19 | VF="${KVF##*=}" 20 | V="${VF%%>*}" 21 | F="${VF##*>}" 22 | N="${F%%-*}" 23 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 24 | common_query > "../../sql/$F.sql" 25 | done 26 | --------------------------------------------------------------------------------
/examples/clustering/bq_udf/metrics.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION udfs.euclidean_distance(a ARRAY<FLOAT64>, b ARRAY<FLOAT64>) 2 | RETURNS
FLOAT64 3 | LANGUAGE js 4 | OPTIONS ( 5 | library=["gs://gcp-pdp-osm-dev-bq-udf/metrics/metrics.js"] 6 | ) 7 | AS 8 | """ 9 | return euclideanDistances(a, b); 10 | """; 11 | 12 | SELECT udfs.euclidean_distance([0., 0.], [1., 1.]); 13 | 14 | CREATE OR REPLACE FUNCTION udfs.cosine_similarity(a ARRAY<FLOAT64>, b ARRAY<FLOAT64>) 15 | RETURNS FLOAT64 16 | LANGUAGE js 17 | OPTIONS ( 18 | library=["gs://gcp-pdp-osm-dev-bq-udf/metrics/metrics.js"] 19 | ) 20 | AS 21 | """ 22 | return cosineSimilarity(a, b); 23 | """; 24 | 25 | SELECT udfs.cosine_similarity([1., 4.], [1., 1.]); --------------------------------------------------------------------------------
/examples/clustering/words/README.md: -------------------------------------------------------------------------------- 1 | ## Import Glove vectors into BigQuery 2 | Download word2vec (Glove) 3 | ``` 4 | cd ../data 5 | wget http://nlp.stanford.edu/data/glove.6B.zip 6 | unzip ./glove.6B.zip 7 | rm ./glove.6B.zip 8 | ``` 9 | 10 | Convert word2vec to JSONL format 11 | ``` 12 | cat ./data/glove.6B.300d.txt | python3 w2v_to_jsonl.py > ./data/glove.6B.300d.jsonl 13 | ``` 14 | 15 | Upload result to GCS: 16 | ``` 17 | gsutil cp ./data/glove.6B.300d.jsonl gs://gcp-pdp-osm-dev-bq-import/glove/ 18 | ``` 19 | 20 | Import into BQ: 21 | ``` 22 | bq load \ 23 | --source_format=NEWLINE_DELIMITED_JSON \ 24 | gcp-pdp-osm-dev:osm_clustering.w2v_glove_6B_300d \ 25 | gs://gcp-pdp-osm-dev-bq-import/glove/glove.6B.300d.jsonl \ 26 | "$(python3 w2v_generate_schema.py 300)" 27 | ``` --------------------------------------------------------------------------------
/tasks_docker_images/osm_converter_with_history_index/src/file_service.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import os 3 | 4 | 5 | def make_dir_for_file_if_not_exists(filename): 6 | if not os.path.exists(os.path.dirname(filename)): 7 | try: 8 | os.makedirs(os.path.dirname(filename)) 9 | except OSError as exc: # Guard against race condition 10 | if exc.errno != errno.EEXIST: 11 | raise 12 | 13 | 14 | def file_name_from_path(file_path): 15 | if "/" in file_path: 16 | return file_path.split("/")[-1] 17 | else: 18 | return file_path 19 | 20 | 21 | def file_name_without_ext(file_name): 22 | if "."
in file_name: 23 | return file_name.split(".")[0] 24 | else: 25 | return file_name 26 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_catering/poi_catering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_catering 5 | LAYER=( 6 | "2301:amenity=restaurant" 7 | "2302:amenity=fast_food" 8 | "2303:amenity=cafe" 9 | "2304:amenity=pub" 10 | "2305:amenity=bar" 11 | "2306:amenity=food_court" 12 | "2307:amenity=biergarten" 13 | ) 14 | 15 | for layer in "${LAYER[@]}" 16 | do 17 | CODE="${layer%%:*}" 18 | KVF="${layer##*:}" 19 | K="${KVF%%=*}" 20 | VF="${KVF##*=}" 21 | V="${VF%%>*}" 22 | F="${VF##*>}" 23 | N="${F%%-*}" 24 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 25 | common_query > "../../sql/$F.sql" 26 | done 27 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/traffic_waterway/traffic_waterway.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=traffic 5 | LAYER=( 6 | "5301:leisure=slipway" 7 | "5302:leisure=marina" 8 | "5303:man_made=pier" 9 | "5311:waterway=dam" 10 | "5321:waterway=waterfall" 11 | "5331:waterway=lock_gate" 12 | "5332:waterway=weir" 13 | ) 14 | 15 | for layer in "${LAYER[@]}" 16 | do 17 | CODE="${layer%%:*}" 18 | KVF="${layer##*:}" 19 | K="${KVF%%=*}" 20 | VF="${KVF##*=}" 21 | V="${VF%%>*}" 22 | F="${VF##*>}" 23 | N="${F%%-*}" 24 | NAME_PREFIX=waterway_ 25 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 26 | common_query > "../../sql/$F.sql" 27 | done 28 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_features/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM osgeo/gdal 2 | 3 | # update repos 4 | RUN apt-get update -y 5 | 6 | # make installation 7 | RUN apt-get install build-essential -y 8 | 9 | # gsutil installation 10 | RUN curl -sSL https://sdk.cloud.google.com | bash 11 | 12 | # perl installation 13 | RUN cpan JSON 14 | RUN cpan Text::CSV::Encoded 15 | 16 | # set env vars 17 | ENV PATH $PATH:/root/google-cloud-sdk/bin 18 | ENV DATA_DIR /osm_to_features/data/ 19 | 20 | # copy script files 21 | COPY src /osm_to_features/src 22 | # set work dir 23 | WORKDIR /osm_to_features/src 24 | 25 | # set sh files as executable 26 | RUN ["chmod", "+x", "download_osm.sh"] 27 | RUN ["chmod", "+x", "csv_to_json/csv-to-json.sh"] 28 | RUN ["chmod", "+x", "osm_to_features.sh"] 29 | 30 | # run main script 31 | CMD ./osm_to_features.sh $LAYERS -------------------------------------------------------------------------------- /examples/clustering/cities/query.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | FIRST_CITY_SQL = 'SELECT "{}" as city_name, "{}" as city_class, {} as latitude, {} as longitude, {} as radius' 4 | CITY_SQL = 'SELECT "{}", "{}", {}, {}, {}' 5 | 6 | QUERY = """ 7 | WITH cities AS ({}) 8 | SELECT 9 | city_name, 10 | city_class, 11 | ST_GEOGPOINT(longitude, latitude) as center, 12 | radius 13 | FROM cities 14 | """ 15 | 16 | if __name__ == '__main__': 17 | with open('cities.csv', newline='') as 
csv_file: 18 | reader = csv.reader(csv_file) 19 | rows = [row for row in reader] 20 | first_city = rows[1] 21 | cities_tail = rows[2:] 22 | 23 | cities_sql = ' UNION ALL\n'.join([FIRST_CITY_SQL.format(*first_city)] + [CITY_SQL.format(*city) for city in cities_tail]) 24 | print(QUERY.format(cities_sql)) 25 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/barrier/barrier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=barrier 5 | LAYER=( 6 | "5501:barrier=fence>fence-barrier" 7 | "5501:barrier=wood_fence>fence-wood_fence" 8 | "5501:barrier=wire_fence>fence-wire_fence" 9 | "5511:barrier=hedge" 10 | "5512:barrier=tree_row" 11 | "5521:barrier=wall" 12 | "5531:man_made=dyke" 13 | ) 14 | 15 | 16 | 17 | for layer in "${LAYER[@]}" 18 | do 19 | CODE="${layer%%:*}" 20 | KVF="${layer##*:}" 21 | K="${KVF%%=*}" 22 | VF="${KVF##*=}" 23 | V="${VF%%>*}" 24 | F="${VF##*>}" 25 | N="${F%%-*}" 26 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 27 | common_query > "../../sql/$F.sql" 28 | done 29 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_nodes_ways_relations/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk 2 | 3 | # update repos 4 | RUN apt-get update -y 5 | 6 | # install pyosmium dependencies 7 | RUN apt-get install build-essential cmake libboost-dev \ 8 | libexpat1-dev zlib1g-dev libbz2-dev -y 9 | # install python GCS sdk 10 | RUN pip3 install --upgrade google-cloud-storage 11 | 12 | # install pyosmium 13 | RUN pip3 install osmium 14 | # install guppy3 (memory profiler) 15 | RUN pip3 install guppy3 16 | 17 | # set env vars 18 | ENV DATA_DIR /osm_to_nodes_ways_relations/data/ 19 | 20 | # copy script files 21 | COPY src /osm_to_nodes_ways_relations/src 22 | # set work dir 23 | WORKDIR /osm_to_nodes_ways_relations/src 24 | 25 | CMD python3 pbf_parser.py $SRC_OSM_GCS_URI $NODES_WAYS_RELATIONS_DIR_GCS_URI --num_threads $NUM_THREADS 26 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/query_templates_history.sh: -------------------------------------------------------------------------------- 1 | common_query() { 2 | echo " 3 | WITH osm AS ( 4 | SELECT id, null AS way_id, all_tags, osm_timestamp, version, geometry FROM \`${BQ_DATASET_TO_EXPORT}.history_nodes\` 5 | UNION ALL 6 | SELECT id, id AS way_id, all_tags, osm_timestamp, version, geometry FROM \`${BQ_DATASET_TO_EXPORT}.history_ways\` 7 | UNION ALL 8 | SELECT id, null AS way_id, all_tags, osm_timestamp, version, geometry FROM \`${BQ_DATASET_TO_EXPORT}.history_relations\` 9 | ) 10 | SELECT 11 | $CODE AS layer_code, 12 | '$CLASS' AS layer_class, 13 | '$NAME_PREFIX$N' AS layer_name, 14 | osm.id AS osm_id, 15 | osm.way_id AS osm_way_id, 16 | osm.osm_timestamp AS osm_timestamp, 17 | osm.version AS osm_version, 18 | osm.all_tags, 19 | osm.geometry 20 | FROM osm 21 | WHERE osm.id IS NOT NULL 22 | $EXTRA_CONSTRAINTS 23 | " 24 | } 25 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/01_append_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 
PROCESSING_MODE="$1" 3 | 4 | BQ_DATASET_TO_EXPORT_WITH_COLON=$(echo $BQ_DATASET_TO_EXPORT | sed 's/\./:/') 5 | 6 | i=0 7 | mode="" 8 | for SQL in `find ../sql/ -type f -name '*.sql' | sort`; do 9 | echo $SQL 10 | if (($i > 0)); then 11 | mode="--append_table" 12 | else 13 | mode="--replace" 14 | fi 15 | 16 | cmd="cat $SQL | bq query\ 17 | --project_id ${PROJECT_ID}\ 18 | --nouse_legacy_sql\ 19 | $mode\ 20 | --range_partitioning 'layer_code,0,9999,1'\ 21 | --clustering_fields 'layer_code,geometry'\ 22 | --display_name $SQL\ 23 | --destination_table '${BQ_DATASET_TO_EXPORT_WITH_COLON}.${PROCESSING_MODE}_layers'\ 24 | --destination_schema ../schema/layers_schema.json >/dev/null" 25 | 26 | echo "$cmd" 27 | echo "$cmd" | bash 28 | 29 | ((i=i+1)) 30 | done 31 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_accommodation/poi_accommodation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_accommodation 5 | LAYER=( 6 | "2401:tourism=hotel" 7 | "2402:tourism=motel" 8 | "2403:tourism=bed_and_breakfast" 9 | "2404:tourism=guest_house" 10 | "2405:tourism=hostel" 11 | "2406:tourism=chalet" 12 | "2421:amenity=shelter" 13 | "2422:tourism=camp_site" 14 | "2423:tourism=alpine_hut" 15 | "2424:tourism=caravan_site" 16 | ) 17 | 18 | for layer in "${LAYER[@]}" 19 | do 20 | CODE="${layer%%:*}" 21 | KVF="${layer##*:}" 22 | K="${KVF%%=*}" 23 | VF="${KVF##*=}" 24 | V="${VF%%>*}" 25 | F="${VF##*>}" 26 | N="${F%%-*}" 27 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 28 | common_query > "../../sql/$F.sql" 29 | done 30 | -------------------------------------------------------------------------------- /dags/schemas/features_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "description": null, 4 | "name": "osm_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "description": null, 9 | "name": "osm_version", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "description": null, 14 | "name": "osm_way_id", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "description": "Last-modified timestamp for this object.", 19 | "name": "osm_timestamp", 20 | "type": "TIMESTAMP" 21 | }, 22 | { 23 | "description": "GEOGRAPHY-encoded point", 24 | "name": "geometry", 25 | "type": "GEOGRAPHY" 26 | }, 27 | { 28 | "description": "Unstructured key=value attributes for this object.", 29 | "fields": [ 30 | { 31 | "description": "Attribute key.", 32 | "name": "key", 33 | "type": "STRING" 34 | }, 35 | { 36 | "description": "Attribute value.", 37 | "name": "value", 38 | "type": "STRING" 39 | } 40 | ], 41 | "mode": "REPEATED", 42 | "name": "all_tags", 43 | "type": "RECORD" 44 | } 45 | ] -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/natural/natural.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=natural 5 | LAYER=( 6 | "4101:natural=spring" 7 | "4102:natural=glacier" 8 | "4111:natural=peak" 9 | "4112:natural=cliff" 10 | "4113:natural=volcano" 11 | "4121:natural=tree" 12 | "4131:natural=mine>mine-natural" 13 | "4131:historic=mine>mine-historic" 14 | "4131:landuse=mine>mine-landuse" 15 | "4131:survey_point=mine>mine-survey_point" 16 | 
"4131:industrial=mine>mine-industrial" 17 | "4132:natural=cave_entrance" 18 | "4141:natural=beach" 19 | "8300:natural=coastline" 20 | ) 21 | 22 | 23 | 24 | for layer in "${LAYER[@]}" 25 | do 26 | CODE="${layer%%:*}" 27 | KVF="${layer##*:}" 28 | K="${KVF%%=*}" 29 | VF="${KVF##*=}" 30 | V="${VF%%>*}" 31 | F="${VF##*>}" 32 | N="${F%%-*}" 33 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 34 | common_query > "../../sql/$F.sql" 35 | done 36 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_destination/poi_destination.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_destination 5 | LAYER=( 6 | "2721:tourism=attraction" 7 | "2722:tourism=museum" 8 | "2723:historic=monument" 9 | "2724:historic=memorial" 10 | "2725:tourism=artwork" 11 | "2731:historic=castle" 12 | "2732:historic=ruins" 13 | "2733:historic=archaeological_site" 14 | "2734:historic=wayside_cross" 15 | "2735:historic=wayside_shrine" 16 | "2736:historic=battlefield" 17 | "2737:historic=fort" 18 | "2741:tourism=picnic_site" 19 | "2742:tourism=viewpoint" 20 | "2743:tourism=zoo" 21 | "2744:tourism=theme_park" 22 | ) 23 | 24 | for layer in "${LAYER[@]}" 25 | do 26 | CODE="${layer%%:*}" 27 | KVF="${layer##*:}" 28 | K="${KVF%%=*}" 29 | VF="${KVF##*=}" 30 | V="${VF%%>*}" 31 | F="${VF##*>}" 32 | N="${F%%-*}" 33 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 34 | common_query > "../../sql/$F.sql" 35 | done 36 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/boundary/boundary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=boundary 5 | LAYER=( 6 | "1101:admin_level=1>admin_level1" 7 | "1102:admin_level=2>national" 8 | "1103:admin_level=3>admin_level3" 9 | "1104:admin_level=4>admin_level4" 10 | "1105:admin_level=5>admin_level5" 11 | "1106:admin_level=6>admin_level6" 12 | "1107:admin_level=7>admin_level7" 13 | "1108:admin_level=8>admin_level8" 14 | "1109:admin_level=9>admin_level9" 15 | "1110:admin_level=10>admin_level10" 16 | "1111:admin_level=11>admin_level11" 17 | ) 18 | 19 | 20 | for layer in "${LAYER[@]}" 21 | do 22 | CODE="${layer%%:*}" 23 | KVF="${layer##*:}" 24 | K="${KVF%%=*}" 25 | VF="${KVF##*=}" 26 | V="${VF%%>*}" 27 | F="${VF##*>}" 28 | N="${F%%-*}" 29 | 30 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V') 31 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'boundary' AND tags.value='administrative')" 32 | common_query > "../../sql/$F.sql" 33 | done 34 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk 2 | 3 | RUN cat /etc/os-release 4 | # update repos 5 | RUN apt-get update -y 6 | 7 | # install pyosmium dependencies 8 | RUN apt-get install build-essential cmake libboost-dev \ 9 | libexpat1-dev zlib1g-dev libbz2-dev -y 10 | # install GDAL 11 | RUN apt-get install binutils libproj-dev gdal-bin -y 12 | 13 | # install python GCS sdk 14 | 
RUN pip3 install --upgrade google-cloud-storage 15 | 16 | # install pyosmium 17 | RUN pip3 install osmium 18 | 19 | # install guppy3 (memory profiler) 20 | RUN pip3 install guppy3 21 | # install psutil 22 | RUN pip install psutil 23 | 24 | # set env vars 25 | ENV DATA_DIR /osm_converter_with_history_index/data/ 26 | 27 | # copy script files 28 | COPY src /osm_converter_with_history_index/src 29 | # set work dir 30 | WORKDIR /osm_converter_with_history_index/src 31 | 32 | # (Optional) GCP credentials setup for local runs 33 | #COPY keys /osm_converter_with_history_index/keys 34 | #ENV GOOGLE_APPLICATION_CREDENTIALS=/osm_converter_with_history_index/keys/gcloud_keys.json 35 | 36 | CMD python3 main.py $SRC_OSM_GCS_URI --index_db_and_metadata_gcs_dir $INDEX_DB_AND_METADATA_DIR_GCS_URI --converted_gcs_dir $CONVERTED_OSM_DIR_GCS_URI --num_db_shards $NUM_DB_SHARDS --num_threads $NUM_THREADS $ADDITIONAL_ARGS --------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/query_templates_planet.sh: -------------------------------------------------------------------------------- 1 | common_query() { 2 | echo " 3 | WITH osm AS ( 4 | SELECT id, null AS way_id, osm_timestamp, version, all_tags FROM \`${BQ_DATASET_TO_EXPORT}.planet_nodes\` 5 | UNION ALL 6 | SELECT id, id AS way_id, osm_timestamp, version, all_tags FROM \`${BQ_DATASET_TO_EXPORT}.planet_ways\` 7 | UNION ALL 8 | SELECT id, null AS way_id, osm_timestamp, version, all_tags FROM \`${BQ_DATASET_TO_EXPORT}.planet_relations\` 9 | ) 10 | SELECT $CODE AS layer_code, '$CLASS' AS layer_class, '$NAME_PREFIX$N' AS layer_name, f.feature_type AS gdal_type, 11 | f.osm_id AS osm_id, 12 | f.osm_way_id AS osm_way_id, 13 | f.osm_timestamp, 14 | osm.version AS osm_version, 15 | osm.all_tags, 16 | f.geometry 17 | FROM \`${BQ_DATASET_TO_EXPORT}.planet_features\` AS f, osm 18 | WHERE osm.id = f.osm_id AND osm.osm_timestamp = f.osm_timestamp 19 | $EXTRA_CONSTRAINTS 20 | 21 | UNION ALL 22 | 23 | SELECT $CODE AS layer_code, '$CLASS' AS layer_class, '$NAME_PREFIX$N' AS layer_name, f.feature_type AS gdal_type, 24 | f.osm_id AS osm_id, 25 | f.osm_way_id AS osm_way_id, 26 | f.osm_timestamp, 27 | osm.version AS osm_version, 28 | osm.all_tags, 29 | f.geometry 30 | FROM \`${BQ_DATASET_TO_EXPORT}.planet_features\` AS f, osm 31 | WHERE osm.way_id = f.osm_way_id AND osm.osm_timestamp = f.osm_timestamp 32 | $EXTRA_CONSTRAINTS 33 | " 34 | } 35 | --------------------------------------------------------------------------------
/examples/clustering/tf_idf/materialize.sql: -------------------------------------------------------------------------------- 1 | -- Selects geo ID, geography, TF-IDF vector, TF-IDF features, 2 | -- lbcs category name, lbcs color, similarity with lbcs category 3 | WITH features AS (SELECT ARRAY_AGG(word ORDER BY word) as words 4 | FROM `gcp-pdp-osm-dev.words.w2v_glove_6B_300d_osm_tags`) 5 | ,similarities AS (SELECT 6 | grid.geo_id, 7 | MAX(udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec)) as max_similarity 8 | FROM 9 | `gcp-pdp-osm-dev.osm_clustering_grid_01km.vectors_tfidf` tfidf 10 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` grid USING(geo_id) 11 | CROSS JOIN `gcp-pdp-osm-dev.lbcs.lbcs_tfidf` lbcs 12 | WHERE lbcs.dimension = 'Function' 13 | AND udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) > 0 14 | GROUP BY grid.geo_id) 15 | SELECT 16 | grid.geo_id, 17 | grid.geog, 18 | grid.city_name, 19 | tfidf.tfidf_vec, 20 | features.words as tfidf_features, 21 |
lbcs.name, 22 | lbcs.color, 23 | udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) as similarity 24 | FROM 25 | `gcp-pdp-osm-dev.osm_clustering_grid_01km.vectors_tfidf` tfidf 26 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` grid USING(geo_id) 27 | CROSS JOIN features 28 | CROSS JOIN `gcp-pdp-osm-dev.lbcs.lbcs_tfidf` lbcs 29 | JOIN similarities ON similarities.geo_id = tfidf.geo_id AND similarities.max_similarity = udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) 30 | AND lbcs.dimension = 'Function' 31 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_features/src/osmconf.ini: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file for OSM import 3 | # 4 | 5 | # put here the name of keys, or key=value, for ways that are assumed to be polygons if they are closed 6 | # see http://wiki.openstreetmap.org/wiki/Map_Features 7 | closed_ways_are_polygons=aeroway,amenity,boundary,building,craft,geological,historic,landuse,leisure,military,natural,office,place,shop,sport,tourism,highway=platform,public_transport=platform 8 | 9 | # comment to avoid laundering of keys ( ':' turned into '_' ) 10 | attribute_name_laundering=yes 11 | 12 | # keys that should NOT be reported in the "other_tags" field 13 | ignore=created_by,converted_by,source,time,ele,note,openGeoDB:,fixme,FIXME 14 | 15 | [lines] 16 | # common attributes 17 | osm_id=yes 18 | osm_version=yes 19 | osm_timestamp=yes 20 | other_tags=no 21 | # create "all_tags" field 22 | all_tags=yes 23 | 24 | [multilinestrings] 25 | # common attributes 26 | osm_id=yes 27 | osm_version=yes 28 | osm_timestamp=yes 29 | other_tags=no 30 | # create "all_tags" field 31 | all_tags=yes 32 | 33 | [multipolygons] 34 | # common attributes 35 | osm_id=yes 36 | osm_version=yes 37 | osm_timestamp=yes 38 | other_tags=no 39 | # create "all_tags" field 40 | all_tags=yes 41 | 42 | [other_relations] 43 | # common attributes 44 | osm_id=yes 45 | osm_version=yes 46 | osm_timestamp=yes 47 | other_tags=no 48 | # create "all_tags" field 49 | all_tags=yes 50 | 51 | [points] 52 | # common attributes 53 | osm_id=yes 54 | osm_version=yes 55 | osm_timestamp=yes 56 | other_tags=no 57 | # create "all_tags" field 58 | all_tags=yes 59 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/gdal/osmconf.ini: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file for OSM import 3 | # 4 | 5 | # put here the name of keys, or key=value, for ways that are assumed to be polygons if they are closed 6 | # see http://wiki.openstreetmap.org/wiki/Map_Features 7 | closed_ways_are_polygons=aeroway,amenity,boundary,building,craft,geological,historic,landuse,leisure,military,natural,office,place,shop,sport,tourism,highway=platform,public_transport=platform 8 | 9 | # comment to avoid laundering of keys ( ':' turned into '_' ) 10 | attribute_name_laundering=yes 11 | 12 | # keys that should NOT be reported in the "other_tags" field 13 | ignore=created_by,converted_by,source,time,ele,note,openGeoDB:,fixme,FIXME 14 | 15 | [lines] 16 | # common attributes 17 | osm_id=yes 18 | osm_version=no 19 | osm_timestamp=no 20 | other_tags=no 21 | # create "all_tags" field 22 | all_tags=no 23 | 24 | [multilinestrings] 25 | # common attributes 26 | osm_id=yes 27 | osm_version=no 28 | osm_timestamp=no 29 | other_tags=no 30 | # create 
"all_tags" field 31 | all_tags=no 32 | 33 | [multipolygons] 34 | # common attributes 35 | osm_id=yes 36 | osm_version=no 37 | osm_timestamp=no 38 | other_tags=no 39 | # create "all_tags" field 40 | all_tags=no 41 | 42 | [other_relations] 43 | # common attributes 44 | osm_id=yes 45 | osm_version=no 46 | osm_timestamp=no 47 | other_tags=no 48 | # create "all_tags" field 49 | all_tags=no 50 | 51 | [points] 52 | # common attributes 53 | osm_id=yes 54 | osm_version=no 55 | osm_timestamp=no 56 | other_tags=no 57 | # create "all_tags" field 58 | all_tags=no 59 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_leisure/poi_leisure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_leisure 5 | LAYER=( 6 | "2201:amenity=theatre" 7 | "2202:amenity=nightclub" 8 | "2203:amenity=cinema" 9 | "2204:leisure=park" 10 | "2205:leisure=playground" 11 | "2206:leisure=dog_park" 12 | "2251:leisure=sports_centre" 13 | "2252:leisure=pitch" 14 | "2254:sport=tennis>tennis_court" 15 | "2255:leisure=golf_course" 16 | "2256:leisure=stadium" 17 | "2257:leisure=ice_rink" 18 | ) 19 | 20 | for layer in "${LAYER[@]}" 21 | do 22 | CODE="${layer%%:*}" 23 | KVF="${layer##*:}" 24 | K="${KVF%%=*}" 25 | VF="${KVF##*=}" 26 | V="${VF%%>*}" 27 | F="${VF##*>}" 28 | N="${F%%-*}" 29 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 30 | common_query > "../../sql/$F.sql" 31 | done 32 | 33 | CODE=2253 34 | N=swimming_pool 35 | F=swimming_pool 36 | EXTRA_CONSTRAINTS=" 37 | AND ( 38 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='swimming_pool') 39 | OR 40 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'leisure' AND tags.value='swimming_pool') 41 | OR 42 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'sport' AND tags.value='swimming') 43 | OR 44 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'leisure' AND tags.value='water_park') 45 | )" 46 | common_query > "../../sql/$F.sql" 47 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/traffic_calming/traffic_calming.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=traffic 5 | LAYER=( 6 | "5231:traffic_calming=hump" 7 | "5232:traffic_calming=bump" 8 | "5233:traffic_calming=table" 9 | "5234:traffic_calming=chicane" 10 | "5235:traffic_calming=cushion" 11 | ) 12 | 13 | for layer in "${LAYER[@]}" 14 | do 15 | CODE="${layer%%:*}" 16 | KVF="${layer##*:}" 17 | K="${KVF%%=*}" 18 | VF="${KVF##*=}" 19 | V="${VF%%>*}" 20 | F="${VF##*>}" 21 | N="${F%%-*}" 22 | NAME_PREFIX=calming_ 23 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 24 | common_query > "../../sql/$NAME_PREFIX$F.sql" 25 | done 26 | 27 | #5230 28 | CODE=5230 29 | N=calming 30 | F=calming 31 | NAME_PREFIX="" 32 | EXTRA_CONSTRAINTS=" 33 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming') 34 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming' AND tags.value='hump') 35 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 
'traffic_calming' AND tags.value='bump') 36 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming' AND tags.value='table') 37 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming' AND tags.value='chicane') 38 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'traffic_calming' AND tags.value='cushion')" 39 | common_query > "../../sql/$F.sql" 40 | -------------------------------------------------------------------------------- /dags/schemas/nodes_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type": "INTEGER", 4 | "name": "id", 5 | "description": "Object unique ID." 6 | }, 7 | { 8 | "type": "INTEGER", 9 | "name": "version", 10 | "description": "Version number for this object." 11 | }, 12 | { 13 | "type": "STRING", 14 | "name": "username", 15 | "description": "Name of user who created this version of the object." 16 | }, 17 | { 18 | "type": "INTEGER", 19 | "name": "changeset", 20 | "description": "Changeset number for this object." 21 | }, 22 | { 23 | "type": "BOOLEAN", 24 | "name": "visible", 25 | "description": "Is this version of the object visible?" 26 | }, 27 | { 28 | "type": "TIMESTAMP", 29 | "name": "osm_timestamp", 30 | "description": "Last-modified timestamp for this object." 31 | }, 32 | { 33 | "type": "GEOGRAPHY", 34 | "name": "geometry", 35 | "description": "GEOGRAPHY-encoded point" 36 | }, 37 | { 38 | "type": "RECORD", 39 | "mode": "REPEATED", 40 | "name": "all_tags", 41 | "description": "Unstructured key=value attributes for this object.", 42 | "fields": [ 43 | { 44 | "type": "STRING", 45 | "name": "key", 46 | "description": "Attribute key." 47 | }, 48 | { 49 | "type": "STRING", 50 | "name": "value", 51 | "description": "Attribute value." 52 | } 53 | ] 54 | }, 55 | { 56 | "description": null, 57 | "name": "latitude", 58 | "type": "NUMERIC" 59 | }, 60 | { 61 | "description": null, 62 | "name": "longitude", 63 | "type": "NUMERIC" 64 | } 65 | ] -------------------------------------------------------------------------------- /dags/schemas/ways_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type":"INTEGER", 4 | "name":"id", 5 | "description": "Object unique ID." 6 | }, 7 | { 8 | "type":"INTEGER", 9 | "name":"version", 10 | "description": "Version number for this object." 11 | }, 12 | { 13 | "type":"STRING", 14 | "name":"username", 15 | "description": "Name of user who created this version of the object." 16 | }, 17 | { 18 | "type":"INTEGER", 19 | "name":"changeset", 20 | "description": "Changeset number for this object." 21 | }, 22 | { 23 | "type":"BOOLEAN", 24 | "name":"visible", 25 | "description": "Is this version of the object visible?" 26 | }, 27 | { 28 | "type":"TIMESTAMP", 29 | "name":"osm_timestamp", 30 | "description": "Last-modified timestamp for this object." 
31 | }, 32 | { 33 | "type":"GEOGRAPHY", 34 | "name":"geometry", 35 | "description": "GEOGRAPHY-encoded bounding box" 36 | }, 37 | { 38 | "type":"RECORD", 39 | "mode":"REPEATED", 40 | "name":"nodes", 41 | "fields":[ 42 | { 43 | "type":"INTEGER", 44 | "name":"id", 45 | "description": "Nodes that are part of this way" 46 | } 47 | ] 48 | }, 49 | { 50 | "type":"RECORD", 51 | "mode":"REPEATED", 52 | "name":"all_tags", 53 | "description": "Unstructured key=value attributes for this object.", 54 | "fields":[ 55 | { 56 | "type":"STRING", 57 | "name":"key", 58 | "description": "Attribute key." 59 | }, 60 | { 61 | "type":"STRING", 62 | "name":"value", 63 | "description": "Attribute value." 64 | } 65 | ] 66 | } 67 | ] -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/land_use/land_use.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=land_use 5 | LAYER=( 6 | "7201:landuse=forest>forest-landuse" 7 | "7201:natural=wood>forest-natural" 8 | "7202:leisure=park>park-park" 9 | "7202:leisure=common>park-common" 10 | "7203:landuse=residential" 11 | "7204:landuse=industrial" 12 | "7206:amenity=grave_yard>cemetery-amenity" 13 | "7206:landuse=cemetery>cemetery-landuse" 14 | "7207:landuse=allotments" 15 | "7208:landuse=meadow" 16 | "7209:landuse=commercial" 17 | "7210:leisure=nature_reserve" 18 | "7211:leisure=recreation_ground>recreation_ground-leisure" 19 | "7211:landuse=recreation_ground>recreation_ground-landuse" 20 | "7212:landuse=retail" 21 | "7213:landuse=military" 22 | "7214:landuse=quarry" 23 | "7215:landuse=orchard" 24 | "7216:landuse=vineyard" 25 | "7217:landuse=scrub" 26 | "7218:landuse=grass" 27 | "7219:landuse=heath" 28 | "7220:boundary=national_park" 29 | "7221:landuse=basin" 30 | "7222:landuse=village_green" 31 | "7223:landuse=plant_nursery" 32 | "7224:landuse=brownfield" 33 | "7225:landuse=greenfield" 34 | "7226:landuse=construction" 35 | "7227:landuse=railway" 36 | "7228:landuse=farmland" 37 | "7229:landuse=farmyard" 38 | 39 | ) 40 | 41 | for layer in "${LAYER[@]}" 42 | do 43 | CODE="${layer%%:*}" 44 | KVF="${layer##*:}" 45 | K="${KVF%%=*}" 46 | VF="${KVF##*=}" 47 | V="${VF%%>*}" 48 | F="${VF##*>}" 49 | N="${F%%-*}" 50 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 51 | common_query > "../../sql/$F.sql" 52 | done 53 | -------------------------------------------------------------------------------- /examples/clustering/tf_idf/vectorize.sql: -------------------------------------------------------------------------------- 1 | WITH objects_with_terms AS (SELECT osm_id, geometry, term 2 | FROM `gcp-pdp-osm-dev.osm_cities.cities_objects` as objects 3 | JOIN UNNEST(SPLIT(CONCAT(layer_class, "_", layer_name), "_")) as term 4 | WHERE objects.city_name = 'Kyiv') 5 | , data AS ( 6 | SELECT 7 | grid.geo_id, 8 | objects.term 9 | FROM 10 | objects_with_terms AS objects, 11 | `gcp-pdp-osm-dev.osm_cities.cities_population_grid_1km` as grid 12 | WHERE ST_INTERSECTS(grid.geog, objects.geometry) 13 | ) 14 | , counts AS (SELECT 15 | geo_id, 16 | term, 17 | COUNT(term) OVER(partition by CONCAT(geo_id, term)) as term_count, 18 | COUNT(term) OVER(partition by geo_id) as terms_in_cell 19 | FROM data) 20 | , tf AS (SELECT geo_id, term, ANY_VALUE(term_count)/ANY_VALUE(terms_in_cell) as tf 21 | FROM counts 22 | GROUP BY geo_id, term) 23 | , term_in_cells AS ( 24 | SELECT 
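-- document frequency: the number of distinct grid cells that contain each term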
term, COUNT(DISTINCT geo_id) in_cells 25 | FROM data 26 | GROUP BY 1 27 | ) 28 | , total_cells AS ( 29 | SELECT COUNT(DISTINCT geo_id) total_cells 30 | FROM data 31 | ) 32 | , idf AS ( 33 | SELECT term, LOG(total_cells.total_cells/in_cells) idf 34 | FROM term_in_cells 35 | CROSS JOIN total_cells 36 | ) 37 | , tf_idf AS ( 38 | SELECT 39 | geo_id, 40 | term, 41 | tf.tf * idf.idf tfidf, 42 | CONCAT(term, ': ', CAST(tf.tf * idf.idf AS STRING)) as term_and_tfidf 43 | FROM tf 44 | JOIN idf 45 | USING(term) 46 | ORDER BY geo_id, tfidf DESC 47 | ) 48 | , features_matrix AS (SELECT geo_id, word 49 | FROM `gcp-pdp-osm-dev.words.w2v_glove_6B_300d_osm_tags` 50 | CROSS JOIN (SELECT geo_id FROM data GROUP BY geo_id) 51 | ORDER BY geo_id, word) 52 | SELECT 53 | fm.geo_id, ARRAY_AGG(fm.word ORDER BY fm.word) as words, ARRAY_AGG(IFNULL(tf_idf.tfidf, 0.0) ORDER BY fm.word) as tfidf_vec 54 | FROM features_matrix fm 55 | LEFT JOIN tf_idf ON tf_idf.term = fm.word AND tf_idf.geo_id = fm.geo_id 56 | GROUP BY geo_id 57 | ORDER BY geo_id -------------------------------------------------------------------------------- /dags/schemas/relations_table_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type":"INTEGER", 4 | "name":"id", 5 | "description": "Object unique ID." 6 | }, 7 | { 8 | "type":"INTEGER", 9 | "name":"version", 10 | "description": "Version number for this object." 11 | }, 12 | { 13 | "type":"STRING", 14 | "name":"username", 15 | "description": "Name of user who created this version of the object." 16 | }, 17 | { 18 | "type":"INTEGER", 19 | "name":"changeset", 20 | "description": "Changeset number for this object." 21 | }, 22 | { 23 | "type":"BOOLEAN", 24 | "name":"visible", 25 | "description": "Is this version of the object visible?" 26 | }, 27 | { 28 | "type":"TIMESTAMP", 29 | "name":"osm_timestamp", 30 | "description": "Last-modified timestamp for this object." 31 | }, 32 | { 33 | "type":"GEOGRAPHY", 34 | "name":"geometry", 35 | "description": "GEOGRAPHY-encoded bounding box" 36 | }, 37 | { 38 | "type":"RECORD", 39 | "mode":"REPEATED", 40 | "name":"members", 41 | "fields":[ 42 | { 43 | "type":"STRING", 44 | "name":"type", 45 | "description": null 46 | }, 47 | { 48 | "type":"INTEGER", 49 | "name":"id", 50 | "description": "Relations that are part of this relation" 51 | }, 52 | { 53 | "type":"STRING", 54 | "name":"role", 55 | "description": "Role of this relation, if any." 56 | } 57 | ] 58 | }, 59 | { 60 | "type":"RECORD", 61 | "mode":"REPEATED", 62 | "name":"all_tags", 63 | "description": "Unstructured key=value attributes for this object.", 64 | "fields":[ 65 | { 66 | "type":"STRING", 67 | "name":"key", 68 | "description": "Attribute key." 69 | }, 70 | { 71 | "type":"STRING", 72 | "name":"value", 73 | "description": "Attribute value." 74 | } 75 | ] 76 | } 77 | ] -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/schema/layers_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type":"INTEGER", 4 | "name":"layer_code", 5 | "description": "Geofabrik layer code. Example: 1003. Layers are hierarchical; The mask code=10xx corresponds to 'place' layers, and code=1003 corresponds to a 'village' type of place." 6 | }, 7 | { 8 | "type":"STRING", 9 | "name":"layer_class", 10 | "description": "Geofabrik layer class, a friendly name for layer_code. 
Example: 'place'" 11 | }, 12 | { 13 | "type":"STRING", 14 | "name":"layer_name", 15 | "description": "Geofabrik layer name, a friendly name for layer_code. Example: 'village'" 16 | }, 17 | { 18 | "type":"STRING", 19 | "name":"gdal_type", 20 | "description":"OpenStreetMap feature type. One of: point, line, multilinestring, multipolygon, other_relation" 21 | }, 22 | { 23 | "type":"INTEGER", 24 | "name":"osm_id", 25 | "description": "OSM Id taken from the id of this feature (node_id or relation_id) in the OSM database." 26 | }, 27 | { 28 | "type":"INTEGER", 29 | "name":"osm_way_id", 30 | "description": "OSM Way Id taken from the id of this feature (way_id) in the OSM database." 31 | }, 32 | { 33 | "type":"INTEGER", 34 | "name":"osm_version", 35 | "description": "Version number for this object." 36 | }, 37 | { 38 | "type":"TIMESTAMP", 39 | "name":"osm_timestamp", 40 | "description": "Last-modified timestamp for this object." 41 | }, 42 | { 43 | "type":"RECORD", 44 | "mode":"REPEATED", 45 | "name":"all_tags", 46 | "description": "Unstructured key=value attributes for this object.", 47 | "fields":[ 48 | { 49 | "type":"STRING", 50 | "name":"key", 51 | "description": "Attribute key." 52 | }, 53 | { 54 | "type":"STRING", 55 | "name":"value", 56 | "description": "Attribute value." 57 | } 58 | ] 59 | }, 60 | { 61 | "type":"GEOGRAPHY", 62 | "name":"geometry", 63 | "description": "GEOGRAPHY-encoded object" 64 | } 65 | ] 66 | -------------------------------------------------------------------------------- /utils/get_client_id.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import logging 4 | 5 | logging.getLogger().setLevel(logging.INFO) 6 | 7 | def get_client_id(project_id, location, composer_environment): 8 | import google.auth 9 | import google.auth.transport.requests 10 | import requests 11 | import six.moves.urllib.parse 12 | 13 | # Authenticate with Google Cloud. 14 | # See: https://cloud.google.com/docs/authentication/getting-started 15 | credentials, _ = google.auth.default( 16 | scopes=['https://www.googleapis.com/auth/cloud-platform']) 17 | authed_session = google.auth.transport.requests.AuthorizedSession( 18 | credentials) 19 | 20 | environment_url = ( 21 | 'https://composer.googleapis.com/v1beta1/projects/{}/locations/{}' 22 | '/environments/{}').format(project_id, location, composer_environment) 23 | composer_response = authed_session.request('GET', environment_url) 24 | environment_data = composer_response.json() 25 | airflow_uri = environment_data['config']['airflowUri'] 26 | 27 | # The Composer environment response does not include the IAP client ID. 28 | # Make a second, unauthenticated HTTP request to the web server to get the 29 | # redirect URI. 30 | redirect_response = requests.get(airflow_uri, allow_redirects=False) 31 | redirect_location = redirect_response.headers['location'] 32 | 33 | # Extract the client_id query parameter from the redirect. 
34 |     parsed = six.moves.urllib.parse.urlparse(redirect_location)
35 |     query_string = six.moves.urllib.parse.parse_qs(parsed.query)
36 |     return query_string['client_id'][0]
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     parser = argparse.ArgumentParser(
41 |         description=__doc__,
42 |         formatter_class=argparse.RawDescriptionHelpFormatter)
43 |     parser.add_argument('project_id', help='Your Project ID.')
44 |     parser.add_argument(
45 |         'location', help='Region of the Cloud Composer environment.')
46 |     parser.add_argument(
47 |         'composer_environment', help='Name of the Cloud Composer environment.')
48 | 
49 |     args = parser.parse_args()
50 |     logging.info(args)
51 |     client_id = get_client_id(args.project_id, args.location, args.composer_environment)
52 |     print(client_id)
53 |     sys.exit(0)
54 | 
--------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/traffic/traffic.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | source ../query_templates.sh
3 | 
4 | CLASS=traffic
5 | LAYER=(
6 |   "5201:highway=traffic_signals"
7 |   "5202:highway=mini_roundabout"
8 |   "5203:highway=stop"
9 |   "5204:highway=crossing>crossing-highway"
10 |   "5204:railway=level_crossing>crossing-railway"
11 |   "5205:highway=ford"
12 |   "5206:highway=motorway_junction"
13 |   "5207:highway=turning_circle"
14 |   "5208:highway=speed_camera"
15 |   "5209:highway=street_lamp"
16 |   "5250:amenity=fuel"
17 |   "5251:highway=services>services"
18 |   "5251:highway=service>service"
19 |   "5270:amenity=bicycle_parking"
20 | )
21 | 
22 | for layer in "${LAYER[@]}"
23 | do
24 |   CODE="${layer%%:*}"
25 |   KVF="${layer##*:}"
26 |   K="${KVF%%=*}"
27 |   VF="${KVF##*=}"
28 |   V="${VF%%>*}"
29 |   F="${VF##*>}"
30 |   N="${F%%-*}"
31 |   EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')"
32 |   common_query > "../../sql/$F.sql"
33 | done
34 | 
35 | LAYER=(
36 |   "5261:parking=surface>parking_site"
37 |   "5262:parking=multi-storey>parking_multistorey"
38 |   "5263:parking=underground>parking_underground"
39 | )
40 | for layer in "${LAYER[@]}"
41 | do
42 |   CODE="${layer%%:*}"
43 |   KVF="${layer##*:}"
44 |   K="${KVF%%=*}"
45 |   VF="${KVF##*=}"
46 |   V="${VF%%>*}"
47 |   F="${VF##*>}"
48 |   N="${F%%-*}"
49 |   EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')
50 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='parking')"
51 |   common_query > "../../sql/$F.sql"
52 | done
53 | 
54 | 
55 | CODE=5260
56 | N=parking
57 | F=parking
58 | EXTRA_CONSTRAINTS="
59 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='parking')
60 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'parking' AND tags.value='surface')
61 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'parking' AND tags.value='multi-storey')
62 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'parking' AND tags.value='underground')"
63 | common_query > "../../sql/$F.sql"
64 | 
--------------------------------------------------------------------------------
/tasks_docker_images/osm_converter_with_history_index/src/gcs_service.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | 
4 | from google.cloud import storage
5 | 
6 | 
7 | def 
parse_uri_to_bucket_and_filename(file_path):
8 |     """Splits a file URI into its bucket name and file name"""
9 |     path_parts = file_path.split("//")
10 |     if len(path_parts) >= 2:
11 |         main_part = path_parts[1]
12 |         if "/" in main_part:
13 |             divide_index = main_part.index("/")
14 |             bucket_name = main_part[:divide_index]
15 |             file_name = main_part[divide_index + 1 - len(main_part):]
16 | 
17 |             return bucket_name, file_name
18 |     return "", ""
19 | 
20 | 
21 | def from_gcs_to_local_file(src_gcs_bucket, src_gcs_name, local_file_path):
22 |     storage_client = storage.Client(os.environ['PROJECT_ID'])
23 |     # Create a bucket object for our bucket
24 |     bucket = storage_client.get_bucket(src_gcs_bucket)
25 |     # Create a blob object from the filepath
26 |     blob = bucket.blob(src_gcs_name)
27 |     # Download the file to a destination
28 |     logging.info("Downloading gs://{}/{} to {}...".format(src_gcs_bucket, src_gcs_name, local_file_path))
29 |     blob.download_to_filename(local_file_path)
30 |     logging.info("Successfully downloaded gs://{}/{} to {}".format(src_gcs_bucket, src_gcs_name, local_file_path))
31 | 
32 | 
33 | def is_gcs_blob_exists(bucket, blob_name):
34 |     storage_client = storage.Client(os.environ['PROJECT_ID'])
35 |     # Create a bucket object for our bucket
36 |     bucket = storage_client.get_bucket(bucket)
37 |     # Create a blob object from the filepath
38 |     blob = bucket.blob(blob_name)
39 |     return blob.exists()
40 | 
41 | 
42 | def upload_file_to_gcs(filename, destination_bucket_name, destination_blob_name):
43 |     """
44 |     Uploads a file to a given Cloud Storage bucket
45 |     (the destination blob is created or overwritten; nothing is returned).
46 |     """
47 |     bucket = storage.Client().bucket(destination_bucket_name)
48 |     blob = bucket.blob(destination_blob_name)
49 |     logging.info("Uploading of {} to gs://{}/{}...".format(filename, destination_bucket_name, destination_blob_name))
50 |     blob.upload_from_filename(
51 |         filename,
52 |         content_type="text/plain")
53 |     logging.info(
54 |         "Finished uploading of {} to gs://{}/{}".format(filename, destination_bucket_name, destination_blob_name))
55 | 
--------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/transport/transport.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | source ../query_templates.sh
3 | 
4 | CLASS=transport
5 | LAYER=(
6 |   "5621:highway=bus_stop>bus_stop-highway"
7 |   "5622:amenity=bus_station"
8 |   "5641:amenity=taxi"
9 |   "5652:aeroway=airfield>airfield-aeroway"
10 |   "5652:military=airfield>airfield-military"
11 |   "5655:aeroway=helipad"
12 |   "5656:aeroway=apron"
13 |   "5661:amenity=ferry_terminal"
14 |   "5671:aerialway=station>aerialway_station"
15 | )
16 | 
17 | for layer in "${LAYER[@]}"
18 | do
19 |   CODE="${layer%%:*}"
20 |   KVF="${layer##*:}"
21 |   K="${KVF%%=*}"
22 |   VF="${KVF##*=}"
23 |   V="${VF%%>*}"
24 |   F="${VF##*>}"
25 |   N="${F%%-*}"
26 |   EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')"
27 |   common_query > "../../sql/$F.sql"
28 | done
29 | 
30 | CODE=5621
31 | N=bus_stop
32 | F=bus_stop-public_transport
33 | #highway=bus_stop, or public_transport=stop_position + bus=yes
34 | EXTRA_CONSTRAINTS="
35 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'public_transport' AND tags.value='stop_position')
36 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'bus' AND tags.value='yes')
37 | AND COALESCE(osm.id,osm.way_id) = COALESCE(f.osm_id,f.osm_way_id)"
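# Note: in addition to the tag filters, this variant correlates the osm row
# with the query template's f alias via the COALESCE'd node/way ids, so the
# public_transport variant lands in its own bus_stop-public_transport.sql.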
38 | common_query > "../../sql/$F.sql" 39 | 40 | CODE=5651 41 | N=airport 42 | F=airport 43 | #amenity=airport or aeroway=aerodrome unless type=airstrip 44 | EXTRA_CONSTRAINTS=" 45 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE (tags.key = 'type' AND tags.value='airstrip')) 46 | AND ( 47 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='airport') 48 | OR 49 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'aeroway' AND tags.value='aerodrome') 50 | )" 51 | common_query > "../../sql/$F.sql" 52 | 53 | CODE=5652 54 | N=airfield 55 | F=airfield-airstrip 56 | #aeroway=airfield, military=airfield, aeroway=aeroway with type=airstrip 57 | EXTRA_CONSTRAINTS=" 58 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'aeroway' AND tags.value='aeroway') 59 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'type' AND tags.value='airstrip')" 60 | common_query > "../../sql/$F.sql" 61 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/pofw/pofw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=pofw 5 | LAYER=( 6 | "3100:religion=christian" 7 | "3200:religion=jewish" 8 | "3300:religion=muslim" 9 | "3400:religion=buddhist" 10 | "3500:religion=hindu" 11 | "3600:religion=taoist" 12 | "3700:religion=shinto" 13 | "3800:religion=sikh" 14 | ) 15 | 16 | for layer in "${LAYER[@]}" 17 | do 18 | CODE="${layer%%:*}" 19 | KVF="${layer##*:}" 20 | K="${KVF%%=*}" 21 | VF="${KVF##*=}" 22 | V="${VF%%>*}" 23 | F="${VF##*>}" 24 | N="${F%%-*}" 25 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 26 | common_query > "../../sql/$F.sql" 27 | done 28 | 29 | LAYER=( 30 | "3101:denomination=anglican>christian_anglican" 31 | "3102:denomination=catholic>christian_catholic" 32 | "3103:denomination=evangelical>christian_evangelical" 33 | "3104:denomination=lutheran>christian_lutheran" 34 | "3105:denomination=methodist>christian_methodist" 35 | "3106:denomination=orthodox>christian_orthodox" 36 | "3107:denomination=protestant>christian_protestant" 37 | "3108:denomination=baptist>christian_baptist" 38 | "3109:denomination=mormon>christian_mormon" 39 | ) 40 | for layer in "${LAYER[@]}" 41 | do 42 | CODE="${layer%%:*}" 43 | KVF="${layer##*:}" 44 | K="${KVF%%=*}" 45 | VF="${KVF##*=}" 46 | V="${VF%%>*}" 47 | F="${VF##*>}" 48 | N="${F%%-*}" 49 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V') 50 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'religion' AND tags.value='christian')" 51 | common_query > "../../sql/$F.sql" 52 | done 53 | 54 | LAYER=( 55 | "3301:denomination=sunni>muslim_sunni" 56 | "3302:denomination=shia>muslim_shia" 57 | ) 58 | for layer in "${LAYER[@]}" 59 | do 60 | CODE="${layer%%:*}" 61 | KVF="${layer##*:}" 62 | K="${KVF%%=*}" 63 | VF="${KVF##*=}" 64 | V="${VF%%>*}" 65 | F="${VF##*>}" 66 | N="${F%%-*}" 67 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V') 68 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'religion' AND tags.value='muslim')" 69 | common_query > "../../sql/$F.sql" 70 | done 71 | -------------------------------------------------------------------------------- 
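All of the layered_gis generators in this image (pofw.sh above, traffic.sh, transport.sh, and the rest) share one convention: each LAYER entry is a spec of the form "CODE:key=value[>file[-suffix]]", which the scripts split apart with bash parameter expansion ("%%pat" strips the longest matching suffix, "##pat" the longest matching prefix). As a reading aid, here is a minimal Python sketch of the same parsing; parse_layer_spec is hypothetical and not a function that exists anywhere in this repo:

def parse_layer_spec(layer):
    """Hypothetical mirror of the bash expansions in the layered_gis scripts."""
    code = layer.partition(":")[0]       # CODE="${layer%%:*}"  (before first ':')
    kvf = layer.rpartition(":")[2]       # KVF="${layer##*:}"   (after last ':')
    key = kvf.partition("=")[0]          # K="${KVF%%=*}"       (before first '=')
    vf = kvf.rpartition("=")[2]          # VF="${KVF##*=}"      (after last '=')
    value = vf.partition(">")[0]         # V="${VF%%>*}"        (before first '>')
    sql_file = vf.rpartition(">")[2]     # F="${VF##*>}"        (all of VF if no '>')
    name = sql_file.partition("-")[0]    # N="${F%%-*}"         (before first '-')
    return code, key, value, sql_file, name

# parse_layer_spec("3301:denomination=sunni>muslim_sunni")
#   -> ("3301", "denomination", "sunni", "muslim_sunni", "muslim_sunni")
# parse_layer_spec("7201:natural=wood>forest-natural")
#   -> ("7201", "natural", "wood", "forest-natural", "forest")

Each script interpolates K and V into EXTRA_CONSTRAINTS, and common_query (defined in the sourced query_templates.sh, not shown here) writes the generated SQL to ../../sql/$F.sql; CODE and N presumably populate the layer_code and layer_name columns described in src/schema/layers_schema.json.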
/tasks_docker_images/osm_to_features/utils/get_client_id.py:
--------------------------------------------------------------------------------
1 | """Get the client ID associated with a Cloud Composer environment."""
2 | 
3 | import argparse
4 | 
5 | 
6 | def get_client_id(project_id, location, composer_environment):
7 |     # [START composer_get_environment_client_id]
8 |     import google.auth
9 |     import google.auth.transport.requests
10 |     import requests
11 |     import urllib.parse
12 | 
13 |     # Authenticate with Google Cloud.
14 |     # See: https://cloud.google.com/docs/authentication/getting-started
15 |     credentials, _ = google.auth.default(
16 |         scopes=['https://www.googleapis.com/auth/cloud-platform'])
17 |     authed_session = google.auth.transport.requests.AuthorizedSession(
18 |         credentials)
19 | 
20 |     # project_id = 'YOUR_PROJECT_ID'
21 |     # location = 'us-central1'
22 |     # composer_environment = 'YOUR_COMPOSER_ENVIRONMENT_NAME'
23 | 
24 |     environment_url = (
25 |         'https://composer.googleapis.com/v1beta1/projects/{}/locations/{}'
26 |         '/environments/{}').format(project_id, location, composer_environment)
27 |     composer_response = authed_session.request('GET', environment_url)
28 |     environment_data = composer_response.json()
29 |     airflow_uri = environment_data['config']['airflowUri']
30 | 
31 |     # The Composer environment response does not include the IAP client ID.
32 |     # Make a second, unauthenticated HTTP request to the web server to get the
33 |     # redirect URI.
34 |     redirect_response = requests.get(airflow_uri, allow_redirects=False)
35 |     redirect_location = redirect_response.headers['location']
36 | 
37 |     # Extract the client_id query parameter from the redirect.
38 |     parsed = urllib.parse.urlparse(redirect_location)
39 |     query_string = urllib.parse.parse_qs(parsed.query)
40 |     print(query_string['client_id'][0])
41 |     # [END composer_get_environment_client_id]
42 | 
43 | 
44 | if __name__ == '__main__':
45 |     parser = argparse.ArgumentParser(
46 |         description=__doc__,
47 |         formatter_class=argparse.RawDescriptionHelpFormatter)
48 |     parser.add_argument('project_id', help='Your Project ID.')
49 |     parser.add_argument(
50 |         'location', help='Region of the Cloud Composer environment.')
51 |     parser.add_argument(
52 |         'composer_environment', help='Name of the Cloud Composer environment.')
53 | 
54 |     args = parser.parse_args()
55 |     get_client_id(
56 |         args.project_id, args.location, args.composer_environment)
--------------------------------------------------------------------------------
/tasks_docker_images/osm_to_features/src/csv_to_json/geojson-csv-to-json.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use strict;
3 | use warnings;
4 | use JSON;
5 | use Text::CSV::Encoded;
6 | use Encode qw( decode );
7 | 
8 | 
9 | #geometry,osm_id,osm_way_id,osm_version,osm_timestamp,all_tags
10 | 
11 | my $csv = Text::CSV::Encoded->new({ sep_char => ',',escape_char => '"',encoding_in => "iso-8859-1" });
12 | my @field_names;
13 | my $i = 1;
14 | while (my $line = <>) {
15 |     chomp $line;
16 |     $line = decode( 'iso-8859-1', $line );
17 |     if ( $i == 1 ) {
18 |         @field_names = split /,/, $line;
19 |     }
20 |     else {
21 |         if ($csv->parse($line)) {
22 |             my @fields = $csv->fields();
23 |             my $geometry = JSON::decode_json($fields[0]);
24 |             my $osm_id = $fields[1];
25 |             my $osm_way_id = $fields[2];
26 |             my $osm_version = $fields[3];
27 |             my $osm_timestamp = $fields[4];
28 |             my $all_tags = $fields[5];
29 |             $all_tags =~ s/""/\\"/g;
30 |             $all_tags =~ s/\r/\\r/gs;
31 |             $all_tags =~ 
s/\t/\\t/gs;
32 |             $all_tags =~ s/\\\\/DOUBLEBACKSLASH/g;
33 |             my @tags = ();
34 |             while ( $all_tags =~ m/\G.*?"(.*?[^\\])"=>"(.*?[^\\])"(,|$)/g ) {
35 |                 my $k = $1;
36 |                 my $v = $2;
37 |                 if ( $v =~ m/\\"$/ ) { warn "MATCHED '\\'"; $v .= '"'; }
38 |                 $k =~ s/DOUBLEBACKSLASH/\\\\/;
39 |                 $v =~ s/DOUBLEBACKSLASH/\\\\/;
40 |                 #warn "$k\t=>\t$v";
41 |                 push @tags, {"key" => $k, "value" => $v};
42 |             }
43 |             my $json_tags = '[' . join(',', map { '{"key":"' . $_->{key} . '","value":"' . $_->{value} . '"}' } @tags) . ']';
44 |             #$all_tags = '[' . join(",",(map{qq({"key":"$_","value":"$at{$_}"})} keys %at)) . ']';
45 |             eval {
46 |                 $json_tags = JSON::encode_json(JSON::decode_json($json_tags));
47 |             };
48 |             if ( $@ ) {
49 |                 print STDERR "failed to JSON encode at line $i: $@, offending data:\n";
50 |                 print STDERR "\torig: $all_tags\n";
51 |                 print STDERR "\tjson: $json_tags\n";
52 |             }
53 |             else {
54 |                 my $genc = JSON::encode_json($geometry);
55 |                 $genc =~ s/"/\\"/g;
56 |                 print sprintf(qq({"geometry":"%s","osm_id":"%s","osm_way_id":"%s","osm_version":%d,"osm_timestamp":"%s","all_tags":%s}\n),
57 |                     $genc, $osm_id, $osm_way_id, $osm_version, $osm_timestamp, $json_tags
58 |                 );
59 |             }
60 |         }
61 |         else {
62 |             print STDERR "failed to parse $i:\t$line\n";
63 |         }
64 |     }
65 |     $i++;
66 | }
67 | 
--------------------------------------------------------------------------------
/dags/utils/gcs_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | 
4 | from google.cloud import storage
5 | 
6 | 
7 | def parse_uri_to_bucket_and_filename(file_path):
8 |     """Splits a file URI into its bucket name and file name"""
9 |     path_parts = file_path.split("//")
10 |     if len(path_parts) >= 2:
11 |         main_part = path_parts[1]
12 |         if "/" in main_part:
13 |             divide_index = main_part.index("/")
14 |             bucket_name = main_part[:divide_index]
15 |             file_name = main_part[divide_index + 1 - len(main_part):]
16 | 
17 |             return bucket_name, file_name
18 |         else:
19 |             raise Exception("Wrong file_path format: {}".format(file_path))
20 |     else:
21 |         raise Exception("Wrong file_path format: {}".format(file_path))
22 | 
23 | 
24 | def from_gcs_to_local_file(src_gcs_bucket, src_gcs_name, local_file_path):
25 |     storage_client = storage.Client(os.environ['PROJECT_ID'])
26 |     # Create a bucket object for our bucket
27 |     bucket = storage_client.get_bucket(src_gcs_bucket)
28 |     # Create a blob object from the filepath
29 |     blob = bucket.blob(src_gcs_name)
30 |     # Download the file to a destination
31 |     logging.info("Downloading gs://{}/{} to {}...".format(src_gcs_bucket, src_gcs_name, local_file_path))
32 |     blob.download_to_filename(local_file_path)
33 |     logging.info("Successfully downloaded gs://{}/{} to {}".format(src_gcs_bucket, src_gcs_name, local_file_path))
34 | 
35 | 
36 | def is_gcs_blob_exists(bucket, blob_name):
37 |     storage_client = storage.Client(os.environ['PROJECT_ID'])
38 |     # Create a bucket object for our bucket
39 |     bucket = storage_client.get_bucket(bucket)
40 |     # Create a blob object from the filepath
41 |     blob = bucket.blob(blob_name)
42 |     return blob.exists()
43 | 
44 | 
45 | def upload_file_to_gcs(filename, destination_bucket_name, destination_blob_name):
46 |     """
47 |     Uploads a file to a given Cloud Storage bucket
48 |     (the destination blob is created or overwritten; nothing is returned).
49 | """ 50 | bucket = storage.Client().bucket(destination_bucket_name) 51 | blob = bucket.blob(destination_blob_name) 52 | logging.info("Uploading of {} to gs://{}/{}...".format(filename, destination_bucket_name, destination_blob_name)) 53 | blob.upload_from_filename( 54 | filename, 55 | content_type="text/plain") 56 | logging.info( 57 | "Finished uploading of {} to gs://{}/{}".format(filename, destination_bucket_name, destination_blob_name)) -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/traffic_barrier/traffic_barrier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=traffic 5 | LAYER=( 6 | "5211:barrier=gate" 7 | "5212:barrier=bollard" 8 | "5213:barrier=lift_gate" 9 | "5214:barrier=stile>stile-barrier" 10 | "5214:highway=stile>stile-highway" 11 | "5215:barrier=cycle_barrier>cycle" 12 | "5216:barrier=fence" 13 | "5217:barrier=toll_booth>toll" 14 | "5218:barrier=block" 15 | "5219:barrier=kissing_gate" 16 | "5220:barrier=cattle_grid" 17 | ) 18 | 19 | for layer in "${LAYER[@]}" 20 | do 21 | CODE="${layer%%:*}" 22 | KVF="${layer##*:}" 23 | K="${KVF%%=*}" 24 | VF="${KVF##*=}" 25 | V="${VF%%>*}" 26 | F="${VF##*>}" 27 | N="${F%%-*}" 28 | NAME_PREFIX=barrier_ 29 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 30 | common_query > "../../sql/$NAME_PREFIX$F.sql" 31 | done 32 | 33 | CODE=5210 34 | V=barrier 35 | N=barrier 36 | F=barrier 37 | NAME_PREFIX="" 38 | EXTRA_CONSTRAINTS=" 39 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K') 40 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='gate') 41 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='bollard') 42 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='lift_gate') 43 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='stile') 44 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'highway' AND tags.value='stile') 45 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='cycle_barrier') 46 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='fence') 47 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='toll_booth') 48 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='block') 49 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='kissing_gate') 50 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'barrier' AND tags.value='cattle_grid')" 51 | common_query > "../../sql/$F.sql" 52 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_public/poi_public.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_public 5 | LAYER=( 6 | "2001:amenity=police" 7 | "2002:amenity=fire_station" 8 | "2004:amenity=post_box" 9 | "2005:amenity=post_office" 10 | "2006:amenity=telephone" 11 | 
"2007:amenity=library" 12 | "2008:amenity=townhall>town_hall" 13 | "2009:amenity=courthouse" 14 | "2010:amenity=prison" 15 | "2011:amenity=embassy" 16 | "2012:amenity=community_centre" 17 | "2013:amenity=nursing_home" 18 | "2014:amenity=arts_centre" 19 | "2015:amenity=grave_yard>graveyard-amenity" 20 | "2015:landuse=cemetery>graveyard-landuse" 21 | "2016:amenity=marketplace" 22 | "2081:amenity=university" 23 | "2082:amenity=school" 24 | "2083:amenity=kindergarten" 25 | "2084:amenity=college" 26 | "2099:amenity=public_building" 27 | ) 28 | 29 | for layer in "${LAYER[@]}" 30 | do 31 | CODE="${layer%%:*}" 32 | KVF="${layer##*:}" 33 | K="${KVF%%=*}" 34 | VF="${KVF##*=}" 35 | V="${VF%%>*}" 36 | F="${VF##*>}" 37 | N="${F%%-*}" 38 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 39 | common_query > "../../sql/$F.sql" 40 | done 41 | 42 | LAYER=( 43 | "2031:glass=yes>recycling_glass" 44 | "2032:paper=yes>recycling_paper" 45 | "2033:clothes=yes>recycling_clothes" 46 | "2034:metal=yes>recycling_metal" 47 | ) 48 | for layer in "${LAYER[@]}" 49 | do 50 | CODE="${layer%%:*}" 51 | KVF="${layer##*:}" 52 | K="${KVF%%=*}" 53 | VF="${KVF##*=}" 54 | V="${VF%%>*}" 55 | F="${VF##*>}" 56 | N="${F%%-*}" 57 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'recycling:$K' AND tags.value='$V')" 58 | common_query > "../../sql/$F.sql" 59 | done 60 | 61 | CODE=2030 62 | N=recycling 63 | F=recycling 64 | EXTRA_CONSTRAINTS=" 65 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'amenity' AND tags.value='recycling') 66 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'recycling:glass' AND tags.value='yes') 67 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'recycling:paper' AND tags.value='yes') 68 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'recycling:clothes' AND tags.value='yes') 69 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'recycling:scrap_metal' AND tags.value='yes') 70 | " 71 | common_query > "../../sql/$F.sql" 72 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_shopping/poi_shopping.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_shopping 5 | LAYER=( 6 | "2501:shop=supermarket" 7 | "2502:shop=bakery" 8 | "2503:shop=kiosk" 9 | "2504:shop=mall" 10 | "2505:shop=department_store" 11 | "2511:shop=convenience" 12 | "2512:shop=clothes" 13 | "2513:shop=florist" 14 | "2514:shop=chemist" 15 | "2515:shop=books" 16 | "2516:shop=butcher" 17 | "2517:shop=shoes" 18 | "2518:shop=alcohol>beverages-alcohol" 19 | "2518:shop=beverages>beverages-beverages" 20 | "2519:shop=optician" 21 | "2520:shop=jewelry" 22 | "2521:shop=gift" 23 | "2522:shop=sports" 24 | "2523:shop=stationery" 25 | "2524:shop=outdoor" 26 | "2525:shop=mobile_phone" 27 | "2526:shop=toys" 28 | "2527:shop=newsagent" 29 | "2528:shop=greengrocer" 30 | "2529:shop=beauty" 31 | "2530:shop=video" 32 | "2541:shop=car" 33 | "2542:shop=bicycle" 34 | "2543:shop=doityourself>doityourself-doityourself" 35 | "2543:shop=hardware>doityourself-hardware" 36 | "2544:shop=furniture" 37 | "2546:shop=computer" 38 | "2547:shop=garden_centre" 39 | "2561:shop=hairdresser" 40 | "2562:shop=car_repair" 41 | "2563:amenity=car_rental" 42 | 
"2564:amenity=car_wash" 43 | "2565:amenity=car_sharing" 44 | "2566:amenity=bicycle_rental" 45 | "2567:shop=travel_agency" 46 | "2568:shop=laundry>laundry-laundry" 47 | "2568:shop=dry_cleaning>laundry-dry_cleaning" 48 | "2591:vending=cigarettes>vending_cigarette" 49 | "2592:vending=parking_tickets>vending_parking" 50 | ) 51 | 52 | for layer in "${LAYER[@]}" 53 | do 54 | CODE="${layer%%:*}" 55 | KVF="${layer##*:}" 56 | K="${KVF%%=*}" 57 | VF="${KVF##*=}" 58 | V="${VF%%>*}" 59 | F="${VF##*>}" 60 | N="${F%%-*}" 61 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 62 | common_query > "../../sql/$F.sql" 63 | done 64 | 65 | CODE=2590 66 | N=vending_machine 67 | F=vending_machine 68 | EXTRA_CONSTRAINTS=" 69 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='vending_machine') 70 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'vending' AND tags.value='cigarettes') 71 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'vending' AND tags.value='parking_tickets')" 72 | common_query > "../../sql/$F.sql" 73 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/power/power.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=power 5 | LAYER=( 6 | "6411:source=nuclear>station_nuclear" 7 | "6412:source=solar>station_solar-solar" 8 | "6413:source=gas>station_fossil-gas" 9 | "6413:source=coal>station_fossil-coal" 10 | "6413:source=oil>station_fossil-oil" 11 | "6413:source=diesel>station_fossil-diesel" 12 | "6414:source=hydro>station_water-generator" 13 | "6415:source=wind>station_wind-generator" 14 | ) 15 | for layer in "${LAYER[@]}" 16 | do 17 | CODE="${layer%%:*}" 18 | KVF="${layer##*:}" 19 | K="${KVF%%=*}" 20 | VF="${KVF##*=}" 21 | V="${VF%%>*}" 22 | F="${VF##*>}" 23 | N="${F%%-*}" 24 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'generator:$K' AND tags.value='$V')" 25 | common_query > "../../sql/$F.sql" 26 | done 27 | 28 | LAYER=( 29 | "6204:power=pole>pole" 30 | "6401:power=tower>tower" 31 | "6412:power_source=photovoltaic>station_solar-photovoltaic" 32 | "6414:power_source=hydro>station_water-power" 33 | "6415:power_source=wind>station_wind-power" 34 | "6422:power=station>substation-station" 35 | "6422:power=sub_station>substation-sub_station" 36 | "6423:power=transformer>transformer" 37 | ) 38 | for layer in "${LAYER[@]}" 39 | do 40 | CODE="${layer%%:*}" 41 | KVF="${layer##*:}" 42 | K="${KVF%%=*}" 43 | VF="${KVF##*=}" 44 | V="${VF%%>*}" 45 | F="${VF##*>}" 46 | N="${F%%-*}" 47 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 48 | common_query > "../../sql/$F.sql" 49 | done 50 | 51 | 52 | CODE=6410 53 | N=station 54 | F=station 55 | EXTRA_CONSTRAINTS=" 56 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'power' AND tags.value='generator') 57 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE 58 | ( tags.key = 'generator:source' AND tags.value = 'nuclear' ) 59 | OR ( (tags.key = 'generator:source' AND tags.value = 'solar') OR (tags.key = 'power_source' AND tags.value = 'photovoltaic') ) 60 | OR ( tags.key = 'generator:source' AND tags.value IN ('gas','coal','oil','diesel') ) 61 | OR ( (tags.key = 
'generator:source' AND tags.value = 'hydro') OR (tags.key = 'power_source' AND tags.value = 'hydro') ) 62 | OR ( (tags.key = 'generator:source' AND tags.value = 'wind') OR (tags.key = 'power_source' AND tags.value = 'wind') ) 63 | OR ( (tags.key = 'power' AND tags.value = 'station') OR (tags.key = 'power' AND tags.value = 'sub_station') ) 64 | OR ( tags.key = 'power' AND tags.value = 'transformer' ) 65 | )" 66 | common_query > "../../sql/$F.sql" 67 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/gdal/gdal_handler.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import logging 3 | import json 4 | import time 5 | import os 6 | 7 | class GDALHandler(object): 8 | 9 | def __init__(self, script_path, config_path, work_dir): 10 | self.script_path = script_path 11 | self.config_path = config_path 12 | self.work_dir = work_dir 13 | self.type_layers = {"ways": ["lines", "multipolygons"], 14 | "relations": ["multipolygons", "other_relations", "points", "multilinestrings", "lines"]} 15 | 16 | def osm_to_geojson(self, src_osm_filename, entity_type, result_ids): 17 | def geometry_from_geojson_features(geojson_features, feature_index): 18 | return geojson_features[feature_index]["properties"]["geometry"] 19 | try: 20 | file_size = os.path.getsize(src_osm_filename) 21 | except Exception: 22 | file_size = -1 23 | logging.info("Working with {}, size: {}".format(src_osm_filename, str(file_size))) 24 | start = time.time() 25 | 26 | id_geometry_map = {} 27 | layers = self.type_layers[entity_type] 28 | for layer in layers: 29 | temp_geojson_file_name = self.work_dir + "{}.geojson".format(layer) 30 | cmd = "sh {} {} {} {} {}".format(self.script_path, self.config_path, src_osm_filename, 31 | temp_geojson_file_name, layer) 32 | process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) 33 | process.communicate() 34 | 35 | geojson_file = open(temp_geojson_file_name, "r") 36 | geojson_data = json.load(geojson_file) 37 | geojson_file.close() 38 | os.remove(temp_geojson_file_name) 39 | 40 | geojson_features = geojson_data["features"] 41 | if len(geojson_features) > 0: 42 | for index in range(len(geojson_features)): 43 | current_id = geojson_features[index]["properties"]["osm_id"] 44 | if not current_id: 45 | current_id = geojson_features[index]["properties"]["osm_way_id"] 46 | current_id = int(current_id) 47 | if current_id in result_ids: 48 | id_geometry_map[current_id] = geometry_from_geojson_features(geojson_features, index) 49 | result_ids.remove(current_id) 50 | if len(result_ids) == 0: 51 | break 52 | if len(result_ids) == 0: 53 | break 54 | os.remove(src_osm_filename) 55 | logging.info("Finish working with {}. 
Time spent: {}s".format(src_osm_filename, (time.time() - start))) 56 | return id_geometry_map 57 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/poi_tourism/poi_tourism.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=poi_tourism 5 | LAYER=( 6 | "2721:tourism=attraction" 7 | "2722:tourism=museum" 8 | "2723:historic=monument" 9 | "2724:historic=memorial" 10 | "2725:tourism=artwork>art" 11 | "2731:historic=castle" 12 | "2732:historic=ruins" 13 | "2733:historic=archaeological_site>archaeological" 14 | "2734:historic=wayside_cross" 15 | "2735:historic=wayside_shrine" 16 | "2736:historic=battlefield" 17 | "2737:historic=fort" 18 | "2741:tourism=picnic_site" 19 | "2742:tourism=viewpoint" 20 | "2743:tourism=zoo" 21 | "2744:tourism=theme_park" 22 | ) 23 | 24 | for layer in "${LAYER[@]}" 25 | do 26 | CODE="${layer%%:*}" 27 | KVF="${layer##*:}" 28 | K="${KVF%%=*}" 29 | VF="${KVF##*=}" 30 | V="${VF%%>*}" 31 | F="${VF##*>}" 32 | N="${F%%-*}" 33 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 34 | common_query > "../../sql/$F.sql" 35 | done 36 | 37 | CODE=2701 38 | N=tourist_info 39 | F=tourist_info 40 | #2701 41 | EXTRA_CONSTRAINTS=" 42 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tourism' AND tags.value='information') 43 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='map') 44 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='board') 45 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='guidepost')" 46 | common_query > "../../sql/$F.sql" 47 | 48 | CODE=2704 49 | N=tourist_map 50 | F=tourist_map 51 | EXTRA_CONSTRAINTS=" 52 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tourism' AND tags.value='information') 53 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='map')" 54 | common_query > "../../sql/$F.sql" 55 | 56 | CODE=2705 57 | N=tourist_board 58 | F=tourist_board 59 | EXTRA_CONSTRAINTS=" 60 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tourism' AND tags.value='information') 61 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='board')" 62 | common_query > "../../sql/$F.sql" 63 | 64 | CODE=2706 65 | N=tourist_guidepost 66 | F=tourist_guidepost 67 | EXTRA_CONSTRAINTS=" 68 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tourism' AND tags.value='information') 69 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'information' AND tags.value='guidepost')" 70 | common_query > "../../sql/$F.sql" 71 | -------------------------------------------------------------------------------- /examples/clustering/tf_idf/analyze.sql: -------------------------------------------------------------------------------- 1 | -- assign tile to lbcs category 2 | WITH similarities AS (SELECT 3 | grid.geo_id, 4 | MAX(udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec)) as max_similarity 5 | FROM 6 | `gcp-pdp-osm-dev.osm_clustering_grid_01km.vectors_tfidf` tfidf 7 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` grid USING(geo_id) 8 | CROSS JOIN 
`gcp-pdp-osm-dev.lbcs.lbcs_tfidf` lbcs 9 | WHERE grid.city_name = "Madrid" 10 | AND lbcs.dimension = 'Function' 11 | AND udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) > 0 12 | GROUP BY grid.geo_id) 13 | SELECT 14 | grid.geo_id, 15 | grid.geog, 16 | lbcs.name, 17 | lbcs.color, 18 | udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) as similarity 19 | FROM 20 | `gcp-pdp-osm-dev.osm_clustering_grid_01km.vectors_tfidf` tfidf 21 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` grid USING(geo_id) 22 | CROSS JOIN `gcp-pdp-osm-dev.lbcs.lbcs_tfidf` lbcs 23 | JOIN similarities ON similarities.geo_id = tfidf.geo_id AND similarities.max_similarity = udfs.cosine_similarity(tfidf.tfidf_vec, lbcs.tfidf_vec) 24 | WHERE grid.city_name = "Madrid" 25 | AND lbcs.dimension = 'Function' 26 | ORDER BY similarity DESC 27 | 28 | -- Selects tile terms 29 | WITH objects_with_terms AS (SELECT osm_id, geometry, term 30 | FROM `gcp-pdp-osm-dev.osm_cities.cities_objects` as objects 31 | JOIN UNNEST(SPLIT(CONCAT(layer_class, "_", layer_name), "_")) as term 32 | WHERE objects.city_name = 'Madrid') 33 | , data AS ( 34 | SELECT 35 | grid.geo_id, 36 | objects.term 37 | FROM 38 | objects_with_terms AS objects, 39 | `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` as grid 40 | WHERE ST_INTERSECTS(grid.geog, objects.geometry) 41 | ) 42 | , counts AS (SELECT 43 | geo_id, 44 | term, 45 | COUNT(term) OVER(partition by CONCAT(geo_id, term)) as term_count, 46 | COUNT(term) OVER(partition by geo_id) as terms_in_cell 47 | FROM data) 48 | , tf AS (SELECT geo_id, term, ANY_VALUE(term_count)/ANY_VALUE(terms_in_cell) as tf 49 | FROM counts 50 | GROUP BY geo_id, term) 51 | , term_in_cells AS ( 52 | SELECT term, COUNT(DISTINCT geo_id) in_cells 53 | FROM data 54 | GROUP BY 1 55 | ) 56 | , total_cells AS ( 57 | SELECT COUNT(DISTINCT geo_id) total_cells 58 | FROM data 59 | ) 60 | , idf AS ( 61 | SELECT term, LOG(total_cells.total_cells/in_cells) idf 62 | FROM term_in_cells 63 | CROSS JOIN total_cells 64 | ) 65 | , tf_idf AS ( 66 | SELECT 67 | geo_id, 68 | term, 69 | tf.tf * idf.idf tfidf, 70 | CONCAT(term, ': ', CAST(tf.tf * idf.idf AS STRING)) as term_and_tfidf 71 | FROM tf 72 | JOIN idf 73 | USING(term) 74 | ORDER BY tfidf DESC 75 | ) 76 | SELECT 77 | geo_id, 78 | ANY_VALUE(grid.geog) as geog, 79 | ARRAY_TO_STRING(ARRAY_AGG(term_and_tfidf ORDER BY tfidf DESC), ',
') as terms
80 | FROM tf_idf
81 | JOIN `gcp-pdp-osm-dev.osm_cities.cities_population_grid_01km` as grid USING(geo_id)
82 | WHERE grid.city_name = "Madrid"
83 | GROUP BY geo_id
--------------------------------------------------------------------------------
/tasks_docker_images/osm_to_features/src/osm2geojsoncsv:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build BQ GeoJSON dataset from OSM dump file
3 | # The driver will categorize features into 5 layers:
4 | 
5 | # points : “node” features that have significant tags attached.
6 | # lines : “way” features that are recognized as non-area.
7 | # multilinestrings : “relation” features that form a multilinestring
8 | #   (type = ‘multilinestring’ or type = ‘route’).
9 | # multipolygons : “relation” features that form a multipolygon
10 | #   (type = ‘multipolygon’ or type = ‘boundary’), and “way” features that
11 | #   are recognized as area.
12 | # other_relations : “relation” features that do not belong to the multilinestrings or multipolygons layers.
13 | # Note: recent GDAL releases do not require the option "OGR_INTERLEAVED_READING=YES"
14 | # Use as:
15 | # time sh osm2geojsoncsv germany-latest.osm.pbf germany-latest lines,multipolygons
16 | set -e
17 | 
18 | # use custom GDAL configuration
19 | OGRCONFIG=osmconf.ini
20 | 
21 | 
22 | if [ "$#" -ne 3 ]
23 | then
24 |     echo "Use as: $0 INPUT_FILENAME_OSM_PBF OUTPUT_BASENAME LAYERS"
25 |     exit 1
26 | fi
27 | 
28 | # input file name
29 | OSMNAME="$1"
30 | # output file basename (without extension)
31 | NAME="$2"
32 | LAYERS="$3"
33 | 
34 | # check input file exists
35 | if [ ! -f "$OSMNAME" ]
36 | then
37 |     echo "Input file '$1' doesn't exist"
38 |     exit 1
39 | fi
40 | # check input file is readable
41 | if [ ! -r "$OSMNAME" ]
42 | then
43 |     echo "Input file '$1' is not readable"
44 |     exit 1
45 | fi
46 | if [ ! 
-s "$OSMNAME" ] 47 | then 48 | echo "Input file '$1' is empty" 49 | exit 1 50 | fi 51 | BASENAME=$(basename "$OSMNAME") 52 | if [ $(basename "$BASENAME" .pbf) = "$BASENAME" ] 53 | then 54 | echo "Input file '$1' is not PBF Format ('Protocolbuffer Binary Format') file" 55 | exit 1 56 | fi 57 | 58 | # the option below can be helpful for some hardware configurations: 59 | # --config OSM_COMPRESS_NODES YES 60 | # GDAL_CACHEMAX and OSM_MAX_TMPFILE_SIZE defined in MB 61 | # for GDAL_CACHEMAX=4000 and OSM_MAX_TMPFILE_SIZE=4000 recommended RAM=60GB 62 | for ogrtype in $(echo $LAYERS | sed "s/,/ /g") 63 | do 64 | if [ "$ogrtype" = "multipolygons" ] 65 | then 66 | osm_fields="osm_id,osm_way_id,osm_version,osm_timestamp" 67 | else 68 | osm_fields="osm_id,NULL AS osm_way_id,osm_version,osm_timestamp" 69 | fi 70 | echo "Processing ${ogrtype} with OSM fields ${osm_fields}" 71 | 72 | ogr2ogr \ 73 | -skipfailures \ 74 | -f CSV \ 75 | "${NAME}-${ogrtype}.geojson.csv" "${OSMNAME}" \ 76 | --config OSM_CONFIG_FILE "${OGRCONFIG}" \ 77 | --config OGR_INTERLEAVED_READING YES \ 78 | --config GDAL_CACHEMAX 20000 \ 79 | --config OSM_MAX_TMPFILE_SIZE 100000 \ 80 | -dialect sqlite \ 81 | -sql "select AsGeoJSON(geometry) AS geometry, ${osm_fields}, replace(all_tags,X'0A','') as all_tags from ${ogrtype} where ST_IsValid(geometry) = 1" \ 82 | --debug on \ 83 | 2>"${NAME}-${ogrtype}.debug.log" 84 | done 85 | echo "Complete" 86 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_nodes_ways_relations/src/osm_dtos.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from datetime import datetime 3 | 4 | from osmium.osm._osm import Node 5 | from osmium.osm._osm import Way 6 | from osmium.osm._osm import Relation 7 | from osmium.osm._osm import OSMObject 8 | from osmium.osm._osm import RelationMember 9 | 10 | @dataclass 11 | class OsmObjectDTO(object): 12 | id: int 13 | version: int 14 | username: str 15 | changeset: int 16 | visible: bool 17 | timestamp: int 18 | tags = [] 19 | 20 | def __init__(self, osm_entity: OSMObject): 21 | self.id = osm_entity.id 22 | self.version = osm_entity.version 23 | self.username = osm_entity.user 24 | self.changeset = osm_entity.changeset 25 | self.visible = osm_entity.visible 26 | self.timestamp = int(datetime.timestamp(osm_entity.timestamp)) 27 | self.tags = [(tag.k, tag.v) for tag in osm_entity.tags] 28 | 29 | def __dict__(self): 30 | tags_dict = [{"key": tag[0], "value": tag[1]} for tag in self.tags] 31 | return {"id": self.id, "version": self.version, "username": self.username, "changeset": self.changeset, 32 | "visible": self.visible, "osm_timestamp": self.timestamp, "all_tags": tags_dict} 33 | 34 | 35 | @dataclass 36 | class NodeDTO(OsmObjectDTO): 37 | latitude: float 38 | longitude: float 39 | 40 | def __init__(self, node_entity: Node): 41 | OsmObjectDTO.__init__(self, node_entity) 42 | self.latitude = node_entity.location.lat 43 | self.longitude = node_entity.location.lon 44 | 45 | def __dict__(self): 46 | dict_repr = super(NodeDTO, self).__dict__() 47 | dict_repr["latitude"] = self.latitude 48 | dict_repr["longitude"] = self.longitude 49 | return dict_repr 50 | 51 | 52 | @dataclass 53 | class WayDTO(OsmObjectDTO): 54 | nodes: list 55 | 56 | def __init__(self, way_entity: Way): 57 | OsmObjectDTO.__init__(self, way_entity) 58 | self.nodes = [node.ref for node in way_entity.nodes] 59 | 60 | def __dict__(self): 61 | dict_repr = super(WayDTO, 
self).__dict__() 62 | dict_repr["nodes"] = [{"id": node} for node in self.nodes] 63 | return dict_repr 64 | 65 | 66 | @dataclass 67 | class RelationDTO(OsmObjectDTO): 68 | members: list 69 | 70 | def __init__(self, relation_entity: Relation): 71 | OsmObjectDTO.__init__(self, relation_entity) 72 | self.members = [RelationMemberDTO(member) for member in iter(relation_entity.members)] 73 | 74 | def __dict__(self): 75 | dict_repr = super(RelationDTO, self).__dict__() 76 | dict_repr["members"] = [member.__dict__() for member in self.members] 77 | return dict_repr 78 | 79 | 80 | @dataclass 81 | class RelationMemberDTO(object): 82 | type: str 83 | id: int 84 | role: str 85 | 86 | def __init__(self, relation_entity: RelationMember): 87 | self.type = relation_entity.type 88 | self.id = relation_entity.ref 89 | self.role = relation_entity.role 90 | 91 | def __dict__(self): 92 | return {"type": self.type, "id": self.id, "role": self.role} -------------------------------------------------------------------------------- /examples/clustering/bq_udf/geohash.js: -------------------------------------------------------------------------------- 1 | // geohash.js 2 | // Geohash library for Javascript 3 | // (c) 2008 David Troy 4 | // Distributed under the MIT License 5 | 6 | BITS = [16, 8, 4, 2, 1]; 7 | 8 | BASE32 = "0123456789bcdefghjkmnpqrstuvwxyz"; 9 | NEIGHBORS = { right : { even : "bc01fg45238967deuvhjyznpkmstqrwx" }, 10 | left : { even : "238967debc01fg45kmstqrwxuvhjyznp" }, 11 | top : { even : "p0r21436x8zb9dcf5h7kjnmqesgutwvy" }, 12 | bottom : { even : "14365h7k9dcfesgujnmqp0r2twvyx8zb" } }; 13 | BORDERS = { right : { even : "bcfguvyz" }, 14 | left : { even : "0145hjnp" }, 15 | top : { even : "prxz" }, 16 | bottom : { even : "028b" } }; 17 | 18 | NEIGHBORS.bottom.odd = NEIGHBORS.left.even; 19 | NEIGHBORS.top.odd = NEIGHBORS.right.even; 20 | NEIGHBORS.left.odd = NEIGHBORS.bottom.even; 21 | NEIGHBORS.right.odd = NEIGHBORS.top.even; 22 | 23 | BORDERS.bottom.odd = BORDERS.left.even; 24 | BORDERS.top.odd = BORDERS.right.even; 25 | BORDERS.left.odd = BORDERS.bottom.even; 26 | BORDERS.right.odd = BORDERS.top.even; 27 | 28 | function refine_interval(interval, cd, mask) { 29 | if (cd&mask) 30 | interval[0] = (interval[0] + interval[1])/2; 31 | else 32 | interval[1] = (interval[0] + interval[1])/2; 33 | } 34 | 35 | function calculateAdjacent(srcHash, dir) { 36 | srcHash = srcHash.toLowerCase(); 37 | var lastChr = srcHash.charAt(srcHash.length-1); 38 | var type = (srcHash.length % 2) ? 
'odd' : 'even';
39 |   var base = srcHash.substring(0,srcHash.length-1);
40 |   if (BORDERS[dir][type].indexOf(lastChr)!=-1)
41 |     base = calculateAdjacent(base, dir);
42 |   return base + BASE32[NEIGHBORS[dir][type].indexOf(lastChr)];
43 | }
44 | 
45 | function decodeGeoHash(geohash) {
46 |   var is_even = 1;
47 |   var lat = []; var lon = [];
48 |   lat[0] = -90.0;  lat[1] = 90.0;
49 |   lon[0] = -180.0; lon[1] = 180.0;
50 |   lat_err = 90.0;  lon_err = 180.0;
51 | 
52 |   for (i=0; i<geohash.length; i++) {
53 |     c = geohash.charAt(i);
54 |     cd = BASE32.indexOf(c);
55 |     for (j=0; j<5; j++) {
56 |       mask = BITS[j];
57 |       if (is_even) {
58 |         lon_err /= 2;
59 |         refine_interval(lon, cd, mask);
60 |       } else {
61 |         lat_err /= 2;
62 |         refine_interval(lat, cd, mask);
63 |       }
64 |       is_even = !is_even;
65 |     }
66 |   }
67 |   lat[2] = (lat[0] + lat[1])/2;
68 |   lon[2] = (lon[0] + lon[1])/2;
69 | 
70 |   return { latitude: lat, longitude: lon};
71 | }
72 | 
73 | function encodeGeoHash(latitude, longitude) {
74 |   var is_even=1;
75 |   var i=0;
76 |   var lat = []; var lon = [];
77 |   var bit=0;
78 |   var ch=0;
79 |   var precision = 12;
80 |   geohash = "";
81 | 
82 |   lat[0] = -90.0;  lat[1] = 90.0;
83 |   lon[0] = -180.0; lon[1] = 180.0;
84 | 
85 |   while (geohash.length < precision) {
86 |     if (is_even) {
87 |       mid = (lon[0] + lon[1]) / 2;
88 |       if (longitude > mid) {
89 |         ch |= BITS[bit];
90 |         lon[0] = mid;
91 |       } else
92 |         lon[1] = mid;
93 |     } else {
94 |       mid = (lat[0] + lat[1]) / 2;
95 |       if (latitude > mid) {
96 |         ch |= BITS[bit];
97 |         lat[0] = mid;
98 |       } else
99 |         lat[1] = mid;
100 |     }
101 | 
102 |     is_even = !is_even;
103 |     if (bit < 4)
104 |       bit++;
105 |     else {
106 |       geohash += BASE32[ch];
107 |       bit = 0;
108 |       ch = 0;
109 |     }
110 |   }
111 |   return geohash;
112 | }
113 | 
--------------------------------------------------------------------------------
/tasks_docker_images/generate_layers/src/layered_gis/poi_miscpoi/poi_miscpoi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | source ../query_templates.sh
3 | 
4 | CLASS=poi_miscpoi
5 | LAYER=(
6 |   "2901:amenity=toilets>toilet"
7 |   "2902:amenity=bench"
8 |   "2903:amenity=drinking_water"
9 |   "2904:amenity=fountain"
10 |   "2905:amenity=hunting_stand"
11 |   "2906:amenity=waste_basket"
12 |   "2907:man_made=surveillance>camera_surveillance"
13 |   "2923:highway=emergency_access_point>emergency_access"
14 |   "2952:man_made=water_tower"
15 |   "2954:man_made=windmill"
16 |   "2955:man_made=lighthouse"
17 |   "2961:man_made=wastewater_plant"
18 |   "2962:man_made=water_well"
19 |   "2963:man_made=watermill>water_mill"
20 |   "2964:man_made=water_works"
21 | )
22 | 
23 | for layer in "${LAYER[@]}"
24 | do
25 |   CODE="${layer%%:*}"
26 |   KVF="${layer##*:}"
27 |   K="${KVF%%=*}"
28 |   VF="${KVF##*=}"
29 |   V="${VF%%>*}"
30 |   F="${VF##*>}"
31 |   N="${F%%-*}"
32 |   EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')"
33 |   common_query > "../../sql/$F.sql"
34 | done
35 | 
36 | CODE=2950
37 | N=tower
38 | F=tower
39 | EXTRA_CONSTRAINTS="
40 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='tower')
41 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tower:type' AND tags.value='communication')
42 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='water_tower')
43 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tower:type' AND tags.value='observation')
44 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='windmill')
45 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='lighthouse')"
46 | common_query > "../../sql/$F.sql"
47 | 
48 | CODE=2951
49 | N=tower_comms
50 | F=tower_comms
51 | EXTRA_CONSTRAINTS="
52 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='tower')
53 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'tower:type' AND tags.value='communication')"
54 | common_query > "../../sql/$F.sql"
55 | 
56 | CODE=2953
57 | N=tower_observation
58 | F=tower_observation
59 | EXTRA_CONSTRAINTS="
60 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'man_made' AND tags.value='tower')
61 | AND EXISTS(SELECT 1 FROM 
UNNEST(osm.all_tags) as tags WHERE tags.key = 'tower:type' AND tags.value='observation')" 62 | common_query > "../../sql/$F.sql" 63 | 64 | 65 | CODE=2921 66 | N=emergency_phone 67 | F=emergency_phone 68 | EXTRA_CONSTRAINTS=" 69 | AND ( 70 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='emergency_phone') 71 | OR 72 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'emergency' AND tags.value='phone') 73 | )" 74 | common_query > "../../sql/$F.sql" 75 | 76 | CODE=2922 77 | N=fire_hydrant 78 | F=fire_hydrant 79 | EXTRA_CONSTRAINTS=" 80 | AND ( 81 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'amenity' AND tags.value='fire_hydrant') 82 | OR 83 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = 'emergency' AND tags.value='fire_hydrant') 84 | )" 85 | common_query > "../../sql/$F.sql" 86 | -------------------------------------------------------------------------------- /triggering/trigger_osm_to_big_query_dg_gcf/main.py: -------------------------------------------------------------------------------- 1 | from google.auth.transport.requests import Request 2 | from google.oauth2 import id_token 3 | import requests 4 | import os 5 | 6 | 7 | IAM_SCOPE = 'https://www.googleapis.com/auth/iam' 8 | OAUTH_TOKEN_URI = 'https://www.googleapis.com/oauth2/v4/token' 9 | 10 | 11 | def trigger_dag(data, context=None): 12 | """Makes a POST request to the Composer DAG Trigger API 13 | When called via Google Cloud Functions (GCF), 14 | data and context are Background function parameters. 15 | For more info, refer to 16 | https://cloud.google.com/functions/docs/writing/background#functions_background_parameters-python 17 | To call this function from a Python script, omit the ``context`` argument 18 | and pass in a non-null value for the ``data`` argument. 19 | """ 20 | 21 | # Fill in with your Composer info here 22 | # Navigate to your webserver's login page and get this from the URL 23 | # Or use the script found at 24 | # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/composer/rest/get_client_id.py 25 | client_id = os.getenv("COMPOSER_CLIENT_ID") 26 | # This should be part of your webserver's URL: 27 | # {tenant-project-id}.appspot.com 28 | webserver_id = os.getenv("COMPOSER_WEBSERVER_ID") 29 | # The name of the DAG you wish to trigger 30 | dag_name = os.getenv("DAG_NAME") 31 | webserver_url = ( 32 | 'https://' 33 | + webserver_id 34 | + '.appspot.com/api/experimental/dags/' 35 | + dag_name 36 | + '/dag_runs' 37 | ) 38 | # Make a POST request to IAP which then Triggers the DAG 39 | make_iap_request( 40 | webserver_url, client_id, method='POST', json={"conf": data}) 41 | 42 | 43 | # This code is copied from 44 | # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/iap/make_iap_request.py 45 | # START COPIED IAP CODE 46 | def make_iap_request(url, client_id, method='GET', **kwargs): 47 | """Makes a request to an application protected by Identity-Aware Proxy. 48 | Args: 49 | url: The Identity-Aware Proxy-protected URL to fetch. 50 | client_id: The client ID used by Identity-Aware Proxy. 51 | method: The request method to use 52 | ('GET', 'OPTIONS', 'HEAD', 'POST', 'PUT', 'PATCH', 'DELETE') 53 | **kwargs: Any of the parameters defined for the request function: 54 | https://github.com/requests/requests/blob/master/requests/api.py 55 | If no timeout is provided, it is set to 90 by default.
56 | Returns: 57 | The page body, or raises an exception if the page couldn't be retrieved. 58 | """ 59 | # Set the default timeout, if missing 60 | if 'timeout' not in kwargs: 61 | kwargs['timeout'] = 90 62 | 63 | # Obtain an OpenID Connect (OIDC) token from metadata server or using service 64 | # account. 65 | google_open_id_connect_token = id_token.fetch_id_token(Request(), client_id) 66 | 67 | # Fetch the Identity-Aware Proxy-protected URL, including an 68 | # Authorization header containing "Bearer " followed by a 69 | # Google-issued OpenID Connect token for the service account. 70 | resp = requests.request( 71 | method, url, 72 | headers={'Authorization': 'Bearer {}'.format( 73 | google_open_id_connect_token)}, **kwargs) 74 | if resp.status_code == 403: 75 | raise Exception('Service account does not have permission to ' 76 | 'access the IAP-protected application.') 77 | elif resp.status_code != 200: 78 | raise Exception( 79 | 'Bad response from application: {!r} / {!r} / {!r}'.format( 80 | resp.status_code, resp.headers, resp.text)) 81 | else: 82 | return resp.text -------------------------------------------------------------------------------- /deployment/config/generate_config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import json 4 | 5 | if __name__ == '__main__': 6 | logging.getLogger().setLevel(logging.INFO) 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('config_file', help='Config file to save parameters') 10 | 11 | parser.add_argument('--project_id', help='Your Project ID.', required=True) 12 | 13 | parser.add_argument('--zone', help='Location zone', 14 | required=True) 15 | 16 | parser.add_argument('--osm_url', help='URL of the source OSM file', required=True) 17 | parser.add_argument('--osm_md5_url', help='URL of the source OSM file\'s MD5 hash', required=True) 18 | 19 | parser.add_argument('--gcs_transfer_bucket', help='GCS bucket to make transferring source file to project\'s GCS', 20 | required=True) 21 | parser.add_argument('--transfer_index_files_gcs_uri', help='GCS URI to Storage Transfer index file', 22 | required=True) 23 | 24 | parser.add_argument('--gcs_work_bucket', help='GCS bucket to save intermediate results', required=True) 25 | 26 | parser.add_argument('--osm_to_features_image', help='osm_to_features image name', required=True) 27 | parser.add_argument('--osm_to_nodes_ways_relations_image', help='osm_to_nodes_ways_relations image name', 28 | required=True) 29 | parser.add_argument('--generate_layers_image', help='generate_layers image name', required=True) 30 | parser.add_argument('--osm_converter_with_history_index_image', 31 | help='osm_converter_with_history_index_image image name', required=True) 32 | 33 | parser.add_argument('--gke_main_cluster_name', help='Name of the main GKE cluster', 34 | required=True) 35 | parser.add_argument('--addt_sn_gke_pool', help='GKE pool name for additional operations (single node pool)', 36 | required=True) 37 | parser.add_argument('--addt_sn_pool_machine_type', 38 | help='Machine type for additional operations GKE pool (single node pool)', 39 | required=True) 40 | parser.add_argument('--addt_sn_pool_disk_size', 41 | help='Disk size for additional operations GKE pool (single node pool)', 42 | required=True) 43 | parser.add_argument('--addt_sn_pool_num_nodes', 44 | help='Number of nodes for additional operations GKE pool (single node pool)', 45 | required=True) 46 | parser.add_argument('--addt_sn_pool_max_num_treads', 
help='Maximum number of threads for addt_sn_gke_pool', 47 | required=True) 48 | 49 | parser.add_argument('--addt_mn_gke_pool', help='GKE pool name for additional operations (multiple nodes pool)', 50 | required=True) 51 | parser.add_argument('--addt_mn_pool_machine_type', 52 | help='Machine type for additional operations GKE pool (multiple nodes pool)', 53 | required=True) 54 | parser.add_argument('--addt_mn_pool_disk_size', 55 | help='Disk size for additional operations GKE pool (multiple nodes pool)', 56 | required=True) 57 | parser.add_argument('--addt_mn_pool_num_nodes', 58 | help='Number of nodes for additional operations GKE pool (multiple nodes pool)', 59 | required=True) 60 | parser.add_argument('--addt_mn_pod_requested_memory', help='addt_mn GKE POD requested memory', required=True) 61 | 62 | parser.add_argument('--bq_dataset_to_export', help='BigQuery dataset name to export results', required=True) 63 | 64 | args = parser.parse_args() 65 | args_filtered = {} 66 | for k, v in vars(args).items(): 67 | if v: 68 | print(v) 69 | args_filtered[k] = v 70 | 71 | with open(args.config_file, 'w') as fp: 72 | json.dump(args_filtered, fp, indent=4) 73 | -------------------------------------------------------------------------------- /tasks_docker_images/generate_layers/src/layered_gis/place/place.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ../query_templates.sh 3 | 4 | CLASS=place 5 | LAYER=( 6 | "1001:place=city" 7 | "1002:place=town" 8 | "1003:place=village" 9 | "1004:place=hamlet" 10 | "1010:place=suburb" 11 | "1020:place=island" 12 | "1030:place=farm" 13 | "1031:place=isolated_dwelling>dwelling" 14 | "1040:place=region" 15 | "1041:place=county" 16 | "1050:place=locality" 17 | ) 18 | 19 | for layer in "${LAYER[@]}" 20 | do 21 | CODE="${layer%%:*}" 22 | KVF="${layer##*:}" 23 | K="${KVF%%=*}" 24 | VF="${KVF##*=}" 25 | V="${VF%%>*}" 26 | F="${VF##*>}" 27 | N="${F%%-*}" 28 | 29 | EXTRA_CONSTRAINTS="AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) as tags WHERE tags.key = '$K' AND tags.value='$V')" 30 | common_query > "../../sql/$F.sql" 31 | done 32 | 33 | #1005 34 | CODE=1005 35 | N=national_capital 36 | F=national_capital 37 | EXTRA_CONSTRAINTS=" 38 | AND ( 39 | ( 40 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 41 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'is_capital' AND tags.value='country') 42 | ) 43 | OR 44 | ( 45 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 46 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'admin_level' AND tags.value = '2') 47 | ) 48 | OR 49 | ( 50 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 51 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'capital' AND tags.value='yes') AND 52 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'admin_level') 53 | ) 54 | ) 55 | " 56 | common_query > "../../sql/$F.sql" 57 | 58 | CODE=1099 59 | N=named_place 60 | F=named_place 61 | EXTRA_CONSTRAINTS=" 62 | AND EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'area' AND tags.value='yes') 63 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') 64 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='town') 65 | AND NOT EXISTS(SELECT 1 FROM 
UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='village') 66 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='hamlet') 67 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='suburb') 68 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='island') 69 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='farm') 70 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='isolated_dwelling') 71 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='region') 72 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='county') 73 | AND NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='locality') 74 | AND ( 75 | ( 76 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 77 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'is_capital' AND tags.value='country') 78 | ) 79 | OR 80 | ( 81 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 82 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'admin_level' AND tags.value = '2') 83 | ) 84 | OR ( 85 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'place' AND tags.value='city') AND 86 | NOT EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'capital' AND tags.value='yes') AND 87 | EXISTS(SELECT 1 FROM UNNEST(osm.all_tags) AS tags WHERE tags.key = 'admin_level') 88 | ) 89 | ) 90 | " 91 | common_query > "../../sql/$F.sql" 92 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/elements_transformer.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import osmium 3 | 4 | def osm_timestamp_from_osm_entity(osm_entity): 5 | return int(datetime.timestamp(osm_entity.timestamp)) 6 | 7 | def osm_obj_to_dict(osm_entity, geometry, is_simplified, with_uid, tags_to_bq, osm_timestamp): 8 | base_dict = { 9 | "id": osm_entity.id, 10 | "version": osm_entity.version, 11 | "osm_timestamp": osm_timestamp if osm_timestamp else osm_timestamp_from_osm_entity(osm_entity) 12 | } 13 | if not is_simplified: 14 | base_dict["username"] = osm_entity.user 15 | base_dict["changeset"] = osm_entity.changeset 16 | base_dict["visible"] = osm_entity.visible 17 | base_dict["geometry"] = geometry 18 | base_dict["all_tags"] = [{"key": tag.k, "value": tag.v} for tag in osm_entity.tags] \ 19 | if tags_to_bq else [(tag.k, tag.v) for tag in osm_entity.tags] 20 | if with_uid: 21 | base_dict["uid"] = osm_entity.uid 22 | return base_dict 23 | 24 | 25 | def osm_entity_node_dict(osm_node_entity, 26 | geometry=None, 27 | is_simplified=False, 28 | with_uid=False, 29 | tags_to_bq=True, 30 | osm_timestamp=None): 31 | base_dict = osm_obj_to_dict(osm_node_entity, geometry, is_simplified, with_uid, tags_to_bq, osm_timestamp) 32 | if osm_node_entity.location.valid(): 33 | base_dict.update({ 34 | "latitude": osm_node_entity.location.lat, 35 | "longitude": osm_node_entity.location.lon 36 | }) 37 | return base_dict 38 | 39 | 40 | def osm_entity_way_dict(osm_way_entity, 
geometry=None, is_simplified=False, with_uid=False, tags_to_bq=True, 41 | osm_timestamp=None): 42 | base_dict = osm_obj_to_dict(osm_way_entity, geometry, is_simplified, with_uid, tags_to_bq, osm_timestamp) 43 | base_dict["nodes"] = [node.ref for node in osm_way_entity.nodes] 44 | return base_dict 45 | 46 | 47 | def osm_entity_relation_dict(osm_relation_entity, geometry=None, is_simplified=False, with_uid=False, tags_to_bq=True, 48 | osm_timestamp=None): 49 | base_dict = osm_obj_to_dict(osm_relation_entity, geometry, is_simplified, with_uid, tags_to_bq, osm_timestamp) 50 | base_dict["members"] = [(member.type, member.ref, member.role) for member in iter(osm_relation_entity.members)] 51 | return base_dict 52 | 53 | 54 | def get_osm_obj_from_dict(obj_dict): 55 | return osmium.osm.mutable.OSMObject(id=obj_dict["id"], 56 | version=obj_dict["version"], 57 | visible=obj_dict["visible"] if "visible" in obj_dict else None, 58 | changeset=obj_dict["changeset"] if "changeset" in obj_dict else None, 59 | timestamp=datetime.fromtimestamp(obj_dict["osm_timestamp"]), 60 | uid=obj_dict["uid"] if "uid" in obj_dict else None, 61 | tags=obj_dict["all_tags"] if "all_tags" in obj_dict else None 62 | ) 63 | 64 | 65 | def get_osm_node_from_dict(node_dict): 66 | lon = node_dict["longitude"] 67 | lat = node_dict["latitude"] 68 | location_tuple = (lon, lat) if lon is not None and lat is not None else None 69 | return osmium.osm.mutable.Node(get_osm_obj_from_dict(node_dict), location_tuple) 70 | 71 | 72 | def get_osm_way_from_dict(way_dict): 73 | return osmium.osm.mutable.Way(get_osm_obj_from_dict(way_dict), way_dict["nodes"]) 74 | 75 | 76 | def get_osm_relation_from_dict(relation_dict): 77 | return osmium.osm.mutable.Relation(get_osm_obj_from_dict(relation_dict), relation_dict["members"]) 78 | 79 | 80 | def edit_osm_obj_dict_according_to_bq_schema(obj_dict): 81 | obj_dict["all_tags"] = [{"key": tag_key, "value": tag_value} for tag_key, tag_value in obj_dict["all_tags"]] 82 | return obj_dict 83 | 84 | 85 | def edit_node_dict_according_to_bq_schema(node_dict): 86 | return edit_osm_obj_dict_according_to_bq_schema(node_dict) 87 | 88 | 89 | def edit_way_dict_according_to_bq_schema(way_dict): 90 | way_dict = edit_osm_obj_dict_according_to_bq_schema(way_dict) 91 | way_dict["nodes"] = [{"id": node_id} for node_id in way_dict["nodes"]] 92 | return way_dict 93 | 94 | 95 | def edit_relation_dict_according_to_bq_schema(relation_dict): 96 | relation_dict = edit_osm_obj_dict_according_to_bq_schema(relation_dict) 97 | relation_dict["members"] = [{"type": member_type, "id": member_ref, "role": member_role} 98 | for member_type, member_ref, member_role in relation_dict["members"]] 99 | return relation_dict 100 | 101 | 102 | def is_node_dict_with_location(node_dict): 103 | return node_dict["longitude"] is not None and node_dict["latitude"] is not None 104 | 105 | 106 | def get_way_nodes(way_dict): 107 | return way_dict["nodes"] 108 | 109 | 110 | def get_relation_members(relation_dict): 111 | return relation_dict["members"] 112 | -------------------------------------------------------------------------------- /examples/clustering/cities/README.md: -------------------------------------------------------------------------------- 1 | # Cities 2 | 3 | ## List of Cities 4 | 5 | The `cities.csv` file contains a list of cities from the [Globalization and World Cities Research Network](https://en.wikipedia.org/wiki/Globalization_and_World_Cities_Research_Network) wiki page.
6 | Each city row contains the lat/long and radius of a manually defined circle that approximately covers the city's infrastructure and agglomeration. 7 | 8 | The `query.py` script can be used to transform the CSV into SQL with the cities data. 9 | In our example, the result of the query is saved into the `osm_cities.cities` table using the BigQuery console. 10 | 11 | ## OSM Objects within cities 12 | 13 | Query to select all objects within a city's circle area. 14 | 15 | ``` 16 | SELECT 17 | cities.city_name, 18 | planet.* 19 | FROM 20 | `bigquery-public-data.geo_openstreetmap.planet_layers` as planet, `gcp-pdp-osm-dev.osm_cities.cities` as cities 21 | WHERE ST_DWITHIN(cities.center, planet.geometry, cities.radius) 22 | ``` 23 | 24 | Result is saved in `osm_cities.cities_objects` in order to reduce scanning overhead in the next stages of analysis. 25 | 26 | ## Population grid within cities 27 | 28 | ### 1km resolution 29 | 30 | Query to select [Worldpop](https://www.worldpop.org/) population grid cells within a city's circle area. 31 | ``` 32 | SELECT 33 | cities.city_name, 34 | grid.* 35 | FROM `bigquery-public-data.worldpop.population_grid_1km` AS grid, 36 | gcp-pdp-osm-dev.osm_cities.cities AS cities 37 | WHERE last_updated = '2020-01-01' 38 | AND ST_DWITHIN(cities.center, grid.geog, cities.radius) 39 | ``` 40 | 41 | Result is saved in `osm_cities.cities_population_grid_1km` in order to reduce scanning overhead in the next stages of analysis. 42 | 43 | ### 0.5km resolution 44 | 45 | Query to divide the 1km resolution grid into quadrants (a Python sketch of the subdivision arithmetic follows this section) 46 | ``` 47 | WITH divided_grid AS (SELECT 48 | long1 + x*(long2 - long1)/2 as long1, 49 | lat1 + y*(lat2 - lat1)/2 as lat1, 50 | long1 + (x + 1)*(long2 - long1)/2 as long2, 51 | lat1 + (y + 1)*(lat2 - lat1)/2 as lat2, 52 | city_name, 53 | country_name, 54 | geo_id, 55 | population, 56 | alpha_3_code, 57 | last_updated 58 | FROM ( 59 | WITH quadrants AS 60 | (SELECT 0 as x, 0 as y UNION ALL 61 | SELECT 1, 0 UNION ALL 62 | SELECT 0, 1 UNION ALL 63 | SELECT 1, 1) 64 | SELECT 65 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][0][0]') AS FLOAT64) as long1, 66 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][0][1]') AS FLOAT64) as lat1, 67 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][2][0]') AS FLOAT64) as long2, 68 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][2][1]') AS FLOAT64) as lat2, 69 | quadrants.x, 70 | quadrants.y, 71 | city_name, 72 | country_name, 73 | CONCAT(geo_id,x,y) as geo_id, 74 | population/4 as population, 75 | alpha_3_code, 76 | last_updated 77 | FROM `osm_cities.cities_population_grid_1km` 78 | CROSS JOIN quadrants 79 | )) 80 | SELECT 81 | city_name, 82 | country_name, 83 | geo_id, 84 | population, 85 | (long1 + long2) / 2 as longitude_centroid, 86 | (lat1 + lat2) / 2 as latitude_centroid, 87 | alpha_3_code, 88 | ST_MAKEPOLYGON(ST_MAKELINE([ 89 | ST_MAKELINE(ST_GEOGPOINT(long1, lat1), ST_GEOGPOINT(long1, lat2)), 90 | ST_MAKELINE(ST_GEOGPOINT(long1, lat2), ST_GEOGPOINT(long2, lat2)), 91 | ST_MAKELINE(ST_GEOGPOINT(long2, lat2), ST_GEOGPOINT(long2, lat1)), 92 | ST_MAKELINE(ST_GEOGPOINT(long2, lat1), ST_GEOGPOINT(long1, lat1)) 93 | ])) as geog, 94 | last_updated 95 | FROM divided_grid 96 | ``` 97 | Result is saved in `osm_cities.cities_population_grid_05km`.
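For reference, the quadrant arithmetic used by the query above can be expressed in a few lines of Python. This is a minimal illustrative sketch only (the `split_cell` helper is hypothetical and not part of this repository):

```
# Hypothetical sketch of the grid-subdivision arithmetic from the SQL above.
# A cell with corners (long1, lat1) and (long2, lat2) is split into four
# quadrants; each quadrant inherits a quarter of the parent's population.
def split_cell(long1, lat1, long2, lat2, population):
    for x in (0, 1):
        for y in (0, 1):
            yield (
                long1 + x * (long2 - long1) / 2,        # quadrant long1
                lat1 + y * (lat2 - lat1) / 2,           # quadrant lat1
                long1 + (x + 1) * (long2 - long1) / 2,  # quadrant long2
                lat1 + (y + 1) * (lat2 - lat1) / 2,     # quadrant lat2
                population / 4,
            )

# One 1km cell becomes four 0.5km cells:
for quadrant in split_cell(0.00, 0.00, 0.01, 0.01, 400):
    print(quadrant)
```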
98 | 99 | ### 0.25km resolution 100 | 101 | Query to divide the 0.5km resolution grid in the same way 102 | ``` 103 | WITH divided_grid AS (SELECT 104 | long1 + x*(long2 - long1)/2 as long1, 105 | lat1 + y*(lat2 - lat1)/2 as lat1, 106 | long1 + (x + 1)*(long2 - long1)/2 as long2, 107 | lat1 + (y + 1)*(lat2 - lat1)/2 as lat2, 108 | city_name, 109 | country_name, 110 | geo_id, 111 | population, 112 | alpha_3_code, 113 | last_updated 114 | FROM ( 115 | WITH quadrants AS 116 | (SELECT 0 as x, 0 as y UNION ALL 117 | SELECT 1, 0 UNION ALL 118 | SELECT 0, 1 UNION ALL 119 | SELECT 1, 1) 120 | SELECT 121 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][0][0]') AS FLOAT64) as long1, 122 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][0][1]') AS FLOAT64) as lat1, 123 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][2][0]') AS FLOAT64) as long2, 124 | CAST(JSON_EXTRACT(ST_ASGEOJSON(geog), '$.coordinates[0][2][1]') AS FLOAT64) as lat2, 125 | quadrants.x, 126 | quadrants.y, 127 | city_name, 128 | country_name, 129 | CONCAT(geo_id,x,y) as geo_id, 130 | population/4 as population, 131 | alpha_3_code, 132 | last_updated 133 | FROM `osm_cities.cities_population_grid_05km` 134 | CROSS JOIN quadrants 135 | )) 136 | SELECT 137 | city_name, 138 | country_name, 139 | geo_id, 140 | population, 141 | (long1 + long2) / 2 as longitude_centroid, 142 | (lat1 + lat2) / 2 as latitude_centroid, 143 | alpha_3_code, 144 | ST_MAKEPOLYGON(ST_MAKELINE([ 145 | ST_MAKELINE(ST_GEOGPOINT(long1, lat1), ST_GEOGPOINT(long1, lat2)), 146 | ST_MAKELINE(ST_GEOGPOINT(long1, lat2), ST_GEOGPOINT(long2, lat2)), 147 | ST_MAKELINE(ST_GEOGPOINT(long2, lat2), ST_GEOGPOINT(long2, lat1)), 148 | ST_MAKELINE(ST_GEOGPOINT(long2, lat1), ST_GEOGPOINT(long1, lat1)) 149 | ])) as geog, 150 | last_updated 151 | FROM divided_grid 152 | ``` 153 | Result is saved in `osm_cities.cities_population_grid_025km`.
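The queries above were materialized manually from the BigQuery console, but the same materialization can be scripted. A minimal sketch with the `google-cloud-bigquery` client, assuming a placeholder project name and a local file holding one of the queries:

```
# Sketch: run one of the grid-division queries and write the result to a
# destination table instead of saving it manually from the BigQuery console.
# "your-project" and "divide_grid.sql" are placeholders.
from google.cloud import bigquery

client = bigquery.Client(project="your-project")
job_config = bigquery.QueryJobConfig(
    destination="your-project.osm_cities.cities_population_grid_025km",
    write_disposition="WRITE_TRUNCATE",  # overwrite the table on re-runs
)

with open("divide_grid.sql") as sql_file:
    sql = sql_file.read()

client.query(sql, job_config=job_config).result()  # blocks until the table is written
```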
154 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/parser.py: -------------------------------------------------------------------------------- 1 | import osmium 2 | import logging 3 | import time 4 | 5 | import elements_transformer 6 | import elements_processing 7 | 8 | from xml.sax import handler 9 | import psutil 10 | 11 | 12 | def to_mb(bytes_num): 13 | return int(bytes_num / (1024 * 1024)) 14 | 15 | 16 | class OsmParser(osmium.SimpleHandler): 17 | 18 | def __init__(self, processing_counter, logging_range_count, pool_size=1, pool_index=0): 19 | osmium.SimpleHandler.__init__(self) 20 | 21 | self.processing_counter = processing_counter 22 | self.last_log_time = time.time() 23 | self.logging_range_count = logging_range_count 24 | self.current_entity_type = "" 25 | self.pool_index = pool_index 26 | self.pool_size = pool_size 27 | 28 | def current_pool_index(self): 29 | return self.processing_counter[self.current_entity_type] % self.pool_size 30 | 31 | def is_item_index_for_current_pool_index(self): 32 | return self.current_pool_index() == self.pool_index 33 | 34 | def log_processing(self): 35 | self.processing_counter[self.current_entity_type] = self.processing_counter[self.current_entity_type] + 1 36 | if self.processing_counter[self.current_entity_type] % self.logging_range_count == 0: 37 | virtual_memory = psutil.virtual_memory() 38 | logging.info(self.current_entity_type + " ({}/{}) ".format(self.pool_index + 1, self.pool_size) 39 | + str(self.processing_counter[self.current_entity_type]) 40 | + " " + str(time.time() - self.last_log_time) 41 | + " Memory: usage {}, free {} MB, used {} MB" 42 | .format(virtual_memory.percent, to_mb(virtual_memory.free), to_mb(virtual_memory.used))) 43 | self.last_log_time = time.time() 44 | 45 | def node(self, node): 46 | self.current_entity_type = "nodes" 47 | self.log_processing() 48 | 49 | def way(self, way): 50 | self.current_entity_type = "ways" 51 | self.log_processing() 52 | 53 | def relation(self, relation): 54 | self.current_entity_type = "relations" 55 | self.log_processing() 56 | 57 | 58 | class IndexCreatorWithXmlParser(handler.ContentHandler): 59 | 60 | def __init__(self, osm_indexer_map, 61 | processing_counter, num_shards, 62 | is_id_hash_partitioned_shards, 63 | pool_size=1, pool_index=0, 64 | batch_size_to_commit=1000000, 65 | logging_range_count=1000000): 66 | handler.ContentHandler.__init__(self) 67 | self.processing_counter = processing_counter 68 | self.last_log_time = time.time() 69 | self.logging_range_count = logging_range_count 70 | self.current_entity_type = "" 71 | self.pool_index = pool_index 72 | self.pool_size = pool_size 73 | 74 | self.xml_hierarchy = [] 75 | self.current_obj = {} 76 | 77 | self.xml_entity_map = {"node": "nodes", "way": "ways", "relation": "relations"} 78 | 79 | self.osm_indexer_map = osm_indexer_map 80 | self.num_shards = num_shards 81 | self.batch_size_to_commit = batch_size_to_commit 82 | self.is_id_hash_partitioned_shards = is_id_hash_partitioned_shards 83 | 84 | self.current_indexer = None 85 | 86 | def log_processing(self): 87 | self.processing_counter[self.current_entity_type] = self.processing_counter[self.current_entity_type] + 1 88 | if self.processing_counter[self.current_entity_type] % self.logging_range_count == 0: 89 | logging.info(self.current_entity_type + " ({}/{}) ".format(self.pool_index + 1, self.pool_size) 90 | + str(self.processing_counter[self.current_entity_type]) 91 | + " " + 
str(time.time() - self.last_log_time)) 92 | self.last_log_time = time.time() 93 | 94 | def startDocument(self): 95 | pass 96 | 97 | def get_current_xml_hierarchy_level(self): 98 | return self.xml_hierarchy[len(self.xml_hierarchy) - 1] 99 | 100 | def process_element(self, name, attributes): 101 | if name == "node": 102 | self.current_obj = elements_transformer.osm_entity_node_dict(attributes, 103 | is_simplified=True, 104 | is_xml_attributes=True) 105 | 106 | def startElement(self, name, attributes): 107 | self.xml_hierarchy.append(name) 108 | 109 | if name in self.xml_entity_map: 110 | self.current_entity_type = self.xml_entity_map[name] 111 | self.log_processing() 112 | 113 | if not self.is_id_hash_partitioned_shards: 114 | batch_index = self.processing_counter[self.current_entity_type] % self.num_shards 115 | else: 116 | obj_id = attributes["id"] 117 | batch_index = elements_processing.get_uniformly_shard_index_from_id(obj_id, self.num_shards) 118 | if batch_index in self.osm_indexer_map: 119 | self.process_element(name, attributes) 120 | self.current_indexer = self.osm_indexer_map[batch_index] 121 | else: 122 | self.current_indexer = None 123 | 124 | def endElement(self, name, *args): 125 | if name == "node" and self.current_indexer: 126 | self.on_node_element(self.current_obj) 127 | self.current_indexer = None 128 | del self.xml_hierarchy[-1] 129 | 130 | def characters(self, data): 131 | pass 132 | 133 | def on_node_element(self, node_dict): 134 | pass 135 | 136 | def on_way_element(self, way_dict): 137 | pass 138 | 139 | def on_relation_element(self, relation_dict): 140 | pass 141 | -------------------------------------------------------------------------------- /deployment/create_full.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 1. Read input parameters 4 | OSM_URL="$1" 5 | OSM_MD5_URL="$2" 6 | REGION_LOCATION="$3" 7 | ZONE="$4" 8 | SUFFIX="$5" 9 | 10 | BASE_COMPOSER_CLUSTER_MACHINE_TYPE="$6" 11 | BASE_COMPOSER_CLUSTER_NODES="$7" 12 | 13 | ADDT_SN_CORES="$8" 14 | ADDT_SN_DISK_SIZE="$9" 15 | 16 | ADDT_MN_CORES="${10}" 17 | ADDT_MN_DISK_SIZE="${11}" 18 | ADDT_MN_NODES="${12}" 19 | 20 | MODE="${13}" 21 | 22 | # 2. Print all parameters 23 | for PARAM in "$@"; do 24 | echo "$PARAM" 25 | done 26 | 27 | # 3. Retrieve PROJECT_ID 28 | PROJECT_ID=`gcloud config get-value project` 29 | 30 | # 4. Create GCS buckets 31 | TRANSFER_BUCKET_NAME=${PROJECT_ID}-transfer-${SUFFIX} 32 | gsutil mb gs://${TRANSFER_BUCKET_NAME}/ 33 | 34 | WORK_BUCKET_NAME=${PROJECT_ID}-work-${SUFFIX} 35 | gsutil mb gs://${WORK_BUCKET_NAME}/ 36 | 37 | # 5. Create BigQuery dataset 38 | BQ_DATASET_SHORT=osm_to_bq_${SUFFIX} 39 | BQ_DATASET=${PROJECT_ID}.${BQ_DATASET_SHORT} 40 | bq mk ${PROJECT_ID}:${BQ_DATASET_SHORT} 41 | #TODO temp 42 | #BQ_DATASET=bigquery-public-data.geo_openstreetmap 43 | 44 | # 6.
Build and push to Container Registry Docker containers 45 | IMAGE_HOSTNAME=gcr.io 46 | 47 | GENERATE_LAYERS_IMAGE=$IMAGE_HOSTNAME/$PROJECT_ID/generate_layers_${SUFFIX} 48 | docker build -t $GENERATE_LAYERS_IMAGE tasks_docker_images/generate_layers/ 49 | docker push $GENERATE_LAYERS_IMAGE 50 | 51 | if [ "$MODE" = "planet" ] 52 | then 53 | OSM_TO_FEATURES_IMAGE=$IMAGE_HOSTNAME/$PROJECT_ID/osm_to_features_${SUFFIX} 54 | docker build -t $OSM_TO_FEATURES_IMAGE tasks_docker_images/osm_to_features/ 55 | docker push $OSM_TO_FEATURES_IMAGE 56 | 57 | OSM_TO_NODES_WAYS_RELATIONS_IMAGE=$IMAGE_HOSTNAME/$PROJECT_ID/osm_to_nodes_ways_relations_${SUFFIX} 58 | docker build -t $OSM_TO_NODES_WAYS_RELATIONS_IMAGE tasks_docker_images/osm_to_nodes_ways_relations/ 59 | docker push $OSM_TO_NODES_WAYS_RELATIONS_IMAGE 60 | else 61 | OSM_CONVERTER_WITH_HISTORY_INDEX_IMAGE=$IMAGE_HOSTNAME/$PROJECT_ID/osm_converter_with_history_index_${SUFFIX} 62 | docker build -t $OSM_CONVERTER_WITH_HISTORY_INDEX_IMAGE tasks_docker_images/osm_converter_with_history_index/ 63 | docker push $OSM_CONVERTER_WITH_HISTORY_INDEX_IMAGE 64 | fi 65 | 66 | # 7. Create Cloud Composer environment 67 | COMPOSER_ENV_NAME=osm-to-bq-${SUFFIX} 68 | gcloud composer environments create $COMPOSER_ENV_NAME \ 69 | --location $REGION_LOCATION \ 70 | --zone $ZONE \ 71 | --node-count $BASE_COMPOSER_CLUSTER_NODES \ 72 | --machine-type $BASE_COMPOSER_CLUSTER_MACHINE_TYPE \ 73 | --airflow-configs=broker_transport_options-visibility_timeout=2592000 74 | 75 | # 8. Retrieve Cloud Composer environment's params 76 | GKE_CLUSTER_FULL_NAME=$(gcloud composer environments describe $COMPOSER_ENV_NAME \ 77 | --location $REGION_LOCATION --format json | jq -r '.config.gkeCluster') 78 | GKE_CLUSTER_NAME=$(echo $GKE_CLUSTER_FULL_NAME | awk -F/ '{print $6}') 79 | 80 | # 9. Define additional Kubernetes clusters parameters 81 | ADDT_SN_POOL_NUM_CORES=$ADDT_SN_CORES 82 | ADDT_SN_POOL_DISK_SIZE=$ADDT_SN_DISK_SIZE 83 | ADDT_SN_POOL_NAME=osm-addt-sn-pool-${SUFFIX} 84 | ADDT_SN_POOL_MACHINE_TYPE=n1-highmem-$ADDT_SN_POOL_NUM_CORES 85 | ADDT_SN_POOL_NUM_NODES=1 86 | ADDT_SN_POOL_MAX_NUM_TREADS=$((ADDT_SN_POOL_NUM_CORES/4)) 87 | 88 | 89 | ADDT_MN_POOL_NUM_CORES=$ADDT_MN_CORES 90 | ADDT_MN_POOL_DISK_SIZE=$ADDT_MN_DISK_SIZE 91 | ADDT_MN_POOL_NAME=osm-addt-mn-pool-${SUFFIX} 92 | ADDT_MN_POOL_MACHINE_TYPE=n1-highmem-$ADDT_MN_POOL_NUM_CORES 93 | ADDT_MN_POOL_NUM_NODES=$ADDT_MN_NODES 94 | ADDT_MN_POD_REQUESTED_MEMORY=$((ADDT_MN_POOL_NUM_CORES*4))G 95 | 96 | # 10. 
Build config file with Cloud Composer env vars 97 | CONFIG_FILE=deployment/config/config_${SUFFIX}.json 98 | python3 deployment/config/generate_config.py $CONFIG_FILE \ 99 | --project_id=$PROJECT_ID \ 100 | --zone=$ZONE \ 101 | --osm_url=$OSM_URL \ 102 | --osm_md5_url=$OSM_MD5_URL \ 103 | --gcs_transfer_bucket=$TRANSFER_BUCKET_NAME \ 104 | --gcs_work_bucket=$WORK_BUCKET_NAME \ 105 | --transfer_index_files_gcs_uri=gs://$WORK_BUCKET_NAME/gsc_transfer_index/ \ 106 | --osm_to_features_image=$OSM_TO_FEATURES_IMAGE \ 107 | --osm_to_nodes_ways_relations_image=$OSM_TO_NODES_WAYS_RELATIONS_IMAGE \ 108 | --generate_layers_image=$GENERATE_LAYERS_IMAGE \ 109 | --osm_converter_with_history_index_image=$OSM_CONVERTER_WITH_HISTORY_INDEX_IMAGE \ 110 | --gke_main_cluster_name=$GKE_CLUSTER_NAME \ 111 | --addt_sn_gke_pool=$ADDT_SN_POOL_NAME \ 112 | --addt_sn_pool_machine_type=$ADDT_SN_POOL_MACHINE_TYPE \ 113 | --addt_sn_pool_disk_size=$ADDT_SN_POOL_DISK_SIZE \ 114 | --addt_sn_pool_num_nodes=$ADDT_SN_POOL_NUM_NODES \ 115 | --addt_sn_pool_max_num_treads=$ADDT_SN_POOL_MAX_NUM_TREADS \ 116 | --addt_mn_gke_pool=$ADDT_MN_POOL_NAME \ 117 | --addt_mn_pool_machine_type=$ADDT_MN_POOL_MACHINE_TYPE \ 118 | --addt_mn_pool_disk_size=$ADDT_MN_POOL_DISK_SIZE \ 119 | --addt_mn_pool_num_nodes=$ADDT_MN_POOL_NUM_NODES \ 120 | --addt_mn_pod_requested_memory=$ADDT_MN_POD_REQUESTED_MEMORY \ 121 | --bq_dataset_to_export=$BQ_DATASET 122 | 123 | # 11. Deploy Cloud Composer env vars 124 | deployment/config/set_env_vars_from_config.sh $CONFIG_FILE $COMPOSER_ENV_NAME $REGION_LOCATION 125 | 126 | # 12. Create Cloud Function for triggering main DAG 127 | COMPOSER_CLIENT_ID=$(python3 utils/get_client_id.py $PROJECT_ID $REGION_LOCATION $COMPOSER_ENV_NAME) 128 | COMPOSER_WEBSERVER_ID=$(gcloud composer environments describe $COMPOSER_ENV_NAME \ 129 | --location $REGION_LOCATION --format json | \ 130 | jq -r '.config.airflowUri' | \ 131 | awk -F/ '{print $3}' | \ 132 | cut -d '.' -f1) 133 | DAG_NAME=osm_to_big_query_${MODE} 134 | 135 | TRIGGER_FUNCTION_NAME=trigger_osm_to_big_query_dg_gcf_${SUFFIX} 136 | gcloud functions deploy $TRIGGER_FUNCTION_NAME \ 137 | --source triggering/trigger_osm_to_big_query_dg_gcf \ 138 | --entry-point trigger_dag \ 139 | --runtime python37 \ 140 | --trigger-resource $TRANSFER_BUCKET_NAME \ 141 | --trigger-event google.storage.object.finalize \ 142 | --set-env-vars COMPOSER_CLIENT_ID=$COMPOSER_CLIENT_ID,COMPOSER_WEBSERVER_ID=$COMPOSER_WEBSERVER_ID,DAG_NAME=$DAG_NAME 143 | 144 | # 13.
Deploy DAG files and their dependencies 145 | if [ "$MODE" = "planet" ] 146 | then 147 | DAGS_PATH='dags/osm_to_big_query_planet.py dags/transfer_src_file.py dags/*/' 148 | else 149 | DAGS_PATH='dags/osm_to_big_query_history.py dags/transfer_src_file.py dags/*/' 150 | fi 151 | for DAG_ELEMENT in $DAGS_PATH; do 152 | deployment/upload_dags_files.sh $DAG_ELEMENT $COMPOSER_ENV_NAME $REGION_LOCATION 153 | done 154 | -------------------------------------------------------------------------------- /dags/transfer_src_file.py: -------------------------------------------------------------------------------- 1 | import airflow 2 | import os 3 | import logging 4 | import datetime 5 | import json 6 | import base64 7 | import binascii 8 | import time 9 | 10 | import googleapiclient.discovery 11 | 12 | from urllib import request 13 | from airflow.operators import python_operator 14 | from google.cloud import storage 15 | 16 | from utils import gcs_utils 17 | 18 | year_start = datetime.datetime(2020, 1, 1) 19 | 20 | OSM_TRANSFER_INDEX_FILE_NAME_BASE = "osm_transfer_index" 21 | OSM_TRANSFER_INDEX_FILE_NAME_EXT = ".tsv" 22 | OSM_TRANSFER_INDEX_FILE_NAME = OSM_TRANSFER_INDEX_FILE_NAME_BASE + OSM_TRANSFER_INDEX_FILE_NAME_EXT 23 | 24 | project_id = os.environ.get('PROJECT_ID') 25 | osm_url = os.environ.get('OSM_URL') 26 | osm_md5_url = os.environ.get('OSM_MD5_URL') 27 | transfer_index_files_dir_gcs_uri = os.environ.get('TRANSFER_INDEX_FILES_GCS_URI') 28 | gcs_transfer_bucket = os.environ.get('GCS_TRANSFER_BUCKET') 29 | 30 | default_args = { 31 | 'retries': 1, 32 | 'retry_delay': datetime.timedelta(minutes=1), 33 | } 34 | 35 | with airflow.DAG( 36 | 'transferring_src_osm_file', 37 | catchup=False, 38 | default_args=default_args, 39 | start_date=year_start, 40 | schedule_interval="@weekly") as dag: 41 | 42 | def transfer_to_gcs(): 43 | md5_file_lines = read_file_lines_from_url(osm_md5_url) 44 | logging.info(md5_file_lines) 45 | 46 | md5_hex = get_md5_hash_from_md5_file_lines(md5_file_lines) 47 | logging.info(md5_hex) 48 | 49 | base64_md5_file_hash = md5_hex_to_base64(md5_hex) 50 | 51 | content_length = get_content_length_from_url(osm_url) 52 | logging.info(content_length) 53 | 54 | osm_transfer_index_file_name = create_transfer_index_tsv(OSM_TRANSFER_INDEX_FILE_NAME, 55 | osm_url, 56 | content_length, 57 | base64_md5_file_hash) 58 | index_gcs_bucket, index_gcs_dir = gcs_utils.parse_uri_to_bucket_and_filename(transfer_index_files_dir_gcs_uri) 59 | list_url = upload_file_to_gcs_as_public(osm_transfer_index_file_name, 60 | index_gcs_bucket, 61 | index_gcs_dir) 62 | 63 | job_dict = create_transfer_job_dict(project_id, list_url, gcs_transfer_bucket) 64 | execute_transfer_job(job_dict) 65 | 66 | 67 | def read_file_lines_from_url(url): 68 | logging.info(url) 69 | 70 | request.urlcleanup() 71 | data = request.urlopen(url) 72 | return [byte_str_to_str(line) for line in data] 73 | 74 | 75 | def byte_str_to_str(byte_str): 76 | return byte_str.decode("utf-8") 77 | 78 | 79 | def get_md5_hash_from_md5_file_lines(lines): 80 | first_line = lines[0] 81 | return first_line.split()[0] 82 | 83 | 84 | def get_content_length_from_url(url): 85 | data = request.urlopen(url) 86 | meta = data.info() 87 | return meta.get(name="Content-Length") 88 | 89 | 90 | def md5_hex_to_base64(md5_hex): 91 | return byte_str_to_str(to_base64(from_hex_to_binary(md5_hex))) 92 | 93 | 94 | def from_hex_to_binary(hex): 95 | return binascii.unhexlify(hex) 96 | 97 | 98 | def to_base64(byte_str): 99 | return base64.b64encode(byte_str) 100 | 101 | 102 | def
create_transfer_index_tsv(osm_transfer_index_file_name, url, content_length, md5_hash): 103 | header_line = "TsvHttpData-1.0" 104 | lines = [header_line, "\t".join([url, content_length, md5_hash])] 105 | lines = [line + "\n" for line in lines] 106 | with open(osm_transfer_index_file_name, "w") as osm_transfer_index_file: 107 | osm_transfer_index_file.writelines(lines) 108 | return osm_transfer_index_file_name 109 | 110 | 111 | def upload_file_to_gcs_as_public(osm_transfer_index_file_name, gcs_data_bucket, osm_transfer_index_gcs_dir): 112 | client = storage.Client() 113 | 114 | osm_transfer_index_gcs_name = osm_transfer_index_gcs_dir \ 115 | + add_timestamped_suffix(OSM_TRANSFER_INDEX_FILE_NAME_BASE) \ 116 | + OSM_TRANSFER_INDEX_FILE_NAME_EXT 117 | bucket = client.get_bucket(gcs_data_bucket) 118 | dest_blob = bucket.blob(osm_transfer_index_gcs_name) 119 | dest_blob.upload_from_filename(osm_transfer_index_file_name) 120 | 121 | dest_blob.make_public() 122 | 123 | return dest_blob.public_url 124 | 125 | 126 | def add_timestamped_suffix(name): 127 | return name + "_" + str(time.time()).split(".")[0] 128 | 129 | 130 | def bucket_name_and_file_name_from_gcs_uri(gcs_uri): 131 | gcs_uri_without_gs_part = gcs_uri.split("//")[-1] 132 | uri_parts = gcs_uri_without_gs_part.split("/") 133 | 134 | return uri_parts[0], "/".join(uri_parts[1:]) 135 | 136 | 137 | def create_transfer_job_dict(project_id, list_url, transfer_bucket): 138 | now_datetime = datetime.datetime.now() 139 | transfer_datetime = now_datetime + datetime.timedelta(minutes=3) 140 | 141 | job_description = "transfer--{}".format(transfer_datetime.strftime("%Y-%m-%d--%H-%M-%S")) 142 | job_name = "transferJobs/{}".format(job_description) 143 | overwrite_objects_already_existing_in_sink = True 144 | 145 | transfer_date = { 146 | "day": transfer_datetime.day, 147 | "month": transfer_datetime.month, 148 | "year": transfer_datetime.year 149 | } 150 | transfer_time = { 151 | "hours": transfer_datetime.hour, 152 | "minutes": transfer_datetime.minute, 153 | "seconds": transfer_datetime.second 154 | } 155 | status = "ENABLED" 156 | transfer_job = { 157 | "name": job_name, 158 | "description": job_description, 159 | "transferSpec": { 160 | "httpDataSource": { 161 | "listUrl": list_url 162 | }, 163 | "gcsDataSink": { 164 | "bucketName": transfer_bucket 165 | }, 166 | "transferOptions": { 167 | "overwriteObjectsAlreadyExistingInSink": 168 | overwrite_objects_already_existing_in_sink 169 | } 170 | }, 171 | "projectId": project_id, 172 | "schedule": { 173 | "scheduleEndDate": transfer_date, 174 | "scheduleStartDate": transfer_date, 175 | "startTimeOfDay": transfer_time 176 | }, 177 | "status": status 178 | } 179 | return transfer_job 180 | 181 | 182 | def execute_transfer_job(job_dict): 183 | storage_transfer = googleapiclient.discovery.build('storagetransfer', 'v1') 184 | logging.info('Requesting transferJob: {}'.format( 185 | job_dict)) 186 | result = storage_transfer.transferJobs().create(body=job_dict).execute() 187 | logging.info('Returned transferJob: {}'.format( 188 | json.dumps(result, indent=4))) 189 | 190 | 191 | transferring_to_gcs = python_operator.PythonOperator( 192 | task_id='transferring_to_gcs', 193 | python_callable=transfer_to_gcs) 194 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/osm_index.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import logging 4 | import time 
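# Editor's note on the design below: each OSM element version is stored in SQLite
# keyed by (id, version) together with its epoch timestamp, so a lookup such as
# "the latest version of element X strictly before timestamp T" becomes a single
# indexed ORDER BY ... DESC query. Shard databases built in parallel are merged
# into one file via SQLite's ATTACH (see merge_dbs at the bottom of this file).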
5 | 6 | 7 | class OsmIndex(object): 8 | def __init__(self): 9 | pass 10 | 11 | def create(self): 12 | pass 13 | 14 | def save(self): 15 | pass 16 | 17 | def close(self): 18 | pass 19 | 20 | 21 | class SQLiteOsmIndex(OsmIndex): 22 | 23 | def __init__(self, db_file_path): 24 | super().__init__() 25 | self.db_file_path = db_file_path 26 | self.sqlite3_connection = sqlite3.connect(db_file_path) 27 | self.osm_index_db_cursor = self.sqlite3_connection.cursor() 28 | self.query_time = 0 29 | self.query_counter = 0 30 | 31 | self.tables_and_fields = {"nodes": { 32 | "id": "INT", 33 | "version": "INT", 34 | "osm_timestamp": "INT", 35 | "longitude": "REAL", 36 | "latitude": "REAL", 37 | }, "ways": { 38 | "id": "INT", 39 | "version": "INT", 40 | "osm_timestamp": "INT", 41 | "nodes": "TEXT" 42 | }, "relations": { 43 | "id": "INT", 44 | "version": "INT", 45 | "osm_timestamp": "INT", 46 | "members": "TEXT" 47 | }} 48 | 49 | self.nodes_fields_list = list(self.tables_and_fields["nodes"].keys()) 50 | self.ways_fields_list = list(self.tables_and_fields["ways"].keys()) 51 | self.relations_fields_list = list(self.tables_and_fields["relations"].keys()) 52 | 53 | self.nodes_fields_str = ",".join(self.nodes_fields_list) 54 | self.ways_fields_str = ",".join(self.ways_fields_list) 55 | self.relations_fields_str = ",".join(self.relations_fields_list) 56 | 57 | def get_db_file_path(self): 58 | return self.db_file_path 59 | 60 | def get_query_time(self): 61 | return self.query_time 62 | 63 | def reset_query_time(self): 64 | self.query_time = 0 65 | 66 | def get_query_counter(self): 67 | return self.query_counter 68 | 69 | def reset_query_counter(self): 70 | self.query_counter = 0 71 | 72 | def create(self): 73 | self.init_all_tables() 74 | 75 | def save(self): 76 | self.sqlite3_connection.commit() 77 | 78 | def close(self): 79 | self.save() 80 | self.sqlite3_connection.close() 81 | 82 | def init_all_tables(self): 83 | for table, fields_dicts in self.tables_and_fields.items(): 84 | fields = ["{} {}".format(field_name, field_type) for field_name, field_type in fields_dicts.items()] 85 | self.osm_index_db_cursor.execute('CREATE TABLE {} ({})'.format(table, ",".join(fields))) 86 | self.osm_index_db_cursor.execute('CREATE INDEX idx_{}_id_version ON {} (id, version)'.format(table, table)) 87 | self.save() 88 | 89 | def execute_query(self, query, values=None): 90 | query_start_timestamp = time.time() 91 | if values: 92 | self.osm_index_db_cursor.execute(query, values) 93 | else: 94 | self.osm_index_db_cursor.execute(query) 95 | self.query_time += (time.time() - query_start_timestamp) 96 | self.query_counter += 1 97 | 98 | def add_values_to_sqlite_table(self, table_name, values): 99 | placeholders = ",".join(["?"] * len(values)) 100 | query = "INSERT INTO {} VALUES ({})".format(table_name, placeholders) 101 | self.execute_query(query, values) 102 | 103 | def get_id_version_timestamp_all_tags_from_osm_obj(self, osm_obj): 104 | return str(osm_obj["id"]), str(osm_obj["version"]), str(osm_obj["osm_timestamp"]) 105 | 106 | def add_node_to_index(self, node_dict): 107 | osm_id, ver, timestamp = self.get_id_version_timestamp_all_tags_from_osm_obj(node_dict) 108 | lon = node_dict["longitude"] if "longitude" in node_dict else None 109 | lat = node_dict["latitude"] if "latitude" in node_dict else None 110 | self.add_values_to_sqlite_table("nodes", [osm_id, ver, timestamp, lon, lat]) 111 | 112 | def add_way_to_index(self, way_dict): 113 | osm_id, ver, timestamp = self.get_id_version_timestamp_all_tags_from_osm_obj(way_dict) 114 | node_ids
= json.dumps(way_dict["nodes"]) 115 | self.add_values_to_sqlite_table("ways", [osm_id, ver, timestamp, node_ids]) 116 | 117 | def add_relation_to_index(self, relation_dict): 118 | osm_id, ver, timestamp = self.get_id_version_timestamp_all_tags_from_osm_obj(relation_dict) 119 | members = json.dumps(relation_dict["members"]) 120 | self.add_values_to_sqlite_table("relations", [osm_id, ver, timestamp, members]) 121 | 122 | def get_row_from_index_by_timestamp(self, table_name, id, timestamp, fields_str=None): 123 | query = "SELECT {} FROM {} WHERE id={} AND osm_timestamp<{} ORDER BY osm_timestamp DESC" \ 124 | .format(fields_str if fields_str else "*", table_name, id, timestamp) 125 | self.execute_query(query) 126 | return self.osm_index_db_cursor.fetchone() 127 | 128 | def get_node_from_index_by_timestamp(self, node_id, timestamp): 129 | node_data = self.get_row_from_index_by_timestamp("nodes", node_id, timestamp) 130 | if not node_data: 131 | return 132 | 133 | node_dict = {field: node_data[index] for index, field in enumerate(self.nodes_fields_list)} 134 | return node_dict 135 | 136 | def get_way_from_index_by_timestamp(self, way_id, timestamp): 137 | way_data = self.get_row_from_index_by_timestamp("ways", way_id, timestamp) 138 | if not way_data: 139 | return 140 | 141 | way_dict = {} 142 | for index, field in enumerate(self.ways_fields_list): 143 | if field == "nodes": 144 | way_dict["nodes"] = json.loads(way_data[index]) 145 | else: 146 | way_dict[field] = way_data[index] 147 | return way_dict 148 | 149 | def get_relation_from_index_by_timestamp(self, relation_id, timestamp): 150 | relation_data = self.get_row_from_index_by_timestamp("relations", relation_id, timestamp) 151 | if not relation_data: 152 | return 153 | 154 | relation_dict = {} 155 | for index, field in enumerate(self.relations_fields_list): 156 | if field == "members": 157 | relation_dict["members"] = json.loads(relation_data[index]) 158 | else: 159 | relation_dict[field] = relation_data[index] 160 | return relation_dict 161 | 162 | def merge_identical_db(self, db_file_to_merge): 163 | tables = list(self.tables_and_fields.keys()) 164 | db_to_merge_temp_name = "dbToMerge" 165 | self.execute_query("ATTACH '{}' as {}".format(db_file_to_merge, db_to_merge_temp_name)) 166 | for table in tables: 167 | self.execute_query("INSERT into {} SELECT * FROM {}.{}" 168 | .format(table, db_to_merge_temp_name, table)) 169 | self.sqlite3_connection.commit() 170 | self.execute_query("DETACH {}".format(db_to_merge_temp_name)) 171 | 172 | 173 | def merge_dbs(new_db_file, db_paths, db_exists): 174 | new_db = SQLiteOsmIndex(new_db_file) 175 | if not db_exists: 176 | new_db.create() 177 | for db_path in db_paths: 178 | logging.info("Merging {} into {}".format(db_path, new_db_file)) 179 | new_db.merge_identical_db(db_path) 180 | new_db.close() 181 | return new_db_file 182 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_to_nodes_ways_relations/src/pbf_parser.py: -------------------------------------------------------------------------------- 1 | import osmium 2 | import logging 3 | import json 4 | import argparse 5 | import os 6 | import errno 7 | import time 8 | import threading 9 | import multiprocessing 10 | 11 | 12 | from datetime import datetime 13 | from google.cloud import storage 14 | 15 | 16 | def osm_entity_to_dict(osm_entity): 17 | all_tags = [{"key": tag.k, "value": tag.v} for tag in osm_entity.tags] 18 | return {"id": osm_entity.id, "all_tags": all_tags}
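# Editor's note on the helpers below: each function maps a pyosmium entity to a
# plain dict whose keys mirror the BigQuery table schemas for nodes, ways and
# relations, so the handler can emit every element as one JSON line (JSONL)
# that a BigQuery load job can ingest directly.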
19 | 20 | 21 | def osm_entity_to_dict_full(osm_entity): 22 | base_dict = osm_entity_to_dict(osm_entity) 23 | base_dict.update({ 24 | "version": osm_entity.version, 25 | "username": osm_entity.user, 26 | "changeset": osm_entity.changeset, 27 | "visible": osm_entity.visible, 28 | "osm_timestamp": int(datetime.timestamp(osm_entity.timestamp)), 29 | }) 30 | return base_dict 31 | 32 | 33 | def osm_entity_node_dict(osm_node_entity): 34 | base_dict = osm_entity_to_dict_full(osm_node_entity) 35 | if osm_node_entity.location.valid(): 36 | base_dict["latitude"] = osm_node_entity.location.lat 37 | base_dict["longitude"] = osm_node_entity.location.lon 38 | else: 39 | base_dict["latitude"] = None 40 | base_dict["longitude"] = None 41 | return base_dict 42 | 43 | 44 | def osm_entity_way_dict(osm_way_entity): 45 | base_dict = osm_entity_to_dict_full(osm_way_entity) 46 | base_dict["nodes"] = [{"id": node.ref} for node in osm_way_entity.nodes] 47 | return base_dict 48 | 49 | 50 | def osm_entity_relation_dict(osm_relation_entity): 51 | base_dict = osm_entity_to_dict_full(osm_relation_entity) 52 | base_dict["members"] = [{"type": member.type, "id": member.ref, "role": member.role} 53 | for member in iter(osm_relation_entity.members)] 54 | return base_dict 55 | 56 | 57 | class CustomHandler(osmium.SimpleHandler): 58 | 59 | def __init__(self, files_dict, pool_size, pool_index): 60 | osmium.SimpleHandler.__init__(self) 61 | self.entities_out_files_dict = files_dict 62 | self.processing_counter = 0 63 | 64 | self.last_log_time = time.time() 65 | self.pool_size = pool_size 66 | self.pool_index = pool_index 67 | self._lock = threading.Lock() 68 | self.geo_json_factory = osmium.geom.GeoJSONFactory() 69 | 70 | def log_processing(self, entity_type): 71 | if self.processing_counter % 1000000 == 0: 72 | logging.info(entity_type + " (pool_index {}) ".format(str(self.pool_index)) + str(self.processing_counter) 73 | + " " + str(time.time() - self.last_log_time)) 74 | self.last_log_time = time.time() 75 | 76 | def node(self, node): 77 | self.processing_counter = self.processing_counter + 1 78 | 79 | self.log_processing("nodes") 80 | if self.processing_counter % self.pool_size == self.pool_index: 81 | node_dict = osm_entity_node_dict(node) 82 | self.write_to_dict("nodes", node_dict) 83 | 84 | def way(self, way): 85 | self.processing_counter = self.processing_counter + 1 86 | 87 | self.log_processing("ways") 88 | if self.processing_counter % self.pool_size == self.pool_index: 89 | way_dict = osm_entity_way_dict(way) 90 | self.write_to_dict("ways", way_dict) 91 | 92 | def relation(self, relation): 93 | self.processing_counter = self.processing_counter + 1 94 | 95 | self.log_processing("relations") 96 | if self.processing_counter % self.pool_size == self.pool_index: 97 | relation_dict = osm_entity_relation_dict(relation) 98 | self.write_to_dict("relations", relation_dict) 99 | 100 | def process_as_base_osm_entity(self, osm_entity, entity_type): 101 | self.processing_counter = self.processing_counter + 1 102 | 103 | self.log_processing(entity_type) 104 | if self.processing_counter % self.pool_size == self.pool_index: 105 | node_dict = osm_entity_to_dict_full(osm_entity) 106 | self.write_to_dict(entity_type, node_dict) 107 | 108 | def write_to_dict(self, entity_type, entity_dict): 109 | with self._lock: 110 | self.entities_out_files_dict[entity_type].write(json.dumps(entity_dict) + "\n") 111 | 112 | 113 | def make_dir_for_file_if_not_exists(filename): 114 | if not os.path.exists(os.path.dirname(filename)): 115 | try: 116 |
os.makedirs(os.path.dirname(filename)) 117 | except OSError as exc: # Guard against race condition 118 | if exc.errno != errno.EEXIST: 119 | raise 120 | 121 | 122 | def from_gcs_to_local_file(src_gcs_bucket, src_gcs_name, local_file_path): 123 | storage_client = storage.Client(os.environ['PROJECT_ID']) 124 | # Create a bucket object for our bucket 125 | bucket = storage_client.get_bucket(src_gcs_bucket) 126 | # Create a blob object from the filepath 127 | blob = bucket.blob(src_gcs_name) 128 | # Download the file to a destination 129 | logging.info("Downloading gs://{}/{} to {}...".format(src_gcs_bucket, src_gcs_name, local_file_path)) 130 | blob.download_to_filename(local_file_path) 131 | logging.info("Successfully downloaded gs://{}/{} to {}".format(src_gcs_bucket, src_gcs_name, local_file_path)) 132 | 133 | 134 | def upload_file_to_gcs(filename, destination_bucket_name, destination_blob_name): 135 | """ 136 | Uploads a file to a given Cloud Storage bucket and returns the public url 137 | to the new object. 138 | """ 139 | bucket = storage.Client().bucket(destination_bucket_name) 140 | blob = bucket.blob(destination_blob_name) 141 | logging.info("Uploading of {} to gs://{}/{}...".format(filename, destination_bucket_name, destination_blob_name)) 142 | blob.upload_from_filename( 143 | filename, 144 | content_type="text/plain") 145 | logging.info( 146 | "Finished uploading of {} to gs://{}/{}".format(filename, destination_bucket_name, destination_blob_name)) 147 | 148 | 149 | def parse_uri_to_bucket_and_filename(file_path): 150 | """Divides file uri to bucket name and file name""" 151 | path_parts = file_path.split("//") 152 | if len(path_parts) >= 2: 153 | main_part = path_parts[1] 154 | if "/" in main_part: 155 | divide_index = main_part.index("/") 156 | bucket_name = main_part[:divide_index] 157 | file_name = main_part[divide_index + 1 - len(main_part):] 158 | 159 | return bucket_name, file_name 160 | return "", "" 161 | 162 | 163 | def process_pbf(pool_index): 164 | simple_handler = CustomHandler(entities_out_files_dict, pool_size, pool_index) 165 | simple_handler.apply_file(dest_local_path) 166 | 167 | 168 | def run_pbf_processing_in_parallel(pool_size): 169 | pool = multiprocessing.Pool(pool_size) 170 | for pool_index in range(pool_size): 171 | pool.apply_async(process_pbf, [pool_index]) 172 | pool.close() 173 | pool.join() 174 | 175 | 176 | if __name__ == "__main__": 177 | logging.getLogger().setLevel(logging.INFO) 178 | 179 | parser = argparse.ArgumentParser() 180 | parser.add_argument("src_pbf_file_uri", help="The source PBF file to be converted") 181 | parser.add_argument("dest_gcs_dir", help="URI of GCS dir to save result files") 182 | parser.add_argument("--num_threads", help="Number of parallel threads for processing", default="3") 183 | 184 | args = parser.parse_args() 185 | 186 | src_bucket, src_name = parse_uri_to_bucket_and_filename(args.src_pbf_file_uri) 187 | 188 | data_dir = os.environ['DATA_DIR'] 189 | dest_local_path = data_dir + "planet.osm.pbf" 190 | make_dir_for_file_if_not_exists(dest_local_path) 191 | from_gcs_to_local_file(src_bucket, src_name, dest_local_path) 192 | 193 | entities = ["nodes", "ways", "relations"] 194 | 195 | entities_out_files_dict = {} 196 | results_local_paths = [] 197 | for entity in entities: 198 | path = data_dir + "{}.jsonl".format(entity) 199 | results_local_paths.append(path) 200 | 201 | make_dir_for_file_if_not_exists(path) 202 | entities_out_files_dict[entity] = open(path, "w") 203 | 204 | logging.info("Creating {} 
files".format(str(results_local_paths))) 205 | 206 | pool_size = int(args.num_threads) 207 | run_pbf_processing_in_parallel(pool_size) 208 | 209 | for entity, out_file in entities_out_files_dict.items(): 210 | out_file.close() 211 | 212 | dest_bucket, dest_dir_name = parse_uri_to_bucket_and_filename(args.dest_gcs_dir) 213 | for path in results_local_paths: 214 | dest_file_gcs_name = dest_dir_name + path.split("/")[-1] 215 | upload_file_to_gcs(path, dest_bucket, dest_file_gcs_name) 216 | -------------------------------------------------------------------------------- /examples/clustering/colors/vectorize.py: -------------------------------------------------------------------------------- 1 | import re 2 | import textract 3 | from pathlib import Path 4 | import numpy as np 5 | import json 6 | 7 | import nltk 8 | from nltk.corpus import stopwords 9 | from nltk.stem import WordNetLemmatizer 10 | 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | 13 | nltk.download('punkt') 14 | 15 | nltk.download('stopwords') 16 | stop_words = set(stopwords.words('english')).union({'etc', 'note', 'also', 'occur'}) 17 | 18 | nltk.download('wordnet') 19 | lemmatizer = WordNetLemmatizer() 20 | 21 | embeddings_dict = {} 22 | 23 | with open("../data/glove.6B.300d.txt", 'r', encoding="utf-8") as f: 24 | for line in f: 25 | values = line.split() 26 | word = values[0] 27 | vector = np.asarray(values[1:], "float32") 28 | embeddings_dict[word] = vector 29 | 30 | rows = [ 31 | {'dimension': 'Activity', 'code': 1000, 'color': 'FF00FF', 'name': 'Residential activities'}, 32 | {'dimension': 'Activity', 'code': 2000, 'color': 'FF0000', 'name': 'Shopping, business, or trade activities'}, 33 | {'dimension': 'Activity', 'code': 3000, 'color': 'A0F020', 34 | 'name': 'Industrial, manufacturing, and waste- related activities'}, 35 | {'dimension': 'Activity', 'code': 4000, 'color': '00FF00', 36 | 'name': 'Social, institutional, or infrastructure- related activities'}, 37 | {'dimension': 'Activity', 'code': 5000, 'color': 'BEBEBE', 'name': 'Travel or movement activities'}, 38 | {'dimension': 'Activity', 'code': 6000, 'color': '2F4F4F', 'name': 'Mass assembly of people'}, 39 | {'dimension': 'Activity', 'code': 7000, 'color': '9090EE', 'name': 'Leisure activities'}, 40 | {'dimension': 'Activity', 'code': 8000, 'color': '22228B', 'name': 'Natural resources-related activities'}, 41 | {'dimension': 'Activity', 'code': 9000, 'color': 'FFFFFF', 'name': 'No human activity or unclassifiable activity'}, 42 | 43 | {'dimension': 'Function', 'code': 1000, 'color': 'FF00FF', 'name': 'Residence or accommodation functions'}, 44 | {'dimension': 'Function', 'code': 2000, 'color': 'FF0000', 'name': 'General sales or services'}, 45 | {'dimension': 'Function', 'code': 3000, 'color': 'A0F020', 'name': 'Manufacturing and wholesale trade'}, 46 | {'dimension': 'Function', 'code': 4000, 'color': 'BEBEBE', 47 | 'name': 'Transportation, communication, information, and utilities'}, 48 | {'dimension': 'Function', 'code': 5000, 'color': '9090EE', 'name': 'Arts, entertainment, and recreation'}, 49 | {'dimension': 'Function', 'code': 6000, 'color': '00FF00', 50 | 'name': 'Education, public admin., health care, andother inst.'}, 51 | {'dimension': 'Function', 'code': 7000, 'color': '008B8B', 'name': 'Construction-related businesses'}, 52 | {'dimension': 'Function', 'code': 8000, 'color': '558B00', 'name': 'Mining and extraction establishments'}, 53 | {'dimension': 'Function', 'code': 9000, 'color': '22228B', 'name': 'Agriculture, forestry, fishing 
and hunting'}, 54 | 55 | {'dimension': 'Ownership', 'code': 1000, 'color': 'F5DCF5', 'name': 'No constraints--private ownership'}, 56 | {'dimension': 'Ownership', 'code': 2000, 'color': '00FF00', 57 | 'name': 'Some constraints--easements or other use restrictions'}, 58 | {'dimension': 'Ownership', 'code': 3000, 'color': '008B00', 59 | 'name': 'Limited restrictions--leased and other tenancy restrictions'}, 60 | {'dimension': 'Ownership', 'code': 4000, 'color': '9090EE', 61 | 'name': 'Public restrictions--local, state, and federal ownership'}, 62 | {'dimension': 'Ownership', 'code': 5000, 'color': '000064', 63 | 'name': 'Other public use restrictions--regional, special districts, etc'}, 64 | {'dimension': 'Ownership', 'code': 6000, 'color': '6B238E', 'name': 'Nonprofit ownership restrictions'}, 65 | {'dimension': 'Ownership', 'code': 7000, 'color': 'BEBEBE', 'name': 'Joint ownership character--public entities'}, 66 | {'dimension': 'Ownership', 'code': 8000, 'color': '000000', 67 | 'name': 'Joint ownership character--public, private, nonprofit, etc.'}, 68 | {'dimension': 'Ownership', 'code': 9000, 'color': 'FFFFFF', 'name': 'Not applicable to this dimension'}, 69 | 70 | {'dimension': 'Site', 'code': 1000, 'color': '9090EE', 'name': 'Site in natural state'}, 71 | {'dimension': 'Site', 'code': 2000, 'color': 'F5DCF5', 'name': 'Developing site'}, 72 | {'dimension': 'Site', 'code': 3000, 'color': 'CD9EB7', 'name': 'Developed site -- crops, grazing, forestry, etc.'}, 73 | {'dimension': 'Site', 'code': 4000, 'color': '8B667E', 'name': 'Developed site -- no buildings and no structures'}, 74 | {'dimension': 'Site', 'code': 5000, 'color': '8B2B00', 'name': 'Developed site -- nonbuilding structures'}, 75 | {'dimension': 'Site', 'code': 6000, 'color': '8B2323', 'name': 'Developed site -- with buildings'}, 76 | {'dimension': 'Site', 'code': 7000, 'color': '22228B', 'name': 'Developed site -- with parks'}, 77 | {'dimension': 'Site', 'code': 8000, 'color': 'D3D3D3', 'name': 'Not applicable to this dimension'}, 78 | {'dimension': 'Site', 'code': 9000, 'color': 'FFFFFF', 'name': 'Unclassifiable site development character'}, 79 | 80 | {'dimension': 'Structure', 'code': 1000, 'color': 'FF00FF', 'name': 'Residential buildings'}, 81 | {'dimension': 'Structure', 'code': 2000, 'color': 'FF0000', 'name': 'Commercial buildings and other specialized structures'}, 82 | {'dimension': 'Structure', 'code': 3000, 'color': 'A0F020', 'name': 'Public assembly structures'}, 83 | {'dimension': 'Structure', 'code': 4000, 'color': '00FF00', 'name': 'Institutional or community facilities'}, 84 | {'dimension': 'Structure', 'code': 5000, 'color': 'BEBEBE', 'name': 'Transportation-related facilities'}, 85 | {'dimension': 'Structure', 'code': 6000, 'color': '858585', 'name': 'Utility and other nonbuilding structures'}, 86 | {'dimension': 'Structure', 'code': 7000, 'color': 'FFCBC0', 'name': 'Specialized military structures'}, 87 | {'dimension': 'Structure', 'code': 8000, 'color': '22228B', 'name': 'Sheds, farm buildings, or agricultural facilities'}, 88 | {'dimension': 'Structure', 'code': 9000, 'color': 'FFFFFF', 'name': 'No structure'} 89 | ] 90 | 91 | 92 | def tokenize(text): 93 | tokens = nltk.word_tokenize(text.lower().replace('-', ' ')) 94 | filtered = [t for t in tokens if t not in stop_words and t.isalpha()] 95 | 96 | return [lemmatizer.lemmatize(t) for t in filtered] 97 | 98 | 99 | def mean_vector(tokens): 100 | count = 0 101 | sum_vector = np.zeros(300) 102 | for token in tokens: 103 | if token not in embeddings_dict: 104 | 
continue 105 | sum_vector += embeddings_dict[token] 106 | count += 1 107 | 108 | return sum_vector / count if count else sum_vector  # guard: no token had an embedding 109 | 110 | 111 | def vectorize(): 112 | try: 113 | text = Path('../data/LBCS.txt').read_text() 114 | except FileNotFoundError: 115 | text = textract.process('../data/LBCS.pdf') 116 | text = text.decode('utf-8') 117 | Path('../data/LBCS.txt').write_text(text) 118 | 119 | # split the text into parts, one per dimension's details 120 | dimensions_details = re.split(r'\w Dimension with Detail', text)[1:] 121 | # remove the trailing text from the last part 122 | dimensions_details[-1] = re.split('LBCS Top Level Codes for all Dimensions', dimensions_details[-1])[0] 123 | 124 | corpus = [] 125 | vectors = [] 126 | for dimension in dimensions_details: 127 | # split by class codes 128 | codes = re.split('1000|2000|3000|4000|5000|6000|7000|8000|9000|9999', dimension)[1:10] 129 | 130 | for code in codes: 131 | tokenized = tokenize(code) 132 | corpus.append(' '.join(tokenized)) 133 | # vec = mean_vector(tokenized) 134 | # vectors.append(vec) 135 | 136 | vectorizer = TfidfVectorizer() 137 | X = vectorizer.fit_transform(corpus) 138 | features = vectorizer.get_feature_names()  # renamed to get_feature_names_out() in newer scikit-learn 139 | 140 | top_k = 10 141 | for tf_idf_vector in X: 142 | tf_idf_arr = tf_idf_vector[0].toarray().reshape(-1) 143 | 144 | # for top-k implementation 145 | # max_indexes = np.argpartition(tf_idf_arr, -top_k)[-top_k:] 146 | 147 | # just plain tf-idf 148 | weights = tf_idf_arr / sum(tf_idf_arr) 149 | # softmax 150 | # weights = np.exp(tf_idf_arr) / np.sum(np.exp(tf_idf_arr)) 151 | # like softmax but with tanh 152 | # weights = np.tanh(tf_idf_arr) / np.sum(np.tanh(tf_idf_arr)) 153 | 154 | sum_vector = np.zeros(300) 155 | for i, feature in enumerate(features): 156 | if feature not in embeddings_dict: 157 | continue 158 | sum_vector += weights[i] * embeddings_dict[feature] 159 | vectors.append(sum_vector) 160 | 161 | assert len(rows) == len(vectors) 162 | for (row, vec) in zip(rows, vectors): 163 | for i in range(len(vec)): 164 | row['f{}'.format(i+1)] = vec[i] 165 | print(json.dumps(row)) 166 | 167 | 168 | if __name__ == '__main__': 169 | vectorize() 170 | -------------------------------------------------------------------------------- /dags/utils/metadata_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import time 4 | 5 | from utils import gcs_utils 6 | 7 | OSM_ENTITIES = ["nodes", "ways", "relations"] 8 | 9 | 10 | def file_name_without_ext(file_name): 11 | if "." 
in file_name: 12 | return file_name.split(".")[0] 13 | else: 14 | return file_name 15 | 16 | 17 | def file_name_from_path(file_path): 18 | if "/" in file_path: 19 | return file_path.split("/")[-1] 20 | else: 21 | return file_path 22 | 23 | 24 | def get_index_metadata_file_path(src_osm_name, num_db_shards): 25 | return file_name_without_ext(src_osm_name) + "_{}_index_shards.metadata.txt".format(num_db_shards) 26 | 27 | 28 | def get_result_shard_metadata_file_path(src_osm_name, entity_type, index, num_results_shards): 29 | return file_name_without_ext(src_osm_name) + "_{}_{}_{}.metadata.txt".format(entity_type, index + 1, 30 | num_results_shards) 31 | 32 | 33 | def download_and_read_metadata_file(gcs_bucket, gcs_dir_name, src_osm_name, num_db_shards, num_results_shards): 34 | src_osm_file_name = file_name_from_path(src_osm_name) 35 | 36 | index_metadata_file_path = get_index_metadata_file_path(src_osm_file_name, num_db_shards) 37 | index_metadata_blob_name = gcs_dir_name + index_metadata_file_path 38 | if gcs_utils.is_gcs_blob_exists(gcs_bucket, index_metadata_blob_name): 39 | gcs_utils.from_gcs_to_local_file(gcs_bucket, index_metadata_blob_name, index_metadata_file_path) 40 | 41 | shards_metadata_files = {} 42 | for entity in OSM_ENTITIES: 43 | shards_metadata_files_by_entity = {} 44 | for index in range(num_results_shards): 45 | result_shard_metadata_file_path = get_result_shard_metadata_file_path(src_osm_file_name, entity, index, 46 | num_results_shards) 47 | result_shard_metadata_blob_name = gcs_dir_name + result_shard_metadata_file_path 48 | if gcs_utils.is_gcs_blob_exists(gcs_bucket, result_shard_metadata_blob_name): 49 | gcs_utils.from_gcs_to_local_file(gcs_bucket, result_shard_metadata_blob_name, 50 | result_shard_metadata_file_path) 51 | shards_metadata_files_by_entity[str(index)] = result_shard_metadata_file_path 52 | shards_metadata_files[entity] = shards_metadata_files_by_entity 53 | return ProcessingMetadata(index_metadata_file_path, shards_metadata_files) 54 | 55 | 56 | def save_and_upload_metadata_to_gcs(metadata, 57 | dest_bucket, 58 | dest_dir_name, 59 | save_only_shard_by_entity_and_index=None, 60 | only_db_metadata=False): 61 | files_to_save = metadata.save_to_json_files(save_only_shard_by_entity_and_index, only_db_metadata) 62 | 63 | for file_to_save in files_to_save: 64 | timestamps_file_name = file_name_from_path(file_to_save) 65 | timestamps_file_blob_name = dest_dir_name + timestamps_file_name 66 | gcs_utils.upload_file_to_gcs(file_to_save, dest_bucket, timestamps_file_blob_name) 67 | 68 | 69 | class ProcessingMetadata(object): 70 | 71 | def __init__(self, index_metadata_file_path, shards_metadata_files): 72 | self.index_metadata_file_path = index_metadata_file_path 73 | self.shards_metadata_files = shards_metadata_files 74 | try: 75 | with open(index_metadata_file_path, "r") as f: 76 | metadata_json = json.load(f) 77 | self.elements_counter = MetadataCounter(metadata_json["elements_counter"]) 78 | self.index_db_timestamps = FileTimestamps(metadata_json["index_db"]) 79 | except Exception as e: 80 | logging.info(str(e)) 81 | self.elements_counter = MetadataCounter() 82 | self.index_db_timestamps = FileTimestamps() 83 | 84 | self.shards_timestamps = {} 85 | for entity, shards_metadata_files_by_entity in shards_metadata_files.items(): 86 | shards_timestamps_by_entity = {} 87 | for index_str, shards_metadata_file in shards_metadata_files_by_entity.items(): 88 | try: 89 | with open(shards_metadata_file, "r") as f: 90 | metadata_json = json.load(f) 91 | 
shards_timestamps_by_entity[index_str] = FileTimestamps(metadata_json) 92 | except Exception as e: 93 | logging.info(str(e)) 94 | shards_timestamps_by_entity[index_str] = FileTimestamps() 95 | self.shards_timestamps[entity] = shards_timestamps_by_entity 96 | 97 | def update_db_max_timestamp(self, db_max_timestamp): 98 | self.index_db_timestamps.update_max_timestamp(db_max_timestamp) 99 | 100 | def update_db_last_updated(self, db_last_updated): 101 | self.index_db_timestamps.update_last_updated(db_last_updated) 102 | 103 | def update_processing_counter(self, counter_dict): 104 | self.elements_counter.update(counter_dict) 105 | 106 | def get_min_history_results_last_updated_timestamp(self): 107 | return min([min([shard_timestamps.last_updated for shard_index_str, shard_timestamps in 108 | shards_timestamps_by_entity.items()]) for entity, shards_timestamps_by_entity in 109 | self.shards_timestamps.items()]) 110 | 111 | def get_history_results_max_timestamps(self): 112 | last_elements_timestamps = {} 113 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 114 | last_elements_timestamps[entity] = {shard_index_str: shard_timestamps.max_timestamp for 115 | shard_index_str, shard_timestamps in 116 | shards_timestamps_by_entity.items()} 117 | return last_elements_timestamps 118 | 119 | def update_history_result_timestamps(self, entity_type, shard_index): 120 | self.shards_timestamps[entity_type][str(shard_index)].update_max_timestamp( 121 | self.index_db_timestamps.max_timestamp) 122 | self.shards_timestamps[entity_type][str(shard_index)].update_last_updated( 123 | int(time.time())) 124 | 125 | def save_to_json_files(self, specific_history_results_shards_to_save=None, only_db_metadata=False): 126 | files_to_save = [] 127 | if not specific_history_results_shards_to_save: 128 | self.save_db_metadata(files_to_save) 129 | if not only_db_metadata: 130 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 131 | for shard_index_str, shard_timestamps in shards_timestamps_by_entity.items(): 132 | if not specific_history_results_shards_to_save or ( 133 | entity == specific_history_results_shards_to_save[0] and int(shard_index_str) == 134 | int(specific_history_results_shards_to_save[1])): 135 | shard_file = self.shards_metadata_files[entity][shard_index_str] 136 | with open(shard_file, "w") as f: 137 | json.dump(shard_timestamps.to_dict(), f) 138 | files_to_save.append(shard_file) 139 | return files_to_save 140 | 141 | def save_db_metadata(self, files_to_save): 142 | with open(self.index_metadata_file_path, "w") as f: 143 | json.dump({"elements_counter": self.elements_counter.to_dict(), 144 | "index_db": self.index_db_timestamps.to_dict()}, f) 145 | files_to_save.append(self.index_metadata_file_path) 146 | return files_to_save 147 | 148 | def to_dict(self): 149 | history_results = {} 150 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 151 | history_results[entity] = {shard_index_str: shard_timestamps.to_dict() for shard_index_str, shard_timestamps 152 | in 153 | shards_timestamps_by_entity.items()} 154 | return {"elements_counter": self.elements_counter.to_dict(), 155 | "index_db": self.index_db_timestamps.to_dict(), 156 | "history_results": history_results} 157 | 158 | 159 | class MetadataCounter(object): 160 | 161 | def __init__(self, counter_dict=None): 162 | if counter_dict: 163 | self.counter = {entity: counter_dict[entity] for entity in OSM_ENTITIES} 164 | else: 165 | self.counter = {entity: 0 for entity in OSM_ENTITIES} 166 | 
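# Usage sketch (illustrative, values hypothetical): update() performs a wholesale
# replacement of the counter dict rather than incrementing it, so callers
# accumulate counts themselves and push the finished dict in one call:
#
#   mc = MetadataCounter()                                # {"nodes": 0, "ways": 0, "relations": 0}
#   mc.update({"nodes": 10, "ways": 2, "relations": 1})   # replaces the whole dict
#   assert mc.to_dict() == {"nodes": 10, "ways": 2, "relations": 1}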
167 | def update(self, counter): 168 | self.counter = counter 169 | 170 | def to_dict(self): 171 | return self.counter 172 | 173 | 174 | class FileTimestamps(object): 175 | 176 | def __init__(self, timestamps_dict=None): 177 | if timestamps_dict: 178 | self.max_timestamp = timestamps_dict["max_timestamp"] 179 | self.last_updated = timestamps_dict["last_updated"] 180 | else: 181 | self.max_timestamp = 0 182 | self.last_updated = 0 183 | 184 | def update_max_timestamp(self, max_timestamp): 185 | self.max_timestamp = max_timestamp 186 | 187 | def update_last_updated(self, last_updated): 188 | self.last_updated = last_updated 189 | 190 | def to_dict(self): 191 | return {"max_timestamp": self.max_timestamp, "last_updated": self.last_updated} 192 | -------------------------------------------------------------------------------- /examples/clustering/cities/cities.csv: -------------------------------------------------------------------------------- 1 | Name,Class,Latitude,Longitude,Radius 2 | London,Alpha++,51.497084,-0.133168,27700 3 | New York City,Alpha++,40.712728,-74.006015,25000 4 | Beijing,Alpha+,39.906217,116.391276,30000 5 | Dubai,Alpha+,25.20474,55.270707,45000 6 | Hong Kong,Alpha+,22.311106,114.183065,5350 7 | Paris,Alpha+,48.856697,2.351462,10000 8 | Shanghai,Alpha+,31.269942,121.543961,60000 9 | Singapore,Alpha+,1.399896,103.800868,27150 10 | Sydney,Alpha+,-33.834302,151.002299,31800 11 | Tokyo,Alpha+,35.606673,139.679901,50200 12 | Bangkok,Alpha,13.748917,100.526046,27700 13 | Brussels,Alpha,50.864711,4.382204,10000 14 | Buenos Aires,Alpha,-34.6409,-58.529138,36260 15 | Chicago,Alpha,41.853047,-87.677981,20600 16 | Frankfurt,Alpha,50.110644,8.682092,8200 17 | Guangzhou,Alpha,23.130196,113.259294,15000 18 | Istanbul,Alpha,41.086265,28.984227,36400 19 | Jakarta,Alpha,-6.220461,106.827184,30000 20 | Kuala Lumpur,Alpha,3.154439,101.672264,13500 21 | Los Angeles,Alpha,34.053691,-118.242767,40000 22 | Madrid,Alpha,40.416705,-3.703582,18300 23 | Melbourne,Alpha,-37.814218,144.963161,35700 24 | Mexico City,Alpha,19.431334,-99.085799,24500 25 | Miami,Alpha,26.136567,-80.210505,87000 26 | Milan,Alpha,45.4668,9.1905,12000 27 | Moscow,Alpha,55.739994,37.614064,20000 28 | Mumbai,Alpha,19.109836,72.87991,23500 29 | São Paulo,Alpha,-23.571414,-46.611434,33200 30 | Seoul,Alpha,37.554146,126.939154,23000 31 | Taipei,Alpha,25.031295,121.509434,12800 32 | Toronto,Alpha,43.629414,-79.57349,40600 33 | Warsaw,Alpha,52.225226,21.019777,15000 34 | Zürich,Alpha,47.376774,8.531428,8500 35 | Amsterdam,Alpha−,52.371185,4.87738,11000 36 | Barcelona,Alpha−,41.367435,2.14035,11500 37 | Bogotá,Alpha−,4.638176,-74.104699,15700 38 | Budapest,Alpha−,47.486316,19.102289,13200 39 | Dublin,Alpha−,53.349764,-6.260273,10000 40 | Houston,Alpha−,29.806496,-95.366273,41000 41 | Johannesburg,Alpha−,-26.205,28.049722,27100 42 | Lisbon,Alpha−,38.776836,-9.180555,10000 43 | Luxembourg City,Alpha−,49.622753,6.119711,7600 44 | Manila,Alpha−,14.589404,121.00706,30000 45 | Montreal,Alpha−,45.523191,-73.607643,20000 46 | Munich,Alpha−,48.137108,11.575382,12700 47 | New Delhi,Alpha−,28.590348,77.239942,35000 48 | Prague,Alpha−,50.087238,14.444594,15000 49 | Riyadh,Alpha−,24.658759,46.718812,26200 50 | Rome,Alpha−,41.891275,12.493922,11000 51 | San Francisco,Alpha−,37.59455,-122.07066,51200 52 | Santiago,Alpha−,-33.471731,-70.655087,18800 53 | Shenzhen,Alpha−,22.658862,113.985722,25500 54 | Stockholm,Alpha−,59.325117,18.071093,8600 55 | Vienna,Alpha−,48.206523,16.401343,13150 56 | "Washington, D.C.",Alpha−,38.892663,-77.021701,7500 57 | 
Athens,Beta+,38.007191,23.736531,13000 58 | Atlanta,Beta+,33.749099,-84.390185,10000 59 | Auckland,Beta+,-36.850995,174.790646,18700 60 | Bangalore,Beta+,12.970415,77.598166,16600 61 | Boston,Beta+,42.366592,-71.085065,18300 62 | Bucharest,Beta+,44.43957,26.102027,11000 63 | Cairo,Beta+,30.048819,31.243666,42700 64 | Chengdu,Beta+,30.66242,104.063322,26700 65 | Copenhagen,Beta+,55.665856,12.48629,15500 66 | Dallas,Beta+,32.800537,-96.975442,50800 67 | Doha,Beta+,25.262347,51.486612,17500 68 | Düsseldorf,Beta+,51.225402,6.776314,10000 69 | Hamburg,Beta+,53.550341,10.000654,15000 70 | Hangzhou,Beta+,30.179977,120.306829,41200 71 | Hanoi,Beta+,20.984588,105.860798,33000 72 | Ho Chi Minh City,Beta+,10.854796,106.713898,36800 73 | Lima,Beta+,-12.064102,-77.038591,23700 74 | Perth,Beta+,-31.952712,115.86048,30000 75 | Tel Aviv,Beta+,32.039458,34.835288,18700 76 | Vancouver,Beta+,49.152955,-122.54212,55100 77 | Abu Dhabi,Beta,24.478113,54.638329,35700 78 | Beirut,Beta,33.867536,35.544112,12200 79 | Berlin,Beta,52.521147,13.390237,23400 80 | Brisbane,Beta,-27.509236,153.01936,40300 81 | Calgary,Beta,51.023207,-114.062588,20500 82 | Cape Town,Beta,-33.946082,18.596636,21900 83 | Caracas,Beta,10.472662,-66.884744,17400 84 | Casablanca,Beta,33.541338,-7.496539,23700 85 | Chennai,Beta,13.049404,80.228901,18600 86 | Denver,Beta,39.729745,-104.947775,36200 87 | Karachi,Beta,24.891087,67.097051,21600 88 | Kyiv,Beta,50.422067,30.51054,15600 89 | Kuwait City,Beta,29.159419,47.920013,37000 90 | Lagos,Beta,6.643991,3.369447,31700 91 | Manama,Beta,26.179761,50.571947,16900 92 | Minneapolis,Beta,44.926914,-93.235285,36100 93 | Montevideo,Beta,-34.803526,-56.10077,21300 94 | Nairobi,Beta,-1.283439,36.836267,24700 95 | Nanjing,Beta,32.095898,118.809465,31700 96 | Oslo,Beta,59.87104,10.762376,19900 97 | Philadelphia,Beta,39.93799,-75.156659,32400 98 | Rio de Janeiro,Beta,-22.860427,-43.285597,51200 99 | Sofia,Beta,42.684239,23.333859,10800 100 | Tianjin,Beta,39.051002,117.382345,44000 101 | Wuhan,Beta,30.561002,114.315305,26600 102 | Zagreb,Beta,45.804714,15.970193,10000 103 | Almaty,Beta−,43.271979,76.90453,11500 104 | Antwerp,Beta−,51.270318,4.34782,13700 105 | Belgrade,Beta−,44.814647,20.426678,9100 106 | Birmingham,Beta−,52.49245,-1.882436,5400 107 | Bratislava,Beta−,48.148836,17.137814,7300 108 | Changsha,Beta−,28.207032,112.987807,17400 109 | Chongqing,Beta−,29.570495,106.547904,20800 110 | Dalian,Beta−,38.999537,121.717205,21600 111 | Dhaka,Beta−,23.800602,90.409042,11800 112 | Edinburgh,Beta−,55.941777,-3.206826,7300 113 | Geneva,Beta−,46.216487,6.136297,6800 114 | George Town,Beta−,5.410574,100.312495,5800 115 | Helsinki,Beta−,60.213146,24.904762,16800 116 | Jeddah,Beta−,21.559918,39.181155,31300 117 | Jinan,Beta−,36.675486,117.02887,14100 118 | Kampala,Beta−,0.322524,32.59784,14400 119 | Lyon,Beta−,45.747499,4.878689,12500 120 | Manchester,Beta−,53.48285,-2.232186,14800 121 | Monterrey,Beta−,25.723503,-100.307021,22200 122 | Nicosia,Beta−,35.159334,33.355705,10100 123 | Panama City,Beta−,9.041305,-79.463458,13800 124 | Port Louis,Beta−,-20.226562,57.506832,12400 125 | Qingdao,Beta−,36.126453,120.306022,26500 126 | Quito,Beta−,-0.208148,-78.491728,19000 127 | San José,Beta−,9.955537,-84.118714,17100 128 | San Juan,Beta−,18.400626,-66.096104,15500 129 | San Salvador,Beta−,13.714335,-89.182499,14900 130 | Seattle,Beta−,47.598014,-122.200917,69500 131 | Shenyang,Beta−,41.784356,123.454416,24400 132 | Stuttgart,Beta−,48.795404,9.196154,9100 133 | Suzhou,Beta−,31.283432,120.635324,25400 134 | 
Tunis,Beta−,36.789929,10.217612,16700 135 | Valencia,Beta−,39.469108,-0.407841,16600 136 | Xiamen,Beta−,24.491775,118.059237,18200 137 | Accra,Gamma+,5.631676,-0.212893,37600 138 | Adelaide,Gamma+,-34.915843,138.549128,34400 139 | Cleveland,Gamma+,41.41456,-81.652182,28700 140 | Colombo,Gamma+,6.912506,79.901911,19700 141 | Dar es Salaam,Gamma+,-6.842749,39.235055,16000 142 | Detroit,Gamma+,42.406132,-83.125588,39300 143 | Glasgow,Gamma+,55.857124,-4.23721,17200 144 | Guatemala City,Gamma+,14.569075,-90.545258,17400 145 | Guayaquil,Gamma+,-2.151989,-79.900603,15600 146 | Harare,Gamma+,-17.831774,31.045,16100 147 | Hyderabad,Gamma+,25.389097,68.321651,9200 148 | Lahore,Gamma+,31.525877,74.312124,19600 149 | Muscat,Gamma+,23.570524,58.381711,16100 150 | Osaka,Gamma+,34.723985,135.212998,67300 151 | Pune,Gamma+,18.593798,73.844187,21100 152 | Riga,Gamma+,56.950147,24.136084,11600 153 | Rotterdam,Gamma+,51.869146,4.486787,18400 154 | Xi'an,Gamma+,34.30514,108.898319,23900 155 | Zhengzhou,Gamma+,34.758051,113.651027,19729 156 | Ahmedabad,Gamma,23.042155,72.595156,11700 157 | Algiers,Gamma,36.77151,3.11203,14700 158 | Amman,Gamma,32.006863,35.996048,20400 159 | Ankara,Gamma,39.94147,32.771183,19600 160 | Asunción,Gamma,-25.334873,-57.454647,23200 161 | Austin,Gamma,30.402545,-97.695763,36200 162 | Baku,Gamma,40.41852,49.880112,12200 163 | Baltimore,Gamma,39.301992,-76.651303,25700 164 | Belfast,Gamma,54.591213,-5.956015,12500 165 | Bilbao,Gamma,43.289994,-2.977223,13400 166 | Bristol,Gamma,51.478047,-2.596283,9900 167 | Charlotte,Gamma,35.217061,-80.798446,18500 168 | Guadalajara,Gamma,20.635282,-103.367083,18000 169 | Hefei,Gamma,31.834972,117.283132,20200 170 | Islamabad,Gamma,33.613946,73.084453,15200 171 | Kolkata,Gamma,22.588522,88.365255,16100 172 | Kunming,Gamma,24.966022,102.755536,20000 173 | La Paz,Gamma,-16.509127,-68.172818,7800 174 | Ljubljana,Gamma,46.070454,14.518177,8100 175 | Luanda,Gamma,-8.954777,13.278967,27100 176 | Lusaka,Gamma,-15.392474,28.359294,16100 177 | Phoenix,Gamma,33.48492,-112.028775,51000 178 | Porto,Gamma,41.463435,-7.853176,16100 179 | Saint Petersburg,Gamma,59.942837,30.288084,17400 180 | San Diego,Gamma,32.766346,-117.078526,24575 181 | San Jose,Gamma,37.366646,-121.954236,23300 182 | Santo Domingo,Gamma,18.491268,-69.906053,18200 183 | St. 
Louis,Gamma,38.712435,-90.383864,49400 184 | Taiyuan,Gamma,37.810584,112.591887,23400 185 | Tallinn,Gamma,59.424525,24.795728,15800 186 | Tampa,Gamma,27.987801,-82.397335,22300 187 | Tbilisi,Gamma,41.719537,44.831698,16100 188 | Tegucigalpa,Gamma,14.073792,-87.202263,7900 189 | Turin,Gamma,45.071154,7.662587,12800 190 | Vilnius,Gamma,54.689817,25.272954,7600 191 | Wellington,Gamma,-41.276648,174.782394,8700 192 | Belo Horizonte,Gamma−,-19.899461,-44.027494,20800 193 | Cologne,Gamma−,50.934034,6.962721,15800 194 | Curitiba,Gamma−,-25.503295,-49.212235,25300 195 | Durban,Gamma−,-29.845191,30.96253,22200 196 | Fuzhou,Gamma−,27.951573,116.359736,8400 197 | Johor Bahru,Gamma−,1.578175,103.699511,16800 198 | Maputo,Gamma−,-25.817914,32.591492,29700 199 | Medellín,Gamma−,6.249858,-75.577845,14400 200 | Milwaukee,Gamma−,43.04401,-88.014503,24800 201 | Minsk,Gamma−,53.890378,27.57764,12500 202 | Nantes,Gamma−,47.217697,-1.566498,13600 203 | Nashville,Gamma−,36.188744,-86.638442,35800 204 | Orlando,Gamma−,28.510731,-81.380408,38600 205 | Ottawa,Gamma−,45.386109,-75.721147,25700 206 | Penang,Gamma−,5.422595,100.508378,36100 207 | Phnom Penh,Gamma−,11.550778,104.8888,13200 208 | Poznań,Gamma−,52.410371,16.929402,13000 209 | Sacramento,Gamma−,38.62148,-121.310583,29900 210 | San Antonio,Gamma−,29.429281,-98.484132,29200 211 | Tirana,Gamma−,41.338097,19.777162,10100 212 | Wrocław,Gamma−,51.117738,17.037314,8700 213 | Yangon,Gamma−,16.901835,96.1555,16600 -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/elements_processing.py: -------------------------------------------------------------------------------- 1 | import json 2 | import hashlib 3 | 4 | 5 | def generate_complex_id(obj_dict): 6 | return "{}_{}".format(obj_dict["id"], obj_dict["version"]) 7 | 8 | 9 | def get_uniformly_shard_index_from_id(id, num_shards): 10 | return int(hashlib.md5(str(id).encode("utf-8")).hexdigest(), 16) % num_shards 11 | 12 | 13 | class IdManager(object): 14 | 15 | def __init__(self): 16 | self.relation_id_map = {} 17 | self.way_id_map = {} 18 | self.node_id_map = {} 19 | 20 | self.id_counter = 1 21 | 22 | def replace_ids_in_way_and_its_dependencies(self, way_dict, way_nodes_dicts): 23 | self.replace_ids_in_obj_list([way_dict], self.way_id_map) 24 | local_id_map = self.replace_ids_in_obj_list(way_nodes_dicts, self.node_id_map) 25 | way_dict["nodes"] = [local_id_map[node_id] if node_id in local_id_map else node_id 26 | for node_id in way_dict["nodes"]] 27 | 28 | def replace_ids_in_relation_and_its_dependencies(self, relation_dict, relation_nodes_dicts, 29 | relation_ways_dicts, relation_relations_dicts): 30 | self.replace_ids_in_obj_list([relation_dict], self.relation_id_map) 31 | nodes_local_id_map = self.replace_ids_in_obj_list(relation_nodes_dicts, self.node_id_map) 32 | ways_local_id_map = self.replace_ids_in_obj_list(relation_ways_dicts, self.way_id_map) 33 | relations_local_id_map = self.replace_ids_in_obj_list(relation_relations_dicts, self.relation_id_map) 34 | 35 | for relation_way_dict in relation_ways_dicts: 36 | self.replace_ids_in_way_nodes(relation_way_dict, nodes_local_id_map) 37 | for relation_relation_dict in relation_relations_dicts: 38 | self.replace_ids_in_relation_members(relation_relation_dict, nodes_local_id_map, 39 | ways_local_id_map, relations_local_id_map) 40 | self.replace_ids_in_relation_members(relation_dict, nodes_local_id_map, 41 | ways_local_id_map, relations_local_id_map) 42 | 43 | def 
replace_ids_in_way_nodes(self, way_dict, id_map): 44 | way_dict["nodes"] = [id_map[node_id] if node_id in id_map else node_id for node_id in way_dict["nodes"]] 45 | 46 | def replace_ids_in_relation_members(self, relation_dict, nodes_id_map, ways_id_map, relations_id_map): 47 | for index in range(len(relation_dict["members"])): 48 | member = relation_dict["members"][index] 49 | member_type, member_id, member_role = member 50 | if member_type == "n": 51 | relation_dict["members"][index] = (member_type, 52 | nodes_id_map[member_id] if member_id in nodes_id_map else member_id, member_role) 53 | elif member_type == "w": 54 | relation_dict["members"][index] = (member_type, 55 | ways_id_map[member_id] if member_id in ways_id_map else member_id, member_role) 56 | elif member_type == "r": 57 | relation_dict["members"][index] = (member_type, 58 | relations_id_map[member_id] if member_id in relations_id_map else member_id, member_role) 59 | 60 | def replace_ids_in_obj_list(self, osm_obj_dicts, id_map): 61 | local_id_map = {} 62 | for osm_obj_dict in osm_obj_dicts: 63 | osm_obj_complex_id = generate_complex_id(osm_obj_dict) 64 | if osm_obj_complex_id in id_map: 65 | local_id_map[osm_obj_dict["id"]] = id_map[osm_obj_complex_id] 66 | osm_obj_dict["id"] = id_map[osm_obj_complex_id] 67 | else: 68 | id_map[osm_obj_complex_id] = self.id_counter 69 | local_id_map[osm_obj_dict["id"]] = self.id_counter 70 | osm_obj_dict["id"] = self.id_counter 71 | self.id_counter = self.id_counter + 1 72 | return local_id_map 73 | 74 | def get_simplified_id_and_original_id_maps(self): 75 | result_relations_ids_map = {simple_id: int(complex_id.split("_")[0]) 76 | for complex_id, simple_id in self.relation_id_map.items()} 77 | result_ways_ids_map = {simple_id: int(complex_id.split("_")[0]) 78 | for complex_id, simple_id in self.way_id_map.items()} 79 | result_nodes_ids_map = {simple_id: int(complex_id.split("_")[0]) 80 | for complex_id, simple_id in self.node_id_map.items()} 81 | return result_nodes_ids_map, result_ways_ids_map, result_relations_ids_map 82 | 83 | def reset(self): 84 | self.relation_id_map.clear() 85 | self.way_id_map.clear() 86 | self.node_id_map.clear() 87 | self.id_counter = 1 88 | 89 | 90 | class BatchManager(object): 91 | 92 | def __init__(self, gdal_batch_size, entities_number): 93 | self.entities_number = entities_number 94 | 95 | self.nodes_batch = {} 96 | self.ways_batch = {} 97 | self.all_relations_batch = {} 98 | self.main_relation_batch = {} 99 | self.ways_batch_counter = 0 100 | self.gdal_batch_size = gdal_batch_size 101 | 102 | self.id_manager = IdManager() 103 | 104 | def add_osm_dicts_to_batches(self, node_dicts_list=(), way_dicts_list=(), relation_dicts_list=(),  # immutable defaults avoid the shared mutable default argument pitfall 105 | main_relation_dict=None): 106 | for node_dict in node_dicts_list: 107 | self.nodes_batch[node_dict["id"]] = node_dict 108 | for way_dict in way_dicts_list: 109 | self.ways_batch[way_dict["id"]] = way_dict 110 | for relation_dict in relation_dicts_list: 111 | self.all_relations_batch[relation_dict["id"]] = relation_dict 112 | if main_relation_dict is not None: 113 | self.all_relations_batch[main_relation_dict["id"]] = main_relation_dict 114 | self.main_relation_batch[main_relation_dict["id"]] = main_relation_dict 115 | self.ways_batch_counter = self.ways_batch_counter + 1 116 | 117 | def sorted_obj_batch_values(self, obj_batch): 118 | return sorted(list(obj_batch.values()), key=lambda obj: obj["id"]) 119 | 120 | def get_batches_values_sorted_lists(self): 121 | return self.sorted_obj_batch_values(self.nodes_batch), \ 122 | self.sorted_obj_batch_values(self.ways_batch), \ 123 | self.sorted_obj_batch_values(self.all_relations_batch) 124 | 
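# Usage sketch (illustrative; ids and sizes are hypothetical): BatchManager
# delegates id simplification to IdManager, which maps each versioned element
# "<id>_<version>" to a small sequential id so GDAL can work on compact .osm
# batches, then inverts the mapping to restore the original OSM ids:
#
#   bm = BatchManager(gdal_batch_size=1000, entities_number=10)
#   way = {"id": 123456789, "version": 2, "nodes": []}
#   bm.replace_ids_in_way_and_its_dependencies(way, [])
#   assert way["id"] == 1  # first simplified id handed out
#   _, ways_map, _ = bm.get_simplified_id_and_original_id_maps()
#   assert ways_map[1] == 123456789  # original id recovered from "123456789_2"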
125 | def get_ways_simplified_ids(self): 126 | return list(self.ways_batch.keys()) 127 | 128 | def get_main_relations_simplified_ids(self): 129 | return list(self.main_relation_batch.keys()) 130 | 131 | def get_simplified_id_and_original_id_maps(self): 132 | return self.id_manager.get_simplified_id_and_original_id_maps() 133 | 134 | def replace_ids_in_way_and_its_dependencies(self, way_dict, way_nodes_dicts): 135 | self.id_manager.replace_ids_in_way_and_its_dependencies(way_dict, way_nodes_dicts) 136 | 137 | def replace_ids_in_relation_and_its_dependencies(self, relation_dict, relation_nodes_dicts, 138 | relation_ways_dicts, relation_relations_dicts): 139 | self.id_manager.replace_ids_in_relation_and_its_dependencies(relation_dict, relation_nodes_dicts, 140 | relation_ways_dicts, relation_relations_dicts) 141 | 142 | def restore_ways_ids_and_add_geometry(self, id_geometry_map, result_func): 143 | result_nodes_ids_map, result_ways_ids_map, _ = self.id_manager.get_simplified_id_and_original_id_maps() 144 | 145 | for way_dict_id, way_dict in self.ways_batch.items(): 146 | if way_dict["id"] in id_geometry_map: 147 | way_dict["geometry"] = json.dumps(id_geometry_map[way_dict["id"]]) 148 | way_dict["id"] = result_ways_ids_map[way_dict["id"]] 149 | way_dict["nodes"] = [result_nodes_ids_map[node_id] if node_id in result_nodes_ids_map else node_id for 150 | node_id in way_dict["nodes"]] 151 | result_func(way_dict) 152 | 153 | def restore_relations_ids_and_add_geometry(self, id_geometry_map, result_func): 154 | result_nodes_ids_map, result_ways_ids_map, result_relations_ids_map = \ 155 | self.id_manager.get_simplified_id_and_original_id_maps() 156 | 157 | for relation_dict_id, relation_dict in self.main_relation_batch.items(): 158 | if relation_dict["id"] in id_geometry_map: 159 | relation_dict["geometry"] = json.dumps(id_geometry_map[relation_dict["id"]]) 160 | relation_dict["id"] = result_relations_ids_map[relation_dict["id"]] 161 | self.id_manager.replace_ids_in_relation_members(relation_dict, result_nodes_ids_map, 162 | result_ways_ids_map, result_relations_ids_map) 163 | result_func(relation_dict) 164 | 165 | def generate_batch_osm_file_name(self, work_dir, current_entity_type, current_index, pool_size): 166 | batch_end = current_index 167 | batch_start = batch_end - (self.get_batch_limit_for_current_entity(current_entity_type)*pool_size) 168 | return work_dir + '{}_{}_{}.osm'.format(current_entity_type, batch_start, batch_end) 169 | 170 | def is_full(self, entity_type): 171 | return self.ways_batch_counter >= self.get_batch_limit_for_current_entity(entity_type) 172 | 173 | def get_batch_limit_for_current_entity(self, entity_type): 174 | return self.gdal_batch_size if entity_type != "relations" else self.gdal_batch_size // 2  # floor division keeps limits and batch file names integral 175 | 176 | def reset(self): 177 | self.ways_batch_counter = 0 178 | self.nodes_batch.clear() 179 | self.ways_batch.clear() 180 | self.all_relations_batch.clear() 181 | self.main_relation_batch.clear() 182 | self.id_manager.reset() 183 | -------------------------------------------------------------------------------- /tasks_docker_images/osm_converter_with_history_index/src/cache_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import datetime 4 | import time 5 | 6 | import gcs_service 7 | import file_service 8 | 9 | OSM_ENTITIES = ["nodes", "ways", "relations"] 10 | 11 | 12 | 
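# This module largely duplicates dags/utils/metadata_manager.py, presumably so the
# converter image can read and write the same *.metadata.txt files without depending
# on the Airflow utils package (an inference from the two files, not documented).
# Freshness-check sketch with hypothetical values:
#
#   last_updated = int(time.time()) - 2 * 24 * 3600          # written two days ago
#   is_file_fresh(last_updated, data_freshness_exp_days=7)   # -> True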
def create_processing_counter(): 13 | return {key: 0 for key in OSM_ENTITIES} 14 | 15 | 16 | def is_file_fresh(last_updated_timestamp, data_freshness_exp_days): 17 | db_freshness = datetime.timedelta(seconds=int(time.time()) - last_updated_timestamp) 18 | logging.info("Freshness: {}".format(db_freshness)) 19 | return db_freshness < datetime.timedelta(days=data_freshness_exp_days) 20 | 21 | 22 | def download_db_if_exists(dbs_file_paths, 23 | dest_bucket, 24 | dest_dir_name): 25 | db_gcs_and_local_paths = [] 26 | for db_file_path in dbs_file_paths: 27 | db_name = file_service.file_name_from_path(db_file_path) 28 | db_blob_name = dest_dir_name + db_name 29 | 30 | if not gcs_service.is_gcs_blob_exists(dest_bucket, db_blob_name): 31 | return False 32 | else: 33 | db_gcs_and_local_paths.append((db_blob_name, db_file_path)) 34 | 35 | for db_blob_name, db_file_path in db_gcs_and_local_paths: 36 | gcs_service.from_gcs_to_local_file(dest_bucket, db_blob_name, db_file_path) 37 | return True 38 | 39 | 40 | def get_index_metadata_file_path(src_osm_name, num_db_shards): 41 | return file_service.file_name_without_ext(src_osm_name) + "_{}_index_shards.metadata.txt".format(num_db_shards) 42 | 43 | 44 | def get_result_shard_metadata_file_path(src_osm_name, entity_type, index, num_results_shards): 45 | return file_service.file_name_without_ext(src_osm_name) + "_{}_{}_{}.metadata.txt".format(entity_type, index + 1, 46 | num_results_shards) 47 | 48 | 49 | def download_and_read_metadata_file(gcs_bucket, gcs_dir_name, src_osm_name, num_db_shards, num_results_shards): 50 | src_osm_file_name = file_service.file_name_from_path(src_osm_name) 51 | 52 | index_metadata_file_path = get_index_metadata_file_path(src_osm_file_name, num_db_shards) 53 | index_metadata_blob_name = gcs_dir_name + index_metadata_file_path 54 | if gcs_service.is_gcs_blob_exists(gcs_bucket, index_metadata_blob_name): 55 | gcs_service.from_gcs_to_local_file(gcs_bucket, index_metadata_blob_name, index_metadata_file_path) 56 | 57 | shards_metadata_files = {} 58 | for entity in OSM_ENTITIES: 59 | shards_metadata_files_by_entity = {} 60 | for index in range(num_results_shards): 61 | result_shard_metadata_file_path = get_result_shard_metadata_file_path(src_osm_file_name, entity, index, 62 | num_results_shards) 63 | result_shard_metadata_blob_name = gcs_dir_name + result_shard_metadata_file_path 64 | if gcs_service.is_gcs_blob_exists(gcs_bucket, result_shard_metadata_blob_name): 65 | gcs_service.from_gcs_to_local_file(gcs_bucket, result_shard_metadata_blob_name, 66 | result_shard_metadata_file_path) 67 | shards_metadata_files_by_entity[str(index)] = result_shard_metadata_file_path 68 | shards_metadata_files[entity] = shards_metadata_files_by_entity 69 | return ProcessingMetadata(index_metadata_file_path, shards_metadata_files) 70 | 71 | 72 | def save_and_upload_metadata_to_gcs(metadata, 73 | dest_bucket, 74 | dest_dir_name, 75 | save_only_shard_by_entity_and_index=None, 76 | only_db_metadata=False): 77 | files_to_save = metadata.save_to_json_files(save_only_shard_by_entity_and_index, only_db_metadata) 78 | 79 | for file_to_save in files_to_save: 80 | timestamps_file_name = file_service.file_name_from_path(file_to_save) 81 | timestamps_file_blob_name = dest_dir_name + timestamps_file_name 82 | gcs_service.upload_file_to_gcs(file_to_save, dest_bucket, timestamps_file_blob_name) 83 | 84 | 85 | class ProcessingMetadata(object): 86 | 87 | def __init__(self, index_metadata_file_path, shards_metadata_files): 88 | self.index_metadata_file_path = 
index_metadata_file_path 89 | self.shards_metadata_files = shards_metadata_files 90 | try: 91 | with open(index_metadata_file_path, "r") as f: 92 | metadata_json = json.load(f) 93 | self.elements_counter = MetadataCounter(metadata_json["elements_counter"]) 94 | self.index_db_timestamps = FileTimestamps(metadata_json["index_db"]) 95 | except Exception as e: 96 | logging.info(str(e)) 97 | self.elements_counter = MetadataCounter() 98 | self.index_db_timestamps = FileTimestamps() 99 | 100 | self.shards_timestamps = {} 101 | for entity, shards_metadata_files_by_entity in shards_metadata_files.items(): 102 | shards_timestamps_by_entity = {} 103 | for index_str, shards_metadata_file in shards_metadata_files_by_entity.items(): 104 | try: 105 | with open(shards_metadata_file, "r") as f: 106 | metadata_json = json.load(f) 107 | shards_timestamps_by_entity[index_str] = FileTimestamps(metadata_json) 108 | except Exception as e: 109 | logging.info(str(e)) 110 | shards_timestamps_by_entity[index_str] = FileTimestamps() 111 | self.shards_timestamps[entity] = shards_timestamps_by_entity 112 | 113 | def update_db_max_timestamp(self, db_max_timestamp): 114 | self.index_db_timestamps.update_max_timestamp(db_max_timestamp) 115 | 116 | def update_db_last_updated(self, db_last_updated): 117 | self.index_db_timestamps.update_last_updated(db_last_updated) 118 | 119 | def update_processing_counter(self, counter_dict): 120 | self.elements_counter.update(counter_dict) 121 | 122 | def get_min_history_results_last_updated_timestamp(self): 123 | return min([min([shard_timestamps.last_updated for shard_index_str, shard_timestamps in 124 | shards_timestamps_by_entity.items()]) for entity, shards_timestamps_by_entity in 125 | self.shards_timestamps.items()]) 126 | 127 | def get_history_results_max_timestamps(self): 128 | last_elements_timestamps = {} 129 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 130 | last_elements_timestamps[entity] = {shard_index_str: shard_timestamps.max_timestamp for 131 | shard_index_str, shard_timestamps in 132 | shards_timestamps_by_entity.items()} 133 | return last_elements_timestamps 134 | 135 | def update_history_result_timestamps(self, entity_type, shard_index): 136 | self.shards_timestamps[entity_type][str(shard_index)].update_max_timestamp( 137 | self.index_db_timestamps.max_timestamp) 138 | self.shards_timestamps[entity_type][str(shard_index)].update_last_updated( 139 | int(time.time())) 140 | 141 | def save_to_json_files(self, specific_history_results_shards_to_save=None, only_db_metadata=False): 142 | files_to_save = [] 143 | if not specific_history_results_shards_to_save: 144 | self.save_db_metadata(files_to_save) 145 | if not only_db_metadata: 146 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 147 | for shard_index_str, shard_timestamps in shards_timestamps_by_entity.items(): 148 | if not specific_history_results_shards_to_save or ( 149 | entity == specific_history_results_shards_to_save[0] and int(shard_index_str) == 150 | int(specific_history_results_shards_to_save[1])): 151 | shard_file = self.shards_metadata_files[entity][shard_index_str] 152 | with open(shard_file, "w") as f: 153 | json.dump(shard_timestamps.to_dict(), f) 154 | files_to_save.append(shard_file) 155 | return files_to_save 156 | 157 | def save_db_metadata(self, files_to_save): 158 | with open(self.index_metadata_file_path, "w") as f: 159 | json.dump({"elements_counter": self.elements_counter.to_dict(), 160 | "index_db": 
self.index_db_timestamps.to_dict()}, f) 161 | files_to_save.append(self.index_metadata_file_path) 162 | return files_to_save 163 | 164 | def to_dict(self): 165 | history_results = {} 166 | for entity, shards_timestamps_by_entity in self.shards_timestamps.items(): 167 | history_results[entity] = {shard_index_str: shard_timestamps.to_dict() for shard_index_str, shard_timestamps 168 | in 169 | shards_timestamps_by_entity.items()} 170 | return {"elements_counter": self.elements_counter.to_dict(), 171 | "index_db": self.index_db_timestamps.to_dict(), 172 | "history_results": history_results} 173 | 174 | 175 | class MetadataCounter(object): 176 | 177 | def __init__(self, counter_dict=None): 178 | if counter_dict: 179 | self.counter = {entity: counter_dict[entity] for entity in OSM_ENTITIES} 180 | else: 181 | self.counter = {entity: 0 for entity in OSM_ENTITIES} 182 | 183 | def update(self, counter): 184 | self.counter = counter 185 | 186 | def to_dict(self): 187 | return self.counter 188 | 189 | 190 | class FileTimestamps(object): 191 | 192 | def __init__(self, timestamps_dict=None): 193 | if timestamps_dict: 194 | self.max_timestamp = timestamps_dict["max_timestamp"] 195 | self.last_updated = timestamps_dict["last_updated"] 196 | else: 197 | self.max_timestamp = 0 198 | self.last_updated = 0 199 | 200 | def update_max_timestamp(self, max_timestamp): 201 | self.max_timestamp = max_timestamp 202 | 203 | def update_last_updated(self, last_updated): 204 | self.last_updated = last_updated 205 | 206 | def to_dict(self): 207 | return {"max_timestamp": self.max_timestamp, "last_updated": self.last_updated} 208 | --------------------------------------------------------------------------------