├── .gitignore ├── LICENSE ├── README.md ├── doc └── changes │ ├── changelog.md │ └── changes_1.0.0.md ├── error_code_config.yml ├── examples └── tensorflow-with-gpu-preview │ ├── .gitignore │ ├── EXAConf │ ├── fetch_output_redirect_from_last_statement.sh │ ├── gcloud-create-instance.sh │ ├── gcloud-setup.sh │ ├── start_output_redirect_server.sh │ ├── system-status.sh │ ├── tensorflow-gpu-preview.ipynb │ └── tensorflow_udf │ ├── .gitignore │ ├── __init__.py │ ├── column_encoder.py │ ├── dataset_utils.py │ ├── identity_feature_column.py │ ├── keras_layer.py │ ├── requirements.txt │ ├── tensorflow_config.yaml │ ├── tensorflow_udf.py │ └── utils.py └── tutorials ├── README.md ├── machine-learning ├── README.md ├── python │ ├── AzureML │ │ ├── ConnectAzureMLtoExasol.ipynb │ │ ├── Introduction.ipynb │ │ ├── InvokeModelFromExasolDBwithUDF.ipynb │ │ ├── TrainModelInAzureML.ipynb │ │ ├── img_src │ │ │ ├── access_key_azure.png │ │ │ ├── azureML_public_ip.png │ │ │ ├── cluster_creation.png │ │ │ ├── conda_file_artifact.png │ │ │ ├── connection_detail_generate.png │ │ │ ├── connection_details_acess_token.png │ │ │ ├── consume_endpoint.png │ │ │ ├── create_datastore.png │ │ │ ├── data_blobstore.png │ │ │ ├── download_all.png │ │ │ ├── download_file_arifact.png │ │ │ ├── file_path_bucketfs.png │ │ │ ├── get_data_link.png │ │ │ ├── get_data_link_2.png │ │ │ ├── manage_udf_files.png │ │ │ ├── registered_model.png │ │ │ └── resource_group.png │ │ ├── main.py │ │ └── score.py │ ├── README.md │ ├── sagemaker │ │ ├── ConnectSagemakerToExasol.ipynb │ │ ├── LoadExampleDataIntoExasol.ipynb │ │ ├── TrainSagemakerModelWithExasolData.ipynb │ │ └── UseSagemakerModelFromExasol.ipynb │ └── scikit-learn │ │ ├── README.md │ │ └── classification.ipynb └── sagemaker-extension │ ├── images │ ├── sme_deployment.png │ ├── sme_overview.png │ └── sme_training.png │ └── tutorial.md ├── script-languages ├── README.md ├── bash_runner.py ├── requirements.txt ├── script-languages.ipynb └── slc_main_build_steps.svg └── spatial-analysis ├── README.md └── visualizing_spatial_queries ├── README.md ├── geojsonfiles └── README.md └── visualizing_spatial_queries.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | venv/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Exasol 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ⚠ This project has been archived. Please check [Exasol's AI Lab](https://github.com/exasol/ai-lab) for data science examples. 2 | 3 | # Data Science with Exasol 4 | This repository contains a collection of examples and tutorials for Data Science and Machine Learning with Exasol. In these examples and tutorials, you will learn how to explore and prepare your data and how to build, train and deploy your model with and within Exasol. 5 | 6 | **Currently, this repository is under development; we will add more examples and tutorials in the future.** 7 | 8 | ## What's inside: 9 | 10 | * [Tutorials](tutorials): Tutorials show a complete workflow on a realistic use case and data. 11 | * [Examples](examples): Examples show only how to integrate a specific technology, not a whole data science workflow. 12 | 13 | ## Prerequisites: 14 | 15 | In general, you need: 16 | * Exasol, in particular with support for user-defined functions (UDFs). In most cases, version 6.0 or above with [Script Language Container](https://github.com/exasol/script-languages) support is required. We provide a [Community Edition](https://www.exasol.com/portal/display/DOC/EXASOL+Community+Edition+Quick+Start+Guide) and [Docker images](https://github.com/exasol/docker-db). 17 | * Many examples and tutorials are provided as [Jupyter](https://jupyter.org/) notebooks. We recommend installing a Jupyter server with access to the database and the BucketFS (documentation can be found in the [Exasol User Manual](https://www.exasol.com/portal/display/DOC/User+Manual+6.1.0), Section 3.6.4). 18 | * Furthermore, many examples heavily use [pyexasol](https://github.com/badoo/pyexasol) to communicate with the database. We recommend installing it on your Jupyter server. 19 | 20 | Specific prerequisites are stated in each tutorial. 21 | -------------------------------------------------------------------------------- /doc/changes/changelog.md: -------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | * [1.0.0](changes_1.0.0.md) -------------------------------------------------------------------------------- /doc/changes/changes_1.0.0.md: -------------------------------------------------------------------------------- 1 | # data-science-examples 1.0.0, released 2023-10-16 2 | 3 | First release of this collection of examples for integrating the Exasol Database with data science 4 | focused applications and packages. 
5 | 6 | ## Features / Enhancements 7 | 8 | * #6: Added Tensorflow GPU UDF preview 9 | * #8: Added SciKit-learn classification Example 10 | * #21: Added an example for connecting from AWS Sagemaker to an Exasol database 11 | * #23: Added an example for training a Sagemaker model with data from Exasol 12 | * #25: Added an example for using a Sagemaker model from within Exasol 13 | * #29: Add script-languages build and customization tutorial 14 | * #34: Added an example for loading example data into the Exasol database 15 | * #35: Added tutorial for Sagemaker-Extension 16 | * #43: Updated to Python3.8 minimal flavor in script-languages tutorial 17 | * #45: Added error_code_config 18 | * #39-#48: Added tutorial Series for Connection to AzureML 19 | 20 | ## Bugs 21 | 22 | * #53: Fixed error_code_config.yaml 23 | * #38: Fix typo 24 | 25 | -------------------------------------------------------------------------------- /error_code_config.yml: -------------------------------------------------------------------------------- 1 | error-tags: 2 | DSE: 3 | highest-index: 0 -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/.gitignore: -------------------------------------------------------------------------------- 1 | tfhub_modules 2 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/EXAConf: -------------------------------------------------------------------------------- 1 | [Global] 2 | Revision = 15 3 | Checksum = COMMIT 4 | ClusterName = cl4 5 | Platform = Docker 6 | LicenseFile = /exa/etc/license.xml 7 | CoredPort = 10001 8 | SSHPort = 22 9 | XMLRPCPort = 443 10 | # List of networks for this cluster: 'private' is mandatory, 'public' is optional. 11 | Networks = private 12 | # Comma-separated list of nameservers for this cluster. 13 | NameServers = 8.8.8.8 14 | Timezone = Europe/Berlin 15 | # Nr. of hugepages ('0' = disabled, 'host' = manually configured on the host, 'auto' = set automatically based on DB config) 16 | Hugepages = 0 17 | ConfVersion = 6.1.3 18 | OSVersion = 6.1.3 19 | REVersion = 6.1.3 20 | DBVersion = 6.1.3 21 | ImageVersion = 6.1.3-d1 22 | 23 | # SSL options 24 | [SSL] 25 | # The SSL certificate, private key and CA for all EXASOL services 26 | Cert = /path/to/ssl.crt 27 | CertKey = /path/to/ssl.key 28 | CertAuth = /path/to/ssl.ca 29 | 30 | # Docker related options 31 | [Docker] 32 | # The directory that contains all data related to this docker cluster 33 | # (except for mapped devices) 34 | RootDir = /exa/etc 35 | # The EXASOL docker image used for all containers of this cluster 36 | Image = exasol/docker-db:latest 37 | # The type of storage devices for this cluster: 'block' or 'file' 38 | DeviceType = file 39 | # Comma-separated list of volumes to be mounted in all containers (e. g. 
'/mnt/my_data:/exa/my_data:rw' ) 40 | # These user-defined volumes are mounted additionally to the internal ones (like the node root volume) 41 | AdditionalVolumes = 42 | 43 | [Groups] 44 | [[root]] 45 | ID = 0 46 | [[exausers]] 47 | ID = 500 48 | [[exadbadm]] 49 | ID = 1001 50 | [[exastoradm]] 51 | ID = 1002 52 | [[exabfsadm]] 53 | ID = 1003 54 | [[exaadm]] 55 | ID = 1004 56 | 57 | [Users] 58 | [[root]] 59 | ID = 0 60 | Group = root 61 | LoginEnabled = True 62 | AdditionalGroups = exausers, exadbadm, exastoradm, exabfsadm, exaadm 63 | [[exadefusr]] 64 | ID = 500 65 | Group = exausers 66 | LoginEnabled = False 67 | AdditionalGroups = exadbadm, exastoradm, exabfsadm, exaadm 68 | 69 | [Node : 11] 70 | PrivateNet = 172.17.0.2/16 71 | PublicNet = 72 | Name = n11 73 | UUID = ECD384A2153246AA9EFC9E88E5292806CE8451C2 74 | DockerVolume = n11 75 | # Ports to be exposed (container : host) 76 | ExposedPorts = 8888:8899, 6583:6594 77 | [[Disk : disk1]] 78 | Component = exastorage 79 | Devices = dev.1 80 | Mapping = dev.1:/exa/data/storage 81 | 82 | # Global EXAStorage options 83 | [EXAStorage] 84 | # Enable or disable background recovery / data restoration (does not affect on-demand recovery) 85 | BgRecEnabled = True 86 | # Max. throughput for background recovery / data restoration (in MiB/s) 87 | BgRecLimit = 88 | # Space usage threshold (in percent, per node) for sending a warning 89 | SpaceWarnThreshold = 90 90 | 91 | # An EXAStorage volume 92 | [EXAVolume : DataVolume1] 93 | # Type of volume: 'data' | 'archive' 94 | Type = data 95 | # Volume size (e. g. '1 TiB') 96 | Size = 90 GiB 97 | # Name of the disk to be used for this volume. 98 | # This disk must exist on all volume nodes. 99 | Disk = disk1 100 | # Comma-separated list of node IDs to be used for this volume (incl. redundancy nodes) 101 | Nodes = 11 102 | # OPTIONAL: Nr. of master nodes for this volume (default: use all nodes) 103 | NumMasterNodes = 1 104 | # Desired redundancy for this volume 105 | Redundancy = 1 106 | # Volume owner (user and group ID) 107 | Owner = 500 : 500 108 | Permissions = rwx 109 | BlockSize = 4 KiB 110 | StripeSize = 256 KiB 111 | # OPTIONAL: shared volumes can be opened (for writing) by multiple clients simultaneously 112 | Shared = True 113 | # OPTIONAL: I/O priority (0 = highest, 20 = lowest) 114 | Priority = 10 115 | 116 | # An EXASOL database 117 | [DB : DB1] 118 | # The EXASOL version to be used for this database 119 | Version = 6.1.3 120 | # Memory size over all nodes (e. g. '1 TiB') 121 | MemSize = 28 GiB 122 | Port = 8888 123 | Nodes = 11 124 | Owner = 500 : 500 125 | NumMasterNodes = 1 126 | DataVolume = DataVolume1 127 | # JDBC driver configuration 128 | [[JDBC]] 129 | # BucketFS that contains the JDBC driver 130 | BucketFS = bfsdefault 131 | # Bucket that contains the JDBC driver 132 | Bucket = default 133 | # Directory within the bucket that contains the drivers 134 | Dir = drivers/jdbc 135 | # Oracle driver configuration 136 | [[Oracle]] 137 | # BucketFS that contains the JDBC drivers 138 | BucketFS = bfsdefault 139 | # Bucket that contains the JDBC drivers 140 | Bucket = default 141 | # Directory within the bucket that contains the drivers 142 | Dir = drivers/oracle 143 | 144 | # Global BucketFS options 145 | [BucketFS] 146 | # User and group ID of the BucketFS process. 
147 | ServiceOwner = 500 : 500 148 | 149 | # A Bucket filesystem 150 | [BucketFS : bfsdefault] 151 | # HTTP port number (0 = disabled) 152 | HttpPort = 6583 153 | # HTTPS port number (0 = disabled) 154 | HttpsPort = 0 155 | SyncKey = aW5oUzFMdGpUanNyUTdBMXR5ZGlSekdDSXdqNjFiUGQ= 156 | SyncPeriod = 30000 157 | 158 | # The default bucket (auto-generated) 159 | [[Bucket : default]] 160 | ReadPasswd = cmVhZAo= 161 | WritePasswd = d3JpdGU= 162 | Public = True 163 | AdditionalFiles = EXAClusterOS:/usr/opt/EXASuite-6/EXAClusterOS-6.1.3/var/clients/packages/ScriptLanguages-*, EXASolution-6.1.3:/usr/opt/EXASuite-6/EXASolution-6.1.3/bin/udf/* 164 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/fetch_output_redirect_from_last_statement.sh: -------------------------------------------------------------------------------- 1 | gcloud compute ssh $* -- "tac udf.log | grep 'NEW STATEMENT' -B10000 -m1 | tac" 2 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/gcloud-create-instance.sh: -------------------------------------------------------------------------------- 1 | NAME=$1 2 | shift 3 | gcloud compute instances create $NAME \ 4 | --custom-memory=30GB \ 5 | --custom-cpu=8 \ 6 | --boot-disk-auto-delete \ 7 | --boot-disk-size=200GB \ 8 | --image=projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20190514 \ 9 | --boot-disk-type=pd-standard \ 10 | --maintenance-policy=TERMINATE \ 11 | --scopes=bigquery,storage-ro,storage-rw \ 12 | --metadata=startup-script-url=https://raw.githubusercontent.com/exasol/data-science-examples/master/examples/tensorflow-with-gpu-preview/gcloud-setup.sh $* 13 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/gcloud-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x -e -o pipefail -u 4 | { 5 | ##### Install Nvidia Driver ##### 6 | sudo echo "Install Nvidia Driver" >> /setup.log 7 | 8 | curl -o NVIDIA-Linux-x86_64-410.104.run http://de.download.nvidia.com/tesla/410.104/NVIDIA-Linux-x86_64-410.104.run 9 | chmod +x NVIDIA-Linux-x86_64-410.104.run 10 | sudo apt-get update 11 | sudo DEBIAN_FRONTEND=noninteractive \ 12 | apt-get install -yq --no-install-recommends \ 13 | cpp=4:7.3.0-3ubuntu2 \ 14 | cpp-7=7.3.0-16ubuntu3 \ 15 | g++=4:7.3.0-3ubuntu2 \ 16 | g++-7=7.3.0-16ubuntu3 \ 17 | gcc=4:7.3.0-3ubuntu2 \ 18 | gcc-7=7.3.0-16ubuntu3 \ 19 | gcc-7-base=7.3.0-16ubuntu3 \ 20 | libasan4=7.3.0-16ubuntu3 \ 21 | libcilkrts5=7.3.0-16ubuntu3 \ 22 | libgcc-7-dev=7.3.0-16ubuntu3 \ 23 | libstdc++-7-dev=7.3.0-16ubuntu3 \ 24 | libubsan0=7.3.0-16ubuntu3 25 | sudo apt-mark hold cpp cpp-7 g++ g++-7 gcc gcc-7 gcc-7-base libasan4 \ 26 | libcilkrts5 libgcc-7-dev libstdc++-7-dev libubsan0 27 | sudo dpkg --add-architecture i386 28 | sudo apt-get update 29 | sudo DEBIAN_FRONTEND=noninteractive \ 30 | apt-get install -yq --no-install-recommends \ 31 | apt-utils \ 32 | build-essential \ 33 | ca-certificates \ 34 | curl \ 35 | kmod \ 36 | libc6:i386 \ 37 | libelf-dev 38 | sudo curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey 39 | sudo curl -fsSL -o /usr/local/bin/extract-vmlinux https://raw.githubusercontent.com/torvalds/linux/master/scripts/extract-vmlinux 40 | sudo chmod +x /usr/local/bin/donkey /usr/local/bin/extract-vmlinux 41 | ./NVIDIA-Linux-x86_64-410.104.run --silent 42 
| sudo curl https://raw.githubusercontent.com/NVIDIA/nvidia-persistenced/master/init/systemd/nvidia-persistenced.service.template | sed 's/__USER__/root/' > /etc/systemd/system/nvidia-persistenced.service 43 | sudo systemctl enable nvidia-persistenced 44 | sudo systemctl start nvidia-persistenced 45 | 46 | #### Install Docker ##### 47 | sudo echo "Install Docker" >> /setup.log 48 | 49 | sudo DEBIAN_FRONTEND=noninteractive \ 50 | apt-get install -yq --no-install-recommends \ 51 | apt-transport-https \ 52 | ca-certificates \ 53 | curl \ 54 | gnupg-agent \ 55 | software-properties-common 56 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 57 | sudo apt-key fingerprint 0EBFCD88 58 | sudo add-apt-repository \ 59 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 60 | sudo apt-get update 61 | sudo DEBIAN_FRONTEND=noninteractive \ 62 | apt-get install -yq --no-install-recommends docker-ce docker-ce-cli containerd.io 63 | sudo docker run hello-world 64 | 65 | #### Nvidia Docker ###### 66 | sudo echo "Install Nvidia Docker" >> /setup.log 67 | 68 | # Add the package repositories 69 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 70 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) 71 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \ 72 | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 73 | sudo apt-get update 74 | # Install nvidia-docker2 and reload the Docker daemon configuration 75 | sudo DEBIAN_FRONTEND=noninteractive \ 76 | apt-get install -yq --no-install-recommends nvidia-docker2 77 | sudo pkill -SIGHUP dockerd 78 | # Test nvidia-smi with the latest official CUDA image 79 | sudo docker run --runtime=nvidia --rm nvidia/cuda:9.0-base nvidia-smi 80 | 81 | ##### Install Exasol ##### 82 | sudo echo "Install Exasol" >> /setup.log 83 | 84 | wget https://raw.githubusercontent.com/tkilias/data-science-examples/tensorflow-gpu-preview/examples/tensorflow-with-gpu-preview/EXAConf 85 | sudo mkdir -p /exa/{etc,data/storage} 86 | sudo cp EXAConf /exa/etc/EXAConf 87 | SIZE="$((100*1073741824))" 88 | sudo dd if=/dev/zero of=/exa/data/storage/dev.1 bs=1 count=1 seek=$SIZE 89 | sudo chmod +rw /exa 90 | sudo nvidia-docker run --name exasoldb -p 8888:8888 -p 6583:6583 -v /exa:/exa --detach --privileged --stop-timeout 120 --restart always exasol/docker-db:6.1.3-d1 91 | 92 | ##### Install Python ##### 93 | sudo echo "Install Python" >> /setup.log 94 | 95 | sudo DEBIAN_FRONTEND=noninteractive \ 96 | apt-get install -yq python3-pip 97 | sudo pip3 install pyexasol tensorboard tensorflow 98 | #### Download scripts #### 99 | sudo echo "Download scripts" >> /setup.log 100 | 101 | wget https://raw.githubusercontent.com/tkilias/data-science-examples/tensorflow-gpu-preview/examples/tensorflow-with-gpu-preview/system-status.sh 102 | 103 | #### Finish Setup ##### 104 | sudo echo "Wait for Exasol" >> /setup.log 105 | 106 | sleep 180 # Wait for database to startup 107 | sudo bash -x /system-status.sh &> status.log 108 | sudo cp status.log / 109 | 110 | sudo echo "Finished" >> /setup.log 111 | } &> /tmp/setup_script.log 112 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/start_output_redirect_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | old_pid=$(ps --no-headers -exo "uname:1,pid:1,args:1" | grep "[t]mux new -d python3 -m 
pyexasol_utils.script_output" | cut -f 2 -d " ") 5 | if [ -z "$old_pid" ] 6 | then 7 | tmux new -d "python3 -m pyexasol_utils.script_output --port 9999 &> udf.log" 8 | fi 9 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/system-status.sh: -------------------------------------------------------------------------------- 1 | sudo nvidia-smi 2 | sudo nvidia-docker logs exasoldb 3 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .idea -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/examples/tensorflow-with-gpu-preview/tensorflow_udf/__init__.py -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/column_encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Tuple, List 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.python.keras import metrics 7 | from tensorflow.python.keras import Input 8 | from tensorflow.python.keras.layers import Dense 9 | 10 | from identity_feature_column import identity_column 11 | from keras_layer import TFHubTextLayer 12 | 13 | 14 | class ColumnEncoder: 15 | 16 | def create_categorical_column_with_hash_bucket(self, column, column_config): 17 | hash_bucket_size = column_config["hash_bucket_size"] 18 | embedding_dimensions = column_config["embedding_dimensions"] 19 | feature_column = tf.feature_column.categorical_column_with_hash_bucket( 20 | key=column.name, hash_bucket_size=hash_bucket_size, 21 | dtype=tf.dtypes.as_dtype(np.dtype(column.type)) 22 | ) 23 | return hash_bucket_size, embedding_dimensions, feature_column 24 | 25 | def min_max_scaling(self, x, min_value, max_value): 26 | return (x - min_value) / (max_value - min_value) 27 | 28 | def get_numeric_column(self, column, column_config: Dict): 29 | min_value = column_config["min_value"] 30 | max_value = column_config["max_value"] 31 | feature_column = tf.feature_column.numeric_column( 32 | key=column.name, 33 | normalizer_fn=lambda x: 34 | self.min_max_scaling(x, min_value, max_value)) 35 | return feature_column 36 | 37 | def generate_string_inputs(self, column, column_config: Dict): 38 | os.environ["TFHUB_DOWNLOAD_PROGRESS"] = "1" 39 | keras_input = Input(name=column.name, shape=[1], dtype=tf.string) 40 | hub_layer = TFHubTextLayer("default", column_config["module_url"], trainable=True)(keras_input) 41 | feature_column = identity_column(column.name) 42 | return feature_column, keras_input, hub_layer 43 | 44 | def generate_categorical_input(self, column, column_config: Dict): 45 | hash_bucket_size, embedding_dimensions, feature_column = \ 46 | self.create_categorical_column_with_hash_bucket(column, column_config) 47 | embedding_feature_column = \ 48 | tf.feature_column.embedding_column( 49 | feature_column, dimension=embedding_dimensions) 50 | keras_input = Input(name=column.name, shape=[embedding_dimensions]) 51 | return embedding_feature_column, keras_input, keras_input 52 | 53 | def 
generate_numeric_input(self, column, column_config: Dict): 54 | feature_column = self.get_numeric_column(column, column_config) 55 | keras_input = Input(name=column.name, shape=[1]) 56 | return feature_column, keras_input, keras_input 57 | 58 | def generate_categorical_output(self, column, net, column_config: Dict): 59 | hash_bucket_size, embedding_dimensions, feature_column = \ 60 | self.create_categorical_column_with_hash_bucket(column, column_config) 61 | indicator_feature_column = tf.feature_column.indicator_column(feature_column) 62 | keras_output = Dense(hash_bucket_size, activation='relu', name="output_" + column.name)(net) 63 | loss = ("output_%s" % column.name, 'categorical_crossentropy', 1) 64 | output_metrics = ("output_%s" % column.name, "categorical_accuracy") 65 | return indicator_feature_column, keras_output, loss, output_metrics 66 | 67 | def generate_numeric_output(self, column, net, column_config: Dict): 68 | feature_column = self.get_numeric_column(column, column_config) 69 | keras_output = Dense(1, name="output_" + column.name)(net) 70 | loss = ("output_%s" % column.name, 'mean_squared_error', 1) 71 | output_metrics = ("output_%s" % column.name, 'mae') 72 | return feature_column, keras_output, loss, output_metrics 73 | 74 | def generate_input_feature_columns(self, input_columns, config: Dict): 75 | for column in input_columns: 76 | if column.name in config: 77 | column_config = config[column.name] 78 | if column_config["type"] == "categorical" and \ 79 | (column.type == int or column.type == str): 80 | yield self.generate_categorical_input(column, column_config) 81 | elif column_config["type"] == "float" and column.type == float: 82 | yield self.generate_numeric_input(column, column_config) 83 | elif column_config["type"] == "string" and column.type == str: 84 | yield self.generate_string_inputs(column, column_config) 85 | else: 86 | raise Exception(f"Unsupported Type for column {column.name}") 87 | 88 | def generate_output_feature_columns(self, output_columns, net: tf.keras.Model, config: Dict): 89 | for column in output_columns: 90 | if column.name in config: 91 | column_config = config[column.name] 92 | if column_config["type"] == "categorical" and \ 93 | (column.type == int or column.type == int): 94 | yield self.generate_categorical_output(column, net, column_config) 95 | elif column_config["type"] == "float" and column.type == float: 96 | yield self.generate_numeric_output(column, net, column_config) 97 | yield self.generate_numeric_output(column, net, column_config) 98 | else: 99 | raise Exception("Unsupported Type") 100 | 101 | def generate_inputs(self, input_columns, config: Dict): 102 | inputs = config["input"] 103 | input_columns = \ 104 | [column 105 | for column in input_columns 106 | if column.name in inputs] 107 | input_feature_columns = list(self.generate_input_feature_columns(input_columns, inputs)) 108 | input_columns, keras_inputs, preprocessed_keras_inputs = zip(*input_feature_columns) 109 | return input_columns, keras_inputs, preprocessed_keras_inputs 110 | 111 | def generate_outputs(self, input_columns, net, config: Dict) -> \ 112 | Tuple[List, List, Dict, Dict, Dict]: 113 | outputs = config["output"] 114 | output_columns = [column 115 | for column in input_columns 116 | if column.name in outputs] 117 | output_feature_columns = list(self.generate_output_feature_columns(output_columns, net, outputs)) 118 | output_columns, keras_outputs, losses, output_metrics = zip(*output_feature_columns) 119 | loss_weights = {name: weight for name, loss, weight in 
losses} 120 | losses = {name: loss for name, loss, weight in losses} 121 | output_metrics = {name: metrics for name, metrics in output_metrics} 122 | return output_columns, keras_outputs, losses, loss_weights, output_metrics 123 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/dataset_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.python.feature_column.feature_column import input_layer 4 | 5 | class DatasetUtils: 6 | 7 | def generator(self, ctx, epochs: int, batch_size: int, use_cache: bool): 8 | steps_per_epoch = ctx.size() // batch_size 9 | for epoch in range(epochs): 10 | for batch in range(steps_per_epoch): 11 | df = ctx.get_dataframe(num_rows=batch_size) 12 | if df is not None: 13 | to_dict = df.to_dict(orient="series") 14 | yield to_dict 15 | else: 16 | break 17 | if not use_cache: 18 | ctx.reset() 19 | 20 | def create_generator_dataset(self, ctx, epochs: int, batch_size: int, use_cache: bool, input_columns): 21 | ds = tf.data.Dataset.from_generator( 22 | lambda: self.generator(ctx, epochs, batch_size, use_cache), 23 | {column.name: np.dtype(column.type) for column in input_columns}, 24 | {column.name: tf.TensorShape([None]) for column in input_columns} 25 | ) 26 | return ds 27 | 28 | def add_feature_columns_to_dataset( 29 | self, dataset: tf.data.Dataset, input_columns, output_columns): 30 | dataset = dataset.map( 31 | lambda x: ( 32 | tuple(input_layer(x, column) for column in input_columns), 33 | tuple(input_layer(x, column) for column in output_columns) 34 | ), num_parallel_calls=4 35 | ).apply(tf.data.experimental.unbatch()) 36 | return dataset 37 | 38 | def create_dataset(self, dataset: tf.data.Dataset, 39 | input_columns, output_columns, 40 | batch_size: int, use_cache: bool): 41 | dataset = self.add_feature_columns_to_dataset(dataset, input_columns, output_columns) 42 | if use_cache: 43 | dataset = dataset.cache("cache").repeat() 44 | dataset = dataset.shuffle(1000, reshuffle_each_iteration=True) 45 | dataset = dataset.batch(batch_size, drop_remainder=True) 46 | return dataset 47 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/identity_feature_column.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python import tf_export, dtypes, collections, deprecation, tensor_shape 2 | from tensorflow.python.feature_column import feature_column as fc_old 3 | from tensorflow.python.feature_column.feature_column_v2 import _check_shape, _assert_key_is_string, DenseColumn, \ 4 | _FEATURE_COLUMN_DEPRECATION_DATE, _FEATURE_COLUMN_DEPRECATION, \ 5 | _check_config_keys 6 | from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib 7 | from tensorflow.python.ops import parsing_ops 8 | 9 | 10 | @tf_export('feature_column.identity_column') 11 | def identity_column(key, 12 | shape=(1,), 13 | dtype=dtypes.string, ): 14 | shape = _check_shape(shape, key) 15 | _assert_key_is_string(key) 16 | return IdentityColumn(key, shape=shape, dtype=dtype) 17 | 18 | 19 | class IdentityColumn( 20 | DenseColumn, 21 | fc_old._DenseColumn, 22 | collections.namedtuple( 23 | 'IdentityColumn', 24 | ('key', 'shape', 'dtype'))): 25 | """see `numeric_column`.""" 26 | 27 | @property 28 | def _is_v2_column(self): 29 | return True 30 | 31 | @property 32 | def name(self): 33 
| """See `FeatureColumn` base class.""" 34 | return self.key 35 | 36 | @property 37 | @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 38 | _FEATURE_COLUMN_DEPRECATION) 39 | def _parse_example_spec(self): 40 | return self.parse_example_spec 41 | 42 | @property 43 | def parse_example_spec(self): 44 | """See `FeatureColumn` base class.""" 45 | return { 46 | self.key: 47 | parsing_ops.FixedLenFeature(self.shape, self.dtype, 48 | self.default_value) 49 | } 50 | 51 | def _transform_input_tensor(self, input_tensor): 52 | if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 53 | raise ValueError( 54 | 'The corresponding Tensor of numerical column must be a Tensor. ' 55 | 'SparseTensor is not supported. key: {}'.format(self.key)) 56 | return input_tensor 57 | 58 | @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 59 | _FEATURE_COLUMN_DEPRECATION) 60 | def _transform_feature(self, inputs): 61 | input_tensor = inputs.get(self.key) 62 | return self._transform_input_tensor(input_tensor) 63 | 64 | def transform_feature(self, transformation_cache, state_manager): 65 | input_tensor = transformation_cache.get(self.key, state_manager) 66 | return self._transform_input_tensor(input_tensor) 67 | 68 | @property 69 | def variable_shape(self): 70 | """See `DenseColumn` base class.""" 71 | return tensor_shape.TensorShape(self.shape) 72 | 73 | @property 74 | @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 75 | _FEATURE_COLUMN_DEPRECATION) 76 | def _variable_shape(self): 77 | return self.variable_shape 78 | 79 | def get_dense_tensor(self, transformation_cache, state_manager): 80 | return transformation_cache.get(self, state_manager) 81 | 82 | @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 83 | _FEATURE_COLUMN_DEPRECATION) 84 | def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 85 | del weight_collections 86 | del trainable 87 | return inputs.get(self) 88 | 89 | @property 90 | def parents(self): 91 | """See 'FeatureColumn` base class.""" 92 | return [self.key] 93 | 94 | def _get_config(self): 95 | """See 'FeatureColumn` base class.""" 96 | config = dict(zip(self._fields, self)) 97 | config['dtype'] = self.dtype.name 98 | return config 99 | 100 | @classmethod 101 | def _from_config(cls, config, custom_objects=None, columns_by_name=None): 102 | """See 'FeatureColumn` base class.""" 103 | _check_config_keys(config, cls._fields) 104 | kwargs = config.copy() 105 | kwargs['dtype'] = dtypes.as_dtype(config['dtype']) 106 | return cls(**kwargs) 107 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/keras_layer.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | import tensorflow_hub as tfhub 3 | from tensorflow.python.keras.engine import InputSpec 4 | from tensorflow.python.layers.base import Layer 5 | 6 | 7 | class TFHubTextLayer(Layer): 8 | """ 9 | Layer that encapsulates the following: 10 | - Take full text level input 11 | - Return TFHub model's output according to provided input and output signature 12 | 13 | # Input Shape 14 | 1D string tensor with shape `(batch_size)` 15 | # Output Shape 16 | Determined by the output_key 17 | """ 18 | 19 | def __init__(self, output_key, module_uri, max_strlen=10000, **kwargs): 20 | self._name = "TFHubTextLayer" 21 | super(TFHubTextLayer, self).__init__(**kwargs) 22 | self.input_spec = InputSpec( 23 | ndim=2, dtype=tensorflow.string) 24 | 25 | self.output_key = output_key 
26 |         # TensorFlow Hub cannot handle unicode URIs, so coerce the URI to str 27 |         self.module_uri = str(module_uri) 28 |         self.max_strlen = max_strlen 29 | 30 |     def get_config(self): 31 |         config = { 32 |             'output_key': self.output_key, 33 |             'module_uri': self.module_uri, 34 |             'max_strlen': self.max_strlen, 35 |         } 36 |         base_config = super(TFHubTextLayer, self).get_config() 37 |         config.update(base_config) 38 |         return config 39 | 40 |     def build(self, input_shape): 41 |         self.embedder = tfhub.Module(self.module_uri, trainable=self.trainable) 42 |         self.embedder_spec = tfhub.load_module_spec(self.module_uri) 43 |         variables_ = [v for v in tensorflow.trainable_variables() if v in self.embedder.variables] 44 |         self.trainable_weights.extend(variables_) 45 |         self.weights.extend(variables_) 46 |         self.trainable_variables.extend(variables_) 47 |         super(TFHubTextLayer, self).build(input_shape) 48 | 49 |     def call(self, str_inp): 50 |         # let the TF-Hub module do the space 51 |         # tokenization for us 52 | 53 |         # a Keras Input cannot be defined with an ndim of 1, so squeeze the extra axis 54 |         str_inp_squeezed = tensorflow.squeeze(str_inp, axis=1) 55 | 56 |         # truncate each string to max_strlen to limit memory usage 57 |         str_inp_cutoff = tensorflow.strings.substr(str_inp_squeezed, 0, self.max_strlen) 58 | 59 |         return self.embedder(str_inp_cutoff, as_dict=True)[self.output_key] 60 | 61 |     def compute_output_shape(self, input_shape): 62 |         output_shape_spec = list(map(int, self.embedder_spec.get_output_info_dict()[self.output_key].get_shape()._dims)) 63 |         # change the first dimension to be whatever the batch size is 64 |         output_shape_spec[0] = input_shape[0] 65 |         return output_shape_spec -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/requirements.txt: -------------------------------------------------------------------------------- 1 | Keras 2 | Keras-Applications 3 | Keras-Preprocessing 4 | numpy 5 | pandas 6 | pyexasol 7 | python-dateutil 8 | pytz 9 | PyYAML 10 | scipy 11 | stopwatch.py 12 | tensorboard 13 | tensorflow 14 | tensorflow-hub 15 | requests -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/tensorflow_config.yaml: -------------------------------------------------------------------------------- 1 | columns: 2 |   input: 3 |     f_text_0: 4 |       type: "string" 5 |       module_url: "https://tfhub.dev/google/universal-sentence-encoder-large/3" 6 |       # dan_module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" 7 |     f_text_1: 8 |       type: "string" 9 |       module_url: "https://tfhub.dev/google/universal-sentence-encoder-large/3" 10 |     f_int_0: 11 |       type: "categorical" 12 |       hash_bucket_size: 100 13 |       embedding_dimensions: 100 14 |   output: 15 |     f_float_0: 16 |       type: "float" 17 |       min_value: 0 18 |       max_value: 1 19 |     f_int_1: 20 |       type: "categorical" 21 |       hash_bucket_size: 100 22 |       embedding_dimensions: 100 23 | use_cache: false 24 | batch_size: 100 25 | epochs: 5 26 | profile: true 27 | device: "/device:GPU:0" # "/cpu:0" 28 | model_load_bucketfs_path: 29 | model_save_bucketfs_url: "http://w@write:localhost:6583/default/tensorflow/save" 30 | model_temporary_save_path: "save" # UDFs need to write to /tmp -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/tensorflow_udf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 
import urllib.parse 4 | 5 | import requests 6 | import tensorflow as tf 7 | import yaml 8 | from tensorflow.python.keras.engine.training import Model 9 | from tensorflow.python.keras.layers import Dense, Concatenate 10 | 11 | from column_encoder import ColumnEncoder 12 | from dataset_utils import DatasetUtils 13 | from utils import Utils 14 | 15 | 16 | class TensorflowUDF(): 17 | CONNECTION_NAME = "tensorflow_config" 18 | 19 | def create_table_network(self, preprocessed_keras_inputs): 20 | concat = Concatenate()(list(preprocessed_keras_inputs)) 21 | net = Dense(100, activation='relu')(concat) 22 | net = Dense(100, activation='relu')(net) 23 | return net 24 | 25 | def read_config(self,exa): 26 | config_file_url = exa.get_connection(self.CONNECTION_NAME).address 27 | url_data = urllib.parse.urlparse(config_file_url) 28 | config_file = urllib.parse.unquote(url_data.path) 29 | with open(config_file) as file: 30 | config = yaml.load(file, yaml.Loader) 31 | with open(config_file) as file: 32 | print(file.read()) 33 | return config 34 | 35 | def run(self, ctx, exa, train:bool): 36 | session_config = tf.ConfigProto( 37 | allow_soft_placement=True, 38 | log_device_placement=False) 39 | session = tf.Session(config=session_config) 40 | tf.keras.backend.set_session(session) 41 | 42 | config = self.read_config(exa) 43 | batch_size = config["batch_size"] 44 | epochs = config["epochs"] 45 | steps_per_epoch = ctx.size() // batch_size 46 | use_cache = config["use_cache"] 47 | load_path = None 48 | if "model_load_bucketfs_path" in config: 49 | load_path = config["model_load_bucketfs_path"] 50 | save_url = None 51 | if "model_save_bucketfs_url" in config: 52 | save_url = config["model_save_bucketfs_url"] 53 | save_path = config["model_temporary_save_path"] 54 | dataset = DatasetUtils().create_generator_dataset( 55 | ctx, epochs, batch_size, use_cache, exa.meta.input_columns) 56 | 57 | with tf.device(config["device"]): 58 | input_columns, keras_inputs, preprocessed_keras_inputs = \ 59 | ColumnEncoder().generate_inputs( 60 | exa.meta.input_columns, config["columns"]) 61 | table_network = self.create_table_network(preprocessed_keras_inputs) 62 | output_columns, keras_outputs, losses, loss_weights, output_metrics = \ 63 | ColumnEncoder().generate_outputs( 64 | exa.meta.input_columns, table_network, config["columns"]) 65 | session.run(tf.tables_initializer()) 66 | 67 | dataset = DatasetUtils().create_dataset(dataset, 68 | input_columns, output_columns, 69 | batch_size, use_cache) 70 | 71 | session.run(tf.global_variables_initializer()) 72 | session.run(tf.local_variables_initializer()) 73 | 74 | dataset_iterator = dataset.make_initializable_iterator() 75 | session.run(dataset_iterator.initializer) 76 | 77 | saver = tf.train.Saver(max_to_keep=1,save_relative_paths=True) 78 | print("load_path",load_path,flush=True) 79 | if load_path is not None and load_path != "": 80 | initial_epoch = Utils().restore_model_and_get_inital_epoch(session, saver, load_path+"/checkpoints/tmp/save") 81 | else: 82 | initial_epoch = 0 83 | callbacks = Utils().create_callbacks(session, saver, save_path) 84 | 85 | model = Model(inputs=keras_inputs, outputs=keras_outputs) 86 | profile = config["profile"] 87 | profile_model_options = Utils().add_profiler(callbacks, profile, session, save_path) 88 | print(output_metrics, flush=True) 89 | model.compile(optimizer='rmsprop', loss=losses, loss_weights=loss_weights, metrics=output_metrics, 90 | **profile_model_options) 91 | print(model.summary(),flush=True) 92 | 93 | if train: 94 | print("Starting 
training",flush=True) 95 | history = model.fit(dataset_iterator, steps_per_epoch=steps_per_epoch, 96 | epochs=initial_epoch + epochs, verbose=2, callbacks=callbacks, 97 | initial_epoch=initial_epoch ) 98 | ctx.emit(str(history.history)) 99 | print("save_url", save_url,flush=True) 100 | if save_url != "" and save_url is not None: 101 | tarfile = f"/tmp/save" 102 | os.makedirs(tarfile,exist_ok=True) 103 | self.tar_save(save_path, tarfile) 104 | self.upload_save(save_url, tarfile) 105 | 106 | else: 107 | print("Starting prediction",flush=True) 108 | for i in range(steps_per_epoch): 109 | print(f"Predicting Batch {i}/steps_per_epoch",flush=True) 110 | output = model.predict(dataset_iterator, steps=1) 111 | ctx.emit(output) 112 | 113 | def upload_save(self, save_url, tarfile): 114 | print("Upload save", flush=True) 115 | with open(f"{tarfile}/metrics.tar", "rb") as f: 116 | requests.put(f"{save_url}/metrics.tar", data=f) 117 | with open(f"{tarfile}/checkpoints.tar", "rb") as f: 118 | requests.put(f"{save_url}/checkpoints.tar", data=f) 119 | 120 | def tar_save(self, save_path, tarfile): 121 | print("Tar save",flush=True) 122 | try: 123 | subprocess.check_output(f"tar -czf {tarfile}/metrics.tar {save_path}/metrics", shell=True) 124 | subprocess.check_output(f"tar -czf {tarfile}/checkpoints.tar {save_path}/checkpoints", shell=True) 125 | except subprocess.CalledProcessError as e: 126 | print(e) 127 | print(e.output, flush=True) 128 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import tensorflow as tf 4 | from tensorflow.python import keras 5 | from tensorflow.python.profiler import option_builder 6 | from tensorflow.python.profiler.model_analyzer import Profiler 7 | 8 | 9 | class Utils: 10 | 11 | def save_graph(self, epoch, logs, 12 | session: tf.Session, 13 | saver: tf.train.Saver, checkpoint_path: str, 14 | save_summary_writer: tf.summary.FileWriter): 15 | save_summary_writer.add_graph(session.graph) 16 | saver.save(session, save_path=f"{checkpoint_path}/{epoch}") 17 | 18 | def create_callbacks(self, session: tf.Session, 19 | saver: tf.train.Saver, 20 | save_path: str): 21 | checkpoint_path = self.get_checkpoint_path(save_path) 22 | save_summary_writer = tf.summary.FileWriter(checkpoint_path) 23 | save_callback = keras.callbacks.LambdaCallback( 24 | on_epoch_end=lambda epoch, logs: self.save_graph(epoch, logs, session, saver, checkpoint_path, 25 | save_summary_writer)) 26 | log_callback = \ 27 | keras.callbacks.TensorBoard( 28 | log_dir=f'{save_path}/metrics', histogram_freq=0, batch_size=32, 29 | write_graph=True, 30 | write_grads=False, 31 | write_images=False, embeddings_freq=0, 32 | embeddings_layer_names=None, 33 | embeddings_metadata=None, embeddings_data=None, 34 | update_freq='epoch') 35 | callbacks = [log_callback, save_callback] 36 | return callbacks 37 | 38 | def get_checkpoint_path(self, save_path): 39 | checkpoint_path = f"{save_path}/checkpoints" 40 | return checkpoint_path 41 | 42 | def restore_model_and_get_inital_epoch( 43 | self, session: tf.Session, 44 | saver: tf.train.Saver, 45 | load_path: str): 46 | print("load_path", load_path, flush=True) 47 | checkpoint_path = self.get_checkpoint_path(load_path) 48 | print("checkpoint_path",checkpoint_path, flush=True) 49 | latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path) 50 | print("latest_checkpoint", 
latest_checkpoint, flush=True) 51 | if latest_checkpoint is not None: 52 | saver.restore(session, latest_checkpoint) 53 | return int(pathlib.Path(latest_checkpoint).name) 54 | else: 55 | return 0 56 | 57 | def add_profile(self, epoch, logs, 58 | run_metadata: tf.RunMetadata, 59 | profiler: tf.profiler.Profiler, 60 | profile_writer: tf.summary.FileWriter, 61 | save_path: str): 62 | timeline_path = f"{save_path}/timeline" 63 | pathlib.Path(timeline_path).mkdir(exist_ok=True, parents=True) 64 | profiler.add_step(epoch, run_meta=run_metadata) 65 | opts = (option_builder.ProfileOptionBuilder( 66 | option_builder.ProfileOptionBuilder.time_and_memory()) 67 | .with_step(epoch) 68 | .with_timeline_output(f"{timeline_path}/step").build()) 69 | profiler.profile_graph(options=opts) 70 | profile_writer.add_run_metadata(run_metadata, f"step{epoch}") 71 | 72 | def add_profiler(self, callbacks, profile: bool, session: tf.Session, save_path: str): 73 | if profile: 74 | profiler = Profiler(session.graph) 75 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 76 | run_metadata = tf.RunMetadata() 77 | profile_writer = tf.summary.FileWriter(f"{save_path}/profile") 78 | profile_writer.add_graph(session.graph) 79 | profiler_callback = keras.callbacks.LambdaCallback( 80 | on_epoch_end=lambda batch, logs: self.add_profile(batch, logs, run_metadata, 81 | profiler, profile_writer, save_path)) 82 | callbacks.append(profiler_callback) 83 | additional_options = dict(options=options, run_metadata=run_metadata) 84 | else: 85 | additional_options = dict() 86 | return additional_options 87 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | ## Tutorials 2 | This section contains tutorials which show complete data science workflows on a realistic scenario and data. We are going to provide examples for different languages, frameworks, tasks and use cases. 3 | 4 | For general prerequisites, please refer to [Prerequisites](../README.md). 5 | 6 | **Currently, this repository is under development and we will add more and more tutorials in the future.** 7 | 8 | ### Overview 9 | 10 | * [Machine Learning](machine-learning) 11 | * [Python](machine-learning/python): 12 | * [AzureML](machine-learning/python/AzureML/Introduction.ipynb) 13 | * [Scikit-learn](machine-learning/python/scikit-learn) 14 | * [Sagemaker](machine-learning/python/sagemaker) Using AWS sagemaker for machine learning with Exasol 15 | * [SageMaker Extension](machine-learning/sagemaker-extension) 16 | * [Script-Language Container](script-languages) 17 | * [Spatial Analysis](spatial-analysis) 18 | * [Visualizing Spatial Queries](spatial-analysis/visualizing_spatial_queries) 19 | -------------------------------------------------------------------------------- /tutorials/machine-learning/README.md: -------------------------------------------------------------------------------- 1 | ## Machine Learning Tutorials 2 | This section contains tutorials for doing Machine Learning within the Exasol database. We are going to provide examples for different languages and frameworks, tasks and use cases. 3 | 4 | ### Languages: 5 | 6 | * [Python](python) 7 | * [Scikit-learn](python/scikit-learn): 8 | 9 | ### Prerequisites 10 | 11 | For general prerequisites, please refer to [Prerequisites](../README.md). 
However, these tutorials typically need a specific flavor of the [Script Language Container](https://github.com/exasol/script-languages) which has the required dependencies installed. For these purposes, we provide the python3-ds-* and the fancy-r-* flavors which already contain the dependencies for the frameworks used in these tutorials. Prepackaged releases for this flavor can be found on the [release page](https://github.com/exasol/script-languages/releases). 12 | -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/ConnectAzureMLtoExasol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Connect to Exasol from AzureML" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "source": [ 15 | "In this Tutorial we will:\n", 16 | " - Connect to Exasol SaaS from AzureML\n", 17 | " - Preprocess data\n", 18 | " - Export Exasol tables to an Azure Blobstore Container\n", 19 | " - Create a Datastore" 20 | ], 21 | "metadata": { 22 | "collapsed": false 23 | } 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "source": [ 28 | "## Prerequisites\n", 29 | "\n", 30 | "You will need:\n", 31 | " - Your running Exasol Saas Cluster with your data loaded into it\n", 32 | " - Authentication information for your Exasol Saas Cluster\n", 33 | " - An AzureML account and Azure Storage account\n", 34 | " - AzureML set up with a:\n", 35 | " - Workspace\n", 36 | " - Compute instance" 37 | ], 38 | "metadata": { 39 | "collapsed": false 40 | } 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "## Why using Azure blobstorage is necessary\n", 46 | "\n", 47 | "In this tutorial we copy the data from an Exasol Saas Database into an Azure Blob Storage Container. This is necessary because while AzureML has functionality to import directly from SQL databases, the Exasol SQL dialect is not supported by AzureML at the moment of writing.\n" 48 | ], 49 | "metadata": { 50 | "collapsed": false 51 | } 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "source": [ 56 | "## AzureML setup\n", 57 | "\n", 58 | "If you do not know how to set up your AzureML studio, please refer to the [AzureML documentation](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources).\n", 59 | "Once you are set up with a workspace and compute instance, you can copy this notebook into your notebook files. Open it and select your compute instance in the drop-down menu at the top of your notebook. Now we can get started with connecting to the Exasol Saas cluster.\n" 60 | ], 61 | "metadata": { 62 | "collapsed": false 63 | } 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "source": [ 68 | "### Connect to Exasol Saas\n", 69 | "\n", 70 | "\n", 71 | "We are going to use the [PyExasol](https://docs.exasol.com/db/latest/connect_exasol/drivers/python/pyexasol.htm) package in order to connect to the Exasol database and read the data. First we need to install PyExasol using pip in your AzureML Compute." 
72 | ], 73 | "metadata": { 74 | "collapsed": false 75 | } 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "outputs": [], 81 | "source": [ 82 | "!pip install pyexasol" 83 | ], 84 | "metadata": { 85 | "collapsed": false, 86 | "pycharm": { 87 | "name": "#%%\n" 88 | } 89 | } 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "source": [ 94 | "Then we need to connect with PyExasol to our Exasol Saas Cluster with the data. Change these values to reflect your Cluster.\n", 95 | "We ask for 10 lines of our \"IDA.TEST\" table from the [Scania Trucks](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge) to check if our connection is working." 96 | ], 97 | "metadata": { 98 | "collapsed": false 99 | } 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "outputs": [], 105 | "source": [ 106 | "import pyexasol\n", 107 | "import pandas\n", 108 | "\n", 109 | "EXASOL_HOST = \".clusters.exasol.com\" # change\n", 110 | "EXASOL_PORT = \"8563\" # change if needed\n", 111 | "EXASOL_USER = \"\" # change\n", 112 | "EXASOL_PASSWORD = \"exa_pat_\" # change\n", 113 | "EXASOL_SCHEMA = \"IDA\" # change if needed\n", 114 | "\n", 115 | "# get the connection\n", 116 | "EXASOL_CONNECTION = \"{host}:{port}\".format(host=EXASOL_HOST, port=EXASOL_PORT)\n", 117 | "exasol = pyexasol.connect(dsn=EXASOL_CONNECTION, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True)\n", 118 | "\n", 119 | "# check if the connection is working\n", 120 | "exasol.export_to_pandas(\"SELECT * FROM TABLE IDA.TEST LIMIT 10\")" 121 | ], 122 | "metadata": { 123 | "collapsed": false, 124 | "pycharm": { 125 | "name": "#%%\n" 126 | } 127 | } 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "source": [ 132 | "We will also need to get access to the Azure Storage Account, which we will use later to transfer the data. For that, you need to insert your Azure Storage Account Name and Access Key. To find your Access Key, in the Azure portal navigate to your Storage Account, and click on \"Access Keys\" under \"Security + networking\" and copy one of your Access Keys.\n", 133 | "\n", 134 | "![](img_src/access_key_azure.png)\n" 135 | ], 136 | "metadata": { 137 | "collapsed": false 138 | } 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "outputs": [], 144 | "source": [ 145 | "from azure.ai.ml.entities import AccountKeyConfiguration\n", 146 | "\n", 147 | "my_storage_account_name = \"your_storage_account_name\" # change\n", 148 | "account_key=\"your_storage_account_key\" # change\n", 149 | "\n", 150 | "credentials= AccountKeyConfiguration(account_key)" 151 | ], 152 | "metadata": { 153 | "collapsed": false, 154 | "pycharm": { 155 | "name": "#%%\n" 156 | } 157 | } 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "source": [ 162 | "### Data Preprocessing\n", 163 | "\n", 164 | "Now that we are set up for the data transfer, we are first going to preprocess the data in the Exasol Database before pulling the data into Azure. We want to replace the text based \"CLASS\" column all data tables with a boolean column called \"CLASS_POS\" which will make classifying easier.\n", 165 | "\n", 166 | "For your own project, you need to evaluate which preprocessing steps to run in the efficient Exasol Database and which might be easier to accomplish later on the CSV files in Azure Blob Storage.\n", 167 | "\n", 168 | "First, we create a new table \"TRAIN_PREPARED\" which is a copy of the \"TRAIN\" table, with the replaced \"CLASS_POS\" column." 
169 | ], 170 | "metadata": { 171 | "collapsed": false 172 | } 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "outputs": [], 178 | "source": [ 179 | "all_columns = exasol.export_to_pandas(\"SELECT * FROM IDA.TRAIN LIMIT 1;\")\n", 180 | "column_names = list(all_columns)\n", 181 | "column_names.remove(\"CLASS\")\n", 182 | "exasol.execute(\"\"\"CREATE OR REPLACE TABLE IDA.TRAIN_PREPARED AS (\n", 183 | " SELECT\n", 184 | " (CLASS = 'pos') as CLASS_POS, {all_columns_except_class!q} FROM IDA.TRAIN)\"\"\",\n", 185 | " {\"all_columns_except_class\": column_names})\n", 186 | "\n", 187 | "\n", 188 | "\n", 189 | "exasol.export_to_pandas(\"SELECT * FROM IDA.TRAIN_PREPARED LIMIT 4\")" 190 | ], 191 | "metadata": { 192 | "collapsed": false, 193 | "pycharm": { 194 | "name": "#%%\n" 195 | } 196 | } 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "source": [ 201 | "Then we create a new \"TEST_PREPARED\" table as a copy of the \"TEST\" table with replaced \"CLASS_POS\" column." 202 | ], 203 | "metadata": { 204 | "collapsed": false 205 | } 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "outputs": [], 211 | "source": [ 212 | "exasol.execute(\"\"\"CREATE OR REPLACE TABLE IDA.TEST_PREPARED AS (\n", 213 | " SELECT\n", 214 | " (CLASS = 'pos') as CLASS_POS, {all_columns_except_class!q} FROM IDA.TEST)\"\"\",\n", 215 | " {\"all_columns_except_class\": column_names})\n", 216 | "\n", 217 | "\n", 218 | "\n", 219 | "exasol.export_to_pandas(\"SELECT * FROM IDA.TEST_PREPARED LIMIT 4\")" 220 | ], 221 | "metadata": { 222 | "collapsed": false, 223 | "pycharm": { 224 | "name": "#%%\n" 225 | } 226 | } 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "source": [ 231 | "### Load Data into AzureML Blob Storage\n", 232 | "\n", 233 | "Now that our data is prepared and we have access to our Azure Storage Account and our Exasol Saas Cluster, we use an \"EXPORT TABLE\" command for each of our data tables to export them into a CSV file in our Blob Storage using \"INTO CSV AT CLOUD AZURE BLOBSTORAGE\". You can find [the domumentation for this export command](https://docs.exasol.com/db/latest/sql/export.htm) in the Exasol documentation.\n", 234 | "If you choose an existing Azure Blob Storage container, this command will save your files in this container. Otherwise, a new container with the given name will be created automatically.\n", 235 | "When you created your AzureML Workspace, an Azure Blob Container was [created automatically](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-data) and added as a Datastore named \"workspaceblobstore\" to your workspace. You can use it here and then skip the \"Create a Datastore\" step below if you want. For this you would need to find its name (\"azureml-blobstore-some-ID\") in the Datastore Information and insert it here." 236 | ], 237 | "metadata": { 238 | "collapsed": false 239 | } 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "source": [ 244 | "Some of the 170 features of the Scania Trucks dataset do not have a notable influence on the classification or contain a big amount of empty values. Because of this we select only some columns to actually use for the training. Since we only want to use them, we import only these features to Azure.\n", 245 | "\n", 246 | "Once we have selected the column names we want to use, we transfer the \"TEST_PREPARED\" table using the exasol EXPORT command." 
247 | ], 248 | "metadata": { 249 | "collapsed": false 250 | } 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "outputs": [], 256 | "source": [ 257 | "table = \"TEST_PREPARED\"\n", 258 | "column_names = ['CLASS_POS', 'AA_000', 'AG_005', 'AH_000', 'AL_000', 'AM_0', 'AN_000', 'AO_000', 'AP_000', 'AQ_000',\n", 259 | " 'AZ_004', 'BA_002', 'BB_000', 'BC_000', 'BD_000', 'BE_000',\n", 260 | " 'BF_000', 'BG_000', 'BH_000', 'BI_000', 'BJ_000', 'BS_000', 'BT_000', 'BU_000', 'BV_000',\n", 261 | " 'BX_000', 'BY_000', 'BZ_000', 'CA_000', 'CB_000', 'CC_000', 'CI_000', 'CN_004', 'CQ_000',\n", 262 | " 'CS_001', 'DD_000', 'DE_000', 'DN_000', 'DS_000', 'DU_000', 'DV_000', 'EB_000', 'EE_005']\n", 263 | "\n", 264 | "blobstorage_name = \"azureml-tutorial\" # change, remember to you might need to remove the \"_datastore\" suffix\n", 265 | "\n", 266 | "save_path = f'{blobstorage_name}/ida/{table}'\n", 267 | "sql_export = \"\"\"EXPORT (SELECT {column_names!q} FROM IDA.{table!q})\n", 268 | " INTO CSV AT CLOUD AZURE BLOBSTORAGE 'DefaultEndpointsProtocol=https;EndpointSuffix=core.windows.net'\n", 269 | " USER '{my_storage_account_name!q}' IDENTIFIED BY '{account_key!q}'\n", 270 | " FILE '{save_path!q}' WITH COLUMN NAMES REPLACE\"\"\"\n", 271 | "\n", 272 | "\n", 273 | "exasol.execute(sql_export, {\"column_names\": column_names,\n", 274 | " \"table\": table,\n", 275 | " \"my_storage_account_name\": my_storage_account_name,\n", 276 | " \"account_key\": credentials.account_key,\n", 277 | " \"save_path\": save_path})\n", 278 | "print(f\"saved {table} in file {save_path}\")" 279 | ], 280 | "metadata": { 281 | "collapsed": false, 282 | "pycharm": { 283 | "name": "#%%\n" 284 | } 285 | } 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "source": [ 290 | "Then we do the same with the TRAIN_PREPARED table:" 291 | ], 292 | "metadata": { 293 | "collapsed": false 294 | } 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "outputs": [], 300 | "source": [ 301 | "table = \"TRAIN_PREPARED\"\n", 302 | "save_path = f'{blobstorage_name}/ida/{table}'\n", 303 | "\n", 304 | "exasol.execute(sql_export, {\"column_names\": column_names,\n", 305 | " \"table\": table,\n", 306 | " \"my_storage_account_name\": my_storage_account_name,\n", 307 | " \"account_key\": credentials.account_key,\n", 308 | " \"save_path\": save_path})\n", 309 | "print(f\"saved {table} in file {save_path}\")" 310 | ], 311 | "metadata": { 312 | "collapsed": false, 313 | "pycharm": { 314 | "name": "#%%\n" 315 | } 316 | } 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "source": [ 321 | "Delete the temporary tables from the Exasol Saas Database in order to not pollute the database." 322 | ], 323 | "metadata": { 324 | "collapsed": false 325 | } 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "outputs": [], 331 | "source": [ 332 | "for table in [\"TRAIN_PREPARED\", \"TEST_PREPARED\"]:\n", 333 | " exasol.execute(f\"DROP TABLE IDA.{table};\")" 334 | ], 335 | "metadata": { 336 | "collapsed": false, 337 | "pycharm": { 338 | "name": "#%%\n" 339 | } 340 | } 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "source": [ 345 | "You can check the success of the command by navigating to your Container in the Azure portal using your Azure storage account.\n", 346 | "In the menu on the left, you can find \"Containers\" under \"Data Storage\". Find the container named \"your-container-name\" and click on it. 
Your files should be there.\n" 347 | ], 348 | "metadata": { 349 | "collapsed": false 350 | } 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "source": [ 355 | "### Create a Datastore\n", 356 | "\n", 357 | "We recommend that you create a connection between your Azure Storage Container and your AzureML Workspace. For this, enter your workspace in AzureML Studio and select \"Data\" under \"Assets\" in the menu on the left. Now select \"Datastores\" and click on \"+Create\".\n", 358 | "\n", 359 | "![](img_src/create_datastore.png)\n", 360 | "\n", 361 | "In the view that opens you need to enter the info for your datastore. Enter a name and select the type as \"Azure Blob Storage\". Then select your Azure subscription and the blob container we loaded the data into from the drop-down menu. Use Authentication type Account key and enter your Azure storage account access key. Click create." 362 | ], 363 | "metadata": { 364 | "collapsed": false 365 | } 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "source": [ 370 | "![](img_src/data_blobstore.png)\n", 371 | "\n", 372 | "You can now see your data directly in AzureML by navigating to \"Datastores\" and clicking on . If you then change into the \"Browse\" view you can open your files and have a look at them if you want.\n", 373 | "\n", 374 | "\n", 375 | "Great, we successfully connected to our Exasol Saas instance and loaded data from there into our Azure Blob Storage!\n", 376 | "\n", 377 | "Now we move on to [working with the data in AzureML and training a model on it](TrainModelInAzureML.ipynb)." 378 | ], 379 | "metadata": { 380 | "collapsed": false 381 | } 382 | } 383 | ], 384 | "metadata": { 385 | "kernelspec": { 386 | "display_name": "Python 3", 387 | "language": "python", 388 | "name": "python3" 389 | }, 390 | "language_info": { 391 | "codemirror_mode": { 392 | "name": "ipython", 393 | "version": 2 394 | }, 395 | "file_extension": ".py", 396 | "mimetype": "text/x-python", 397 | "name": "python", 398 | "nbconvert_exporter": "python", 399 | "pygments_lexer": "ipython2", 400 | "version": "2.7.6" 401 | } 402 | }, 403 | "nbformat": 4, 404 | "nbformat_minor": 0 405 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Apply Microsoft's AzureML training on Exasol data" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "source": [ 15 | "In this series of tutorials you will learn how to :\n", 16 | "\n", 17 | " - Connect your Exasol (Saas) Database to AzureML ([link](ConnectAzureMLtoExasol.ipynb))\n", 18 | " - Use your data from Exasol to train a Machine learning model in AzureML ([link](TrainModelInAzureML.ipynb))\n", 19 | " - Invoke your trained model from your Exasol Database and receive the results directly into your Database ([link](InvokeModelFromExasolDBwithUDF.ipynb))\n", 20 | " - Export the trained model to Exasol and run it directly in the Database using UDF's ([link](InvokeModelFromExasolDBwithUDF.ipynb))\n" 21 | ], 22 | "metadata": { 23 | "collapsed": false 24 | } 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "source": [ 29 | "## Who this tutorial is for\n", 30 | "\n", 31 | "If you are an Exasol user and want to make more out of your data using Azure Machine Learning, this tutorial shows you how to get started. 
Or maybe you are already using Azure Machine Learning to analyze your data but are interested in hosting the data somewhere else, preferably in an analytic database to get the best out of your data. Then this tutorial might give you an insight into how easy it is to switch out your database while not having to disrupt or rebuild your Machine Learning processes." 32 | ], 33 | "metadata": { 34 | "collapsed": false 35 | } 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "source": [ 40 | "## What this tutorial does not cover\n", 41 | " - Using the exasol data in AzureML directly (not supported by AzureML)\n", 42 | " - Setting training parameters/invoking training automatically from Exasol\n", 43 | " - Check training/prediction progress from Exasol\n", 44 | " - Get training parameters of trained model from Exasol\n", 45 | " - Check if the model is running from Exasol\n", 46 | " - Monitor AzureML node utilisation from Exasol\n", 47 | "\n", 48 | "Many of these things are possible but not covered here. After finishing this tutorial you should have the necessary tools to get started implementing solutions for these tasks though, provided you know your way around Azure and AzureML." 49 | ], 50 | "metadata": { 51 | "collapsed": false 52 | } 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "source": [ 57 | "## Prerequisites\n", 58 | " - For this tutorial we will use an [Exasol Saas](https://www.exasol.com/exasol-saas/) instance. Other versions of Exasol should also work as long as you are able to connect to them via PyExasol.\n", 59 | " - [Microsoft AzureML Studio](https://studio.azureml.net/) access (works in conjunction with [Microsoft Azure](https://azure.microsoft.com/de-de/free/search/?&ef_id=EAIaIQobChMIkZ-J_bzg_QIVCthRCh3Uyga7EAAYASAAEgIK0PD_BwE:G:s&OCID=AIDcmmzzaokddl_SEM_EAIaIQobChMIkZ-J_bzg_QIVCthRCh3Uyga7EAAYASAAEgIK0PD_BwE:G:s&gclid=EAIaIQobChMIkZ-J_bzg_QIVCthRCh3Uyga7EAAYASAAEgIK0PD_BwE))\n", 60 | " - The [PyExasol package](https://pypi.org/project/pyexasol/) ([documentation](https://docs.exasol.com/db/latest/connect_exasol/drivers/python/pyexasol.htm))" 61 | ], 62 | "metadata": { 63 | "collapsed": false 64 | } 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "source": [ 69 | "## Setting up Exasol SaaS for this tutorial\n", 70 | "\n", 71 | "Firstly, if you do not have an accounr aleady, [sign up for Exasol Saas free trail](https://cloud.exasol.com/signup?_gl=1*l5pvjo*_ga*MTAwNTY5MzY5NC4xNjc2Mzc3NzA2*_ga_3M805TBTX9*MTY3Nzc2MDM1MC4yLjAuMTY3Nzc2MDM1MC42MC4wLjA.), or sign in to your existing account.\n", 72 | "\n", 73 | "Once signed in, click the \"Add database\" button on the top left, choose a database name and your preferred region, then click \"next\" define your first database cluster by setting a cluster name and your preferred cluster size. 
If you are on the free trial the price for your cluster will be deducted from your free credits.\n", 74 | "You can also set the automatic shutoff time for your cluster in this view (or change it later).\n", 75 | "\n", 76 | "![](img_src/cluster_creation.png)\n", 77 | "\n", 78 | "We choose the smallest available cluster size (XSmall, 8 vCPU, 64 GB Memory) for this tutorial as it is sufficient.\n", 79 | " When you are ready click \"Add database\" and your cluster will be set up and started (this might take some time).\n" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "pycharm": { 84 | "name": "#%% md\n" 85 | } 86 | } 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "source": [ 91 | "### Allow connection from AzureML\n", 92 | "\n", 93 | "Once you set up your Exasol database, you need to allow incoming connections from AzureML Studio. You also need to get the connection info for your cluster, so we can use it later to set up the PyExasol connection to your cluster.\n", 94 | "\n", 95 | "Firstly you need to find the public IP of your AzureML compute instance. If you do not yet have an AzureML compute instance you will need to [set one up](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources).\n", 96 | "To find the public IP, click the \"Compute\" entry in the menu on the left in AzureML Studio below \"Manage\". Find your compute and open it. Toward the bottom you will find the \"Public IP\".\n", 97 | "\n", 98 | "![](img_src/azureML_public_ip.png)\n", 99 | "\n", 100 | "Now you need to register the IP with your Saas Database to allow incoming requests form your AzureML compute.\n", 101 | "In the Saas portal, navigate to your Cluster and click on \"Connect via tools\" on the right. Enter the IP of your AzureML compute instance\n", 102 | "and \"Add\". Then click \"next\" two times. You will see a screen \"Connection details.\".\n", 103 | "\n", 104 | "![](img_src/connection_detail_generate.png)\n", 105 | "\n", 106 | " Click the bottom column to generate a Personal access token for your Database.\n", 107 | "\n", 108 | "![](img_src/connection_details_acess_token.png)\n", 109 | "\n", 110 | "We will use this token to connect to the database from AzureML.\n", 111 | "**Remember the connection string, port, user-name and access token/password.**\n", 112 | "If you accidentally added a wrong IP you can remove them again under \"Security\"\n", 113 | "\n", 114 | "Now that you got your connection information, run [this notebook](../sagemaker/LoadExampleDataIntoExasol.ipynb) to load the [Scania Trucks](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge) dataset into your Exasol Saas Instance (Don't forget to change the connection info in the first cell).\n", 115 | "You can run this from your AzureML cluster, or from your local machine(remember to add your local IP beforehand like we did the AzureML cluster IP above).\n", 116 | "\n", 117 | "If want to use other tool for your data upload multiple ways are documented [here](https://docs.exasol.com/saas/connect_exasol.htm).\n", 118 | "\n", 119 | "Now we have our Exasol Saas set up with some data to play around with, we can move on to [next part of the tutorial](ConnectAzureMLtoExasol.ipynb)." 
120 | ], 121 | "metadata": { 122 | "collapsed": false, 123 | "pycharm": { 124 | "name": "#%% md\n" 125 | } 126 | } 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 2 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython2", 145 | "version": "2.7.6" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 0 150 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/TrainModelInAzureML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Train an ML model on Exasol Data\n", 7 | "\n", 8 | "In this tutorial, you will load the data from Azure Blob Storage, and run a Python script as an AzureML job to preprocess the data and train a simple scikit-learn model. Then You will register the trained model with AzureML for further use." 9 | ], 10 | "metadata": { 11 | "collapsed": false 12 | } 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "source": [ 17 | "## Prerequisites\n", 18 | "You completed the [previous part of this tutorial series](ConnectAzureMLtoExasol.ipynb) and therefore have:\n", 19 | " - A running AzureML compute instance\n", 20 | " - An Azure Storage account\n", 21 | " - The [Scania Trucks](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge) dataset loaded into Azure Blob Storage\n" 22 | ], 23 | "metadata": { 24 | "collapsed": false 25 | } 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "source": [ 30 | "## Python script for training the model\n", 31 | "\n", 32 | "We will use a Python script to create and train a SciKit-Learn model on the data we loaded from Exasol. You can find the script [here](main.py).\n", 33 | "The script loads the data from the files we saved in the Azure Blob Storage, does data preprocessing to combat the unbalanced nature of the dataset and removes empty values so the training can work properly.\n", 34 | "Then, it creates a simple SciKit-Learn model and trains it on the data. The model is evaluated using the test dataset and registered in the AzureML Workspace using MLflow.\n", 35 | "\n", 36 | "This script creates a model that only uses Python packages available in Exasol Saas UDFs natively. This means you can upload this model directly to your exasol Database and run it using an UDF. If your own models use different packages but you still need to run them on the cluster directly you need to [build and install yout own Script-Language Container](https://docs.exasol.com/db/latest/database_concepts/udf_scripts/adding_new_packages_script_languages.htm). Information on which packages are supported out of the box can be found [here](https://docs.exasol.com/saas/database_concepts/udf_scripts/python3.htm).\n" 37 | ], 38 | "metadata": { 39 | "collapsed": false 40 | } 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "## Prepare AzureML studio to run the Python script\n", 46 | "\n", 47 | "This notebook is meant to be run in AzureML Studio, so upload it to your Notebooks, open it and select your compute instance in the drop-down menu at the top of your notebook. 
The same steps can be achieved by accessing AzureML using remote scripts, but for demonstration purposes we use AzureML Studio here.\n", 48 | "\n", 49 | "First, we install some AzureML functionality." 50 | ], 51 | "metadata": { 52 | "collapsed": false 53 | } 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "outputs": [], 59 | "source": [ 60 | "!pip install azure-identity\n", 61 | "!pip install azure-ai-ml==1.3.0" 62 | ], 63 | "metadata": { 64 | "collapsed": false, 65 | "pycharm": { 66 | "name": "#%%\n" 67 | } 68 | } 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "source": [ 73 | "Then, we create an MLClient for accessing our AzureML jobs programmatically. For this we need our AzureML subscription id, resource group name and workspace name. If you are not sure what your resource group name is, you can find it by clicking your subscription in the top left oft AzureML Studio\n", 74 | "Make sure to use the workspace you set up in the previous tutorial.\n", 75 | "\n", 76 | "![](img_src/resource_group.png)" 77 | ], 78 | "metadata": { 79 | "collapsed": false 80 | } 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "outputs": [], 86 | "source": [ 87 | "# Handle to the workspace\n", 88 | "from azure.ai.ml import MLClient\n", 89 | "\n", 90 | "# Authentication package\n", 91 | "from azure.identity import DefaultAzureCredential\n", 92 | "\n", 93 | "credential = DefaultAzureCredential()\n", 94 | "# Get a handle to the workspace\n", 95 | "ml_client = MLClient(\n", 96 | " credential=credential,\n", 97 | " subscription_id=\"\", # change\n", 98 | " resource_group_name=\"\", # change\n", 99 | " workspace_name=\"\", # change\n", 100 | ")" 101 | ], 102 | "metadata": { 103 | "collapsed": false, 104 | "pycharm": { 105 | "name": "#%%\n" 106 | } 107 | } 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "### Create a new Python Environment\n", 113 | "\n", 114 | "To run our Python script we need to create a new environment and install the required dependencies. For this, we first create a new directory called \"dependencies\"." 115 | ], 116 | "metadata": { 117 | "collapsed": false 118 | } 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "outputs": [], 124 | "source": [ 125 | "#make env\n", 126 | "import os\n", 127 | "\n", 128 | "dependencies_dir = \"./dependencies\"\n", 129 | "os.makedirs(dependencies_dir, exist_ok=True)" 130 | ], 131 | "metadata": { 132 | "collapsed": false, 133 | "pycharm": { 134 | "name": "#%%\n" 135 | } 136 | } 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "source": [ 141 | "In order for our model to be usable in the Exasol Saas Database later, we need to make sure the SciKit-learn version we use matches the version in Saas." 
142 | ], 143 | "metadata": { 144 | "collapsed": false 145 | } 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "outputs": [], 151 | "source": [ 152 | "%%writefile {dependencies_dir}/conda.yml\n", 153 | "name: model-env\n", 154 | "channels:\n", 155 | " - conda-forge\n", 156 | "dependencies:\n", 157 | " - python=3.8\n", 158 | " - numpy=1.21.2\n", 159 | " - scikit-learn=1.0.2\n", 160 | " - pandas>=1.1,<1.2\n", 161 | " - pip:\n", 162 | " - inference-schema[numpy-support]==1.3.0\n", 163 | " - mlflow== 1.26.1\n", 164 | " - azureml-mlflow==1.42.0\n" 165 | ], 166 | "metadata": { 167 | "collapsed": false, 168 | "pycharm": { 169 | "name": "#%%\n" 170 | } 171 | } 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "source": [ 176 | "Next, we will create a new environment to run our job in. We will use the new dependencies file and use an Ubuntu image as the base for our environment. Then we will create the new environment on our *MLClient*." 177 | ], 178 | "metadata": { 179 | "collapsed": false 180 | } 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 1, 185 | "outputs": [ 186 | { 187 | "ename": "ModuleNotFoundError", 188 | "evalue": "No module named 'azure'", 189 | "output_type": "error", 190 | "traceback": [ 191 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 192 | "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", 193 | "Input \u001B[0;32mIn [1]\u001B[0m, in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mazure\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mai\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mml\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mentities\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Environment\n\u001B[1;32m 2\u001B[0m custom_env_name \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;66;03m# change\u001B[39;00m\n\u001B[1;32m 4\u001B[0m pipeline_job_env \u001B[38;5;241m=\u001B[39m Environment(\n\u001B[1;32m 5\u001B[0m name\u001B[38;5;241m=\u001B[39mcustom_env_name,\n\u001B[1;32m 6\u001B[0m description\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCustom environment for azureML tut\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 9\u001B[0m image\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 10\u001B[0m )\n", 194 | "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'azure'" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "from azure.ai.ml.entities import Environment\n", 200 | "custom_env_name = \"\" # change\n", 201 | "\n", 202 | "pipeline_job_env = Environment(\n", 203 | " name=custom_env_name,\n", 204 | " description=\"Custom environment for AzureML tutorial\",\n", 205 | " tags={\"scikit-learn\": \"1.0.2\"},\n", 206 | " conda_file=os.path.join(dependencies_dir, \"conda.yml\"),\n", 207 | " image=\"mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest\",\n", 208 | ")\n", 209 | "pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)\n", 210 | "\n", 211 | "print(\n", 212 | " f\"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}.\"\n", 213 | ")" 214 | ], 
215 | "metadata": { 216 | "collapsed": false, 217 | "pycharm": { 218 | "name": "#%%\n" 219 | } 220 | } 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "source": [ 225 | "## Run the Python script\n", 226 | "\n", 227 | "Now we need to create an AzureML job with the following inputs:\n", 228 | "\n", 229 | " - The path to the Python script\n", 230 | " - A command to run the script\n", 231 | " - Information which AzureML Compute and Environment to use\n", 232 | "\n", 233 | "This job will be used to run our Python script on our Compute using the environment we created in the step before.\n", 234 | "The script takes links to the data files we loaded ino Azure Blob Storage in the previous tutorial as input. You can find these links by naviating to your data files in your data store and clicking the kebab menu besides each file. A drop down menu will open where you can select the \"Copy URI\" option. This opens a pop-up window where you can copy the link to the file.\n", 235 | "![](img_src/get_data_link.png)\n", 236 | "\n", 237 | "This opens a pop-up window where you can copy the link to the file.\n", 238 | "![](img_src/get_data_link_2.png)\n", 239 | "\n", 240 | "Also don't forget to change the variables for your Compute.\n" 241 | ], 242 | "metadata": { 243 | "collapsed": false 244 | } 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "outputs": [], 250 | "source": [ 251 | "from azure.ai.ml import command\n", 252 | "from azure.ai.ml import Input\n", 253 | "from azure.ai.ml.constants import AssetTypes\n", 254 | "\n", 255 | "\n", 256 | "job = command(\n", 257 | " inputs=dict(\n", 258 | " train_data=Input(\n", 259 | " type=AssetTypes.URI_FILE,\n", 260 | " path=\"< link to training data file >\", # change\n", 261 | " ),\n", 262 | " test_data=Input(\n", 263 | " type=AssetTypes.URI_FILE,\n", 264 | " path=\"< link to test data file >\", # change\n", 265 | " ),\n", 266 | " learning_rate=0.05\n", 267 | " ),\n", 268 | " code=\".\", # location of source code\n", 269 | " command=\"python main.py --train_data ${{inputs.train_data}} --test_data ${{inputs.test_data}} --learning_rate ${{inputs.learning_rate}}\",\n", 270 | " environment=pipeline_job_env,\n", 271 | " compute=\"\", # change\n", 272 | " experiment_name=\"\", # change\n", 273 | " display_name=\"\", # change\n", 274 | ")\n" 275 | ], 276 | "metadata": { 277 | "collapsed": false, 278 | "pycharm": { 279 | "name": "#%%\n" 280 | } 281 | } 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "source": [ 286 | "Now, we can run the script on our compute instance. A link will show up below, which you can click on to see the job details and output logs." 287 | ], 288 | "metadata": { 289 | "collapsed": false 290 | } 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "outputs": [], 296 | "source": [ 297 | "ml_client.create_or_update(job)" 298 | ], 299 | "metadata": { 300 | "collapsed": false, 301 | "pycharm": { 302 | "name": "#%%\n" 303 | } 304 | } 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "source": [ 309 | "Here is the Confusion Matrix of our trained model.\n", 310 | "\n", 311 | "\n", 312 | "| | predicted neg | predicted pos |\n", 313 | "|------------|----------------|----------------|\n", 314 | "|actual neg | 14841 | 784 |\n", 315 | "|actual pos | 13 | 362 |\n", 316 | "\n", 317 | "The model has a total cost of 14340 according to the ida-score we implemented in accordance to the problem description of the Scania Trucks dataset." 
318 | ], 319 | "metadata": { 320 | "collapsed": false 321 | } 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "source": [ 326 | "## Save the trained model\n", 327 | "\n", 328 | "The script will directly register the trained model in your AzureML Workspace, so you can use it to run inference in AzureML. It will also save the model in the output of the job. From there, you can extract it to run it in your Exasol cluster. You can find your registered model under the Assets, Model entry in the AzureML Studio menu on the left.\n", 329 | "\n", 330 | "![](img_src/registered_model.png)\n", 331 | "\n", 332 | "Now that we have trained and registered a model on the data we imported from our Exasol Saas instance, we can move on to the\n", 333 | "[next part](InvokeModelFromExasolDBwithUDF.ipynb), where we will use this model from with in our Exasol Cluster to classify some data." 334 | ], 335 | "metadata": { 336 | "collapsed": false 337 | } 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 2 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython2", 356 | "version": "2.7.6" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 0 361 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/access_key_azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/access_key_azure.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/azureML_public_ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/azureML_public_ip.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/cluster_creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/cluster_creation.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/conda_file_artifact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/conda_file_artifact.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/connection_detail_generate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/connection_detail_generate.png 
-------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/connection_details_acess_token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/connection_details_acess_token.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/consume_endpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/consume_endpoint.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/create_datastore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/create_datastore.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/data_blobstore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/data_blobstore.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/download_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/download_all.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/download_file_arifact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/download_file_arifact.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/file_path_bucketfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/file_path_bucketfs.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/get_data_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/get_data_link.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/get_data_link_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/get_data_link_2.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/manage_udf_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/manage_udf_files.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/registered_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/registered_model.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/resource_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/resource_group.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.pipeline import Pipeline 8 | from sklearn.impute import SimpleImputer 9 | from sklearn.preprocessing import StandardScaler 10 | 11 | from sklearn.ensemble import ExtraTreesClassifier 12 | from sklearn.metrics import make_scorer 13 | from sklearn.model_selection import ParameterGrid 14 | from sklearn.model_selection import GridSearchCV 15 | from sklearn.metrics import confusion_matrix 16 | 17 | import mlflow 18 | import mlflow.sklearn 19 | 20 | 21 | def main(): 22 | """Main function of the script.""" 23 | 24 | # input and output arguments 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--train_data", type=str, help="path to input train data") 27 | parser.add_argument("--test_data", type=str, help="path to input test data") 28 | parser.add_argument("--learning_rate", required=False, default=0.1, type=float) 29 | args = parser.parse_args() 30 | print(" ".join(f"{k}={v}" for k, v in vars(args).items())) 31 | 32 | # read the data from the AzureML Blob Storage. This is a good way for the data used for this example, 33 | # but for your own data another approach might be better. 
Check here for more info: 34 | # https://learn.microsoft.com/en-us/azure/machine-learning/how-to-read-write-data-v2?view=azureml-api-2&tabs=cli 35 | train_df_no_scale = pd.read_csv(args.train_data, header=0) 36 | test_df_no_scale = pd.read_csv(args.test_data, header=0) 37 | 38 | train_data_and_labels = get_labels(train_df_no_scale, class_col_name='CLASS_POS') 39 | test_data_and_labels = get_labels(test_df_no_scale, class_col_name='CLASS_POS') 40 | 41 | # get transformer for data preparation: 42 | # normalization, removing nans from dataset(important for back propagation), 43 | _, transformer = get_transformer(train_data_and_labels) 44 | 45 | # build classifier and find best training parameters 46 | clf, grid_search = build_et_classifier(train_data_and_labels, transformer) 47 | print(grid_search.best_params_['n_estimators']) 48 | print(grid_search.best_params_['max_depth']) 49 | print(str(grid_search.best_params_['class_weight'])) 50 | 51 | # Train and evaluate the model. 52 | clf.fit(train_data_and_labels[1], train_data_and_labels[0].ravel()) 53 | 54 | # Evaluate the trained classifier using test data. Output can be found in the logs of the AzureML job run. 55 | y_pred = test_eval(test_data_and_labels, clf) 56 | 57 | # Save the trained model and register it with AzureML Workspace 58 | mlflow.sklearn.log_model( 59 | sk_model=clf, 60 | registered_model_name="registered_model_name_sklearn", 61 | artifact_path="./outputs/model/sklearn_model_sklearn_save" 62 | ) 63 | 64 | # get class labels from dataset 65 | def get_labels(df, class_col_name): 66 | y = df.loc[:, class_col_name] 67 | X_data = df.loc[:, df.columns != class_col_name] 68 | return [y, X_data] 69 | 70 | # get transformer and train for data preprocessing 71 | def get_transformer(data_and_labels): 72 | transformer = Pipeline([ 73 | ('imputer', SimpleImputer(strategy="median")), 74 | ('scaler', StandardScaler()) 75 | ]) 76 | train_df_transformed = transformer.fit_transform(data_and_labels[1]) 77 | return train_df_transformed, transformer 78 | 79 | 80 | def build_et_classifier(data_and_labels, transformer): 81 | y = data_and_labels[0] 82 | X_data = data_and_labels[1] 83 | X_data = transformer.transform(X_data) 84 | 85 | # Create classifier 86 | clf = ExtraTreesClassifier(n_jobs=-1) 87 | 88 | # Specify parameter search grid 89 | # The grid size is kept small to reduce the computation time 90 | # Good values (known from offline grid search) are: 91 | # 'n_estimators': 61 92 | # 'max_depth': 10 93 | # 'class_weight': {{0: 1, 1: 89}} 94 | param_grid = [ 95 | {'n_estimators': [30, 61], 96 | 'max_depth': [5, 10], 97 | 'class_weight': [{0: 1, 1: 30}, {0: 1, 1: 50}, {0: 1, 1: 89}]} 98 | ] 99 | 100 | ida_scorer = make_scorer(ida_score) 101 | 102 | # Search for optimal values in grid using 5-fold cross validation 103 | grid_search = GridSearchCV(clf, param_grid, cv=5, scoring=ida_scorer, n_jobs=-1) 104 | grid_search.fit(X_data, y.values.ravel()) 105 | 106 | # Create new model with optimal parameter values 107 | clf = ExtraTreesClassifier(n_jobs=-1, 108 | n_estimators=grid_search.best_params_['n_estimators'], 109 | max_depth=grid_search.best_params_['max_depth'], 110 | class_weight=grid_search.best_params_['class_weight']) 111 | 112 | # fuse the classifier and the transformer into one pipeline. 113 | # This guarantees the preprocessing stays the same for each use of the model. 
114 | model = Pipeline([ 115 | ('transform', transformer), 116 | ('clf', clf) 117 | ]) 118 | 119 | return model, grid_search 120 | 121 | # Evaluate the trained model 122 | def test_eval(data_and_labels, clf): 123 | y = data_and_labels[0] 124 | X_data = data_and_labels[1] 125 | 126 | # Predict classes of test data 127 | y_pred = clf.predict(X_data) 128 | 129 | # Examine the results 130 | confusion_mat = confusion_matrix(y, y_pred) 131 | confusion_matrix_df = pd.DataFrame(confusion_mat, 132 | index=['actual neg', 'actual pos'], 133 | columns=['predicted neg', 'predicted pos']) 134 | 135 | print("Total Cost:", - ida_score(y, y_pred), "\n") 136 | print("Confusion Matrix:\n", confusion_matrix_df) 137 | 138 | 139 | # Define scoring metric for grid search from problem description of the Scania Trucks dataset 140 | def ida_score(y, y_pred): 141 | false_preds = y - y_pred 142 | num_false_pos = (false_preds < 0).sum() 143 | num_false_neg = (false_preds > 0).sum() 144 | return -(num_false_pos * 10 + num_false_neg * 500) 145 | 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/score.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import json 4 | import numpy 5 | import joblib 6 | import mlflow 7 | 8 | 9 | def init(): 10 | """ 11 | This function is called when the container is initialized/started, typically after create/update of the deployment. 12 | You can write the logic here to perform init operations like caching the model in memory 13 | """ 14 | global model 15 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 16 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 17 | model_path = os.path.join( 18 | os.getenv("AZUREML_MODEL_DIR"), "sklearn_model_sklearn_save/" 19 | ) 20 | # deserialize the model file back into a sklearn model 21 | model = mlflow.sklearn.load_model(model_path) 22 | logging.info("Init complete") 23 | 24 | 25 | def run(raw_data): 26 | """ 27 | This function is called for every invocation of the endpoint to perform the actual scoring/prediction. 28 | In the example we extract the data from the json input and call the scikit-learn model's predict() 29 | method and return the result back. 30 | raw_data: The json input received by the endpoint. 
needs to include a "data" field with a table that holds the 42 31 | features of each item to be classified 32 | these correspond to the following column names from the IDA tables: 33 | ['AA_000', 'AG_005', 'AH_000', 'AL_000', 'AM_0', 'AN_000', 'AO_000', 'AP_000', 'AQ_000', 34 | 'AZ_004', 'BA_002', 'BB_000', 'BC_000', 'BD_000', 'BE_000', 35 | 'BF_000', 'BG_000', 'BH_000', 'BI_000', 'BJ_000', 'BS_000', 'BT_000', 'BU_000', 'BV_000', 36 | 'BX_000', 'BY_000', 'BZ_000', 'CA_000', 'CB_000', 'CC_000', 'CI_000', 'CN_004', 'CQ_000', 37 | 'CS_001', 'DD_000', 'DE_000', 'DN_000', 'DS_000', 'DU_000', 'DV_000', 'EB_000', 'EE_005'] 38 | """ 39 | logging.error(json.loads(raw_data)) 40 | json_in = json.loads(raw_data) 41 | data = json_in["data"] 42 | 43 | data = numpy.array(data) 44 | response = model.predict(data) 45 | return {"result": str(response)} 46 | 47 | 48 | -------------------------------------------------------------------------------- /tutorials/machine-learning/python/README.md: -------------------------------------------------------------------------------- 1 | ## Python Tutorials 2 | This section contains tutorials with the Python Programming Language. We are going to provide examples for different frameworks, tasks and use cases. 3 | 4 | ### AzureML: 5 | [AzureML](https://azure.microsoft.com/de-de/products/machine-learning) is a Microsoft service for the Machine 6 | learning lifecycle in Azure. 7 | 8 | This tutorial will show you: 9 | 10 | * [A general introduction to the topic](AzureML/Introduction.ipynb), we recommend you start here 11 | * [How to connect AzureML to Exasol](AzureML/ConnectAzureMLtoExasol.ipynb) 12 | * [How to Train a model using data from Exasol](AzureML/TrainModelInAzureML.ipynb) 13 | * [How to Invoke the trained model from an Exasol UDF](AzureML/InvokeModelFromExasolDBwithUDF.ipynb) 14 | 15 | 16 | ### Frameworks: 17 | 18 | * [Scikit-learn](scikit-learn): 19 | 20 | [Scikit-learn](https://scikit-learn.org/stable/) is a free software machine learning library for the Python 21 | programming language. It features various classification, regression and clustering algorithms including support 22 | vector machines, random forests, gradient boosting, k-means and DBSCAN, and is designed to interoperate with the 23 | Python numerical and scientific libraries NumPy and SciPy. Its scalability of the training is typically limited. 24 | Out-of-core learning is not for all algorithms available, such that the usage of these algorithms is limited by the 25 | available main memory. Scikit-learn supports parallel execution through python multi-processing and linear algebra 26 | libraries. Distributed training and GPU acceleration is not out of the box available. You can find more details about 27 | scalability [here](https://scikit-learn.org/stable/modules/computing.html). 28 | 29 | * [AWS Sagemaker](sagemaker) 30 | 31 | [AWS Sagemaker](https://aws.amazon.com/de/sagemaker/) is an AWS cloud service for machine learning. In contains 32 | hosted [Jupyter notebooks](https://jupyter.org/) but also 33 | a [SDK for machine learning](https://sagemaker.readthedocs.io/en/stable/). 
34 | 35 | This tutorial will show you: 36 | 37 | * [How to connect from a SageMaker Notebook to Exasol](sagemaker/ConnectSagemakerToExasol.ipynb) 38 | * [How to load example dataset](sagemaker/LoadExampleDataIntoExasol.ipynb) 39 | * [How to train a Sagemaker model with data from Exasol](sagemaker/TrainSagemakerModelWithExasolData.ipynb) 40 | * [How to use a Sagemaker model from inside of Exasol](sagemaker/UseSagemakerModelFromExasol.ipynb) 41 | 42 | ### Prerequisites: 43 | 44 | For general prerequisites, please refer to [Prerequisites](../README.md). However, these tutorials typically need a specific flavor of the [Script Language Container](https://github.com/exasol/script-languages) which has the required dependencies installed. For these purposes, we provide the python3-ds-* flavors which already contain the dependencies for the frameworks used in these tutorials. Prepackaged releases for this flavor can be found on the [release page](https://github.com/exasol/script-languages/releases). 45 | -------------------------------------------------------------------------------- /tutorials/machine-learning/python/sagemaker/ConnectSagemakerToExasol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Connect to Exasol from Sagemaker\n", 8 | "\n", 9 | "This example shows you how to connect from AWS Sagemaker to an Exasol database.\n", 10 | "\n", 11 | "First we install pyexasol, as a driver." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "Requirement already satisfied: pyexasol in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (0.14.3)\n", 24 | "Requirement already satisfied: rsa in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyexasol) (4.5)\n", 25 | "Requirement already satisfied: websocket-client>=0.47.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyexasol) (0.57.0)\n", 26 | "Requirement already satisfied: pyasn1>=0.1.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from rsa->pyexasol) (0.4.8)\n", 27 | "Requirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from websocket-client>=0.47.0->pyexasol) (1.14.0)\n", 28 | "\u001B[33mWARNING: You are using pip version 20.0.2; however, version 20.3.1 is available.\n", 29 | "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001B[0m\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "!pip install pyexasol\n", 35 | "import pyexasol" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Now let's connect:" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import pyexasol\n", 52 | "EXASOL_HOST = \"\" # change\n", 53 | "EXASOL_PORT = \"8563\" # change if needed\n", 54 | "EXASOL_CONNECTION = \"{host}:{port}\".format(host=EXASOL_HOST, port=EXASOL_PORT)\n", 55 | "EXASOL_USER = \"sys\" # change if needed\n", 56 | "EXASOL_PASSWORD = \"\" # change\n", 57 | "EXASOL_SCHEMA = \"IDA\" # change if needed\n", 58 | "exasol = pyexasol.connect(dsn=EXASOL_CONNECTION, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True)" 59 | ] 60 | }, 61 | 
{ 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "... and run an example query:" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/html": [ 76 | "
\n", 77 | "\n", 90 | "\n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | "
USER_NAMECREATEDUSER_CONSUMER_GROUPUSER_COMMENT
0SYSNaNSYS_CONSUMER_GROUPSYS is the system user and possesses universal...
\n", 110 | "
" 111 | ], 112 | "text/plain": [ 113 | " USER_NAME CREATED USER_CONSUMER_GROUP \\\n", 114 | "0 SYS NaN SYS_CONSUMER_GROUP \n", 115 | "\n", 116 | " USER_COMMENT \n", 117 | "0 SYS is the system user and possesses universal... " 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "users = exasol.export_to_pandas(\"SELECT * FROM SYS.EXA_ALL_USERS\")\n", 127 | "users" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "conda_python3", 134 | "language": "python", 135 | "name": "conda_python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.6.10" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 4 152 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/sagemaker/LoadExampleDataIntoExasol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load Example Data Into the Exasol database\n", 8 | "\n", 9 | "In this Notebook we will load the \"Air pressure system failures in Scania trucks\" dataset into the exasol database using Python and Pyexasol. This Scania trucks dataset is a predictive maintenance scenario:\n", 10 | "\n", 11 | "> The dataset consists of data collected from heavy Scania trucks in everyday usage. The system in focus is the Air Pressure system (APS) which generates pressurized air that is utilized in various functions in a truck, such as braking and gear changes. The datasets' positive class consists of component failures for a specific component of the APS system. The negative class consists of trucks with failures for components not related to the APS. The data consists of a subset of all available data, selected by experts.\n", 12 | "\n", 13 | "You can find further information [here](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge)." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "source": [ 19 | "For this we need:\n", 20 | "\n", 21 | " - Connection information of the running Exasol database we want to load the data into.\n", 22 | " - The url of the dataset we want to load (and knowledge of its structure).\n", 23 | "\n", 24 | "\n", 25 | "First we enter the connection details for the Exasol database we want to load the dataset into.\n", 26 | "Then we install pyexasol and import some dependencies." 
27 | ], 28 | "metadata": { 29 | "collapsed": false, 30 | "pycharm": { 31 | "name": "#%% md\n" 32 | } 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "EXASOL_HOST = \"\" # change, in case of Exasol Saas this can be a \"connection string\"\n", 42 | "EXASOL_PORT = \"8563\" # change if needed\n", 43 | "EXASOL_USER = \"sys\" # change if needed\n", 44 | "EXASOL_PASSWORD = \"\" # change, in case of Exasol Saas this can be a personal access token\n", 45 | "EXASOL_SCHEMA = \"IDA\"" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "!pip install pyexasol\n", 55 | "\n", 56 | "import pyexasol\n", 57 | "from io import BytesIO\n", 58 | "from urllib.request import urlopen\n", 59 | "import pandas as pd\n", 60 | "from zipfile import ZipFile" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "Next we can use the pyexasol connection to connect to our Exasol DB." 67 | ], 68 | "metadata": { 69 | "collapsed": false, 70 | "pycharm": { 71 | "name": "#%% md\n" 72 | } 73 | } 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "EXASOL_CONNECTION = \"{host}:{port}\".format(host=EXASOL_HOST, port=EXASOL_PORT)\n", 82 | "exasol = pyexasol.connect(dsn=EXASOL_CONNECTION, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Download Example Data\n", 90 | "\n", 91 | "Now we download the dataset and write it to a zip-file." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "DATA_URL = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00414/to_uci.zip\"\n", 101 | "\n", 102 | "resp = urlopen(DATA_URL)\n", 103 | "with open('to_uci.zip', 'wb') as f: \n", 104 | " f.write(resp.read())\n", 105 | " \n", 106 | "print(\"data downloaded\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "And then we read the contents of the downloaded zip-file into \"train_set\" and \"test_set\" variables respectively, using pandas to load the train- and test-tables from the csv files." 113 | ], 114 | "metadata": { 115 | "collapsed": false, 116 | "pycharm": { 117 | "name": "#%% md\n" 118 | } 119 | } 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "TRAINING_FILE = \"to_uci/aps_failure_training_set.csv\"\n", 128 | "TEST_FILE = \"to_uci/aps_failure_test_set.csv\"\n", 129 | "\n", 130 | "# Data is preceded with a 20-line header (copyright & license)\n", 131 | "NUM_SKIP_ROWS = 20\n", 132 | "NA_VALUE = \"na\"\n", 133 | "\n", 134 | "with ZipFile('to_uci.zip') as z:\n", 135 | " with z.open(TRAINING_FILE, \"r\") as f:\n", 136 | " train_set = pd.read_csv(f, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)\n", 137 | " with z.open(TEST_FILE, \"r\") as f:\n", 138 | " test_set = pd.read_csv(f, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## Import Example Data\n", 146 | "\n", 147 | "In the last step we want to load the dataset into the exasol database. First we need to create a new schema \"EXASOL_SCHEMA\" using the pyexasol connection." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "exasol.execute(query=\"CREATE SCHEMA IF NOT EXISTS {schema!i}\", query_params={\"schema\": EXASOL_SCHEMA})" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "source": [ 162 | "Then we need to create the \"EXASOL_SCHEMA.TRAIN\" and \"EXASOL_SCHEMA.TEST\" tables in the Exasol database with column names and types that match the tables from the data set. We do this by extracting the column names from the pandas table we created in the previous step. The column types for the Scania Trucks data set are VARCHAR(3) for the first column (\"class\"), and DECIMAL(18,2) for all other columns. We use the pyexasol connection we created previously to create these tables." 163 | ], 164 | "metadata": { 165 | "collapsed": false, 166 | "pycharm": { 167 | "name": "#%% md\n" 168 | } 169 | } 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# Define column names and types\n", 178 | "column_names = list(train_set.columns)\n", 179 | "column_types = [\"VARCHAR(3)\"] + [\"DECIMAL(18,2)\"] * (len(column_names) - 1)\n", 180 | "column_desc = [\" \".join(t) for t in zip(column_names, column_types)]\n", 181 | "\n", 182 | "params = {\"schema\": EXASOL_SCHEMA, \"column_names\": column_names, \"column_desc\": column_desc}\n", 183 | "\n", 184 | "# Create tables for data\n", 185 | "exasol.execute(query=\"CREATE OR REPLACE TABLE {schema!i}.TRAIN(\" + \", \".join(column_desc) + \")\", query_params=params)\n", 186 | "exasol.execute(query=\"CREATE OR REPLACE TABLE {schema!i}.TEST LIKE {schema!i}.TRAIN\", query_params=params)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "source": [ 192 | "Finally, we can use pyexasol's \"import_from_pandas\" functionality to import our pandas tables into our newly created Exasol tables using the pyexasol connection." 193 | ], 194 | "metadata": { 195 | "collapsed": false, 196 | "pycharm": { 197 | "name": "#%% md\n" 198 | } 199 | } 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "outputs": [], 205 | "source": [ 206 | "# Import data into Exasol\n", 207 | "exasol.import_from_pandas(train_set, (EXASOL_SCHEMA, \"TRAIN\"))\n", 208 | "print(f\"Imported {exasol.last_statement().rowcount()} rows into TRAIN.\")\n", 209 | "exasol.import_from_pandas(test_set, (EXASOL_SCHEMA, \"TEST\"))\n", 210 | "print(f\"Imported {exasol.last_statement().rowcount()} rows into TEST.\")" 211 | ], 212 | "metadata": { 213 | "collapsed": false, 214 | "pycharm": { 215 | "name": "#%%\n" 216 | } 217 | } 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "source": [ 222 | "Now te Scania Trucks dataset should be available in the Exasol database in the Schema \"EXASOL_SCHEMA\" sorted into the \"TRAIN\" and the \"TEST\" tables." 
223 | ], 224 | "metadata": { 225 | "collapsed": false, 226 | "pycharm": { 227 | "name": "#%% md\n" 228 | } 229 | } 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "conda_python3", 235 | "language": "python", 236 | "name": "conda_python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.6.10" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 4 253 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/sagemaker/UseSagemakerModelFromExasol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true, 7 | "pycharm": { 8 | "name": "#%% md\n" 9 | } 10 | }, 11 | "source": [ 12 | "# Use an AWS Sagemaker model from within Exasol\n", 13 | "\n", 14 | "In this notebook we will use an AWS Sagemaker model for predicitions from within Exasol queries.\n", 15 | "\n", 16 | "For that our exasol database needs permissions to use the Sagemaker inference Notebook.\n", 17 | "For that you can:\n", 18 | "\n", 19 | "* Provide credentials\n", 20 | "* Grant the permissions to the Role of the databases EC2 role.\n", 21 | "\n", 22 | "In this guide we will use the second approach.\n", 23 | "\n", 24 | "Grant the following permissions to your EC2 instance role:\n", 25 | "\n", 26 | "* `sts:AssumeRole` with a resource filter for the EC2 role itself.\n", 27 | "* `sagemaker:InvokeEndpoint` with a resource filter on your Sagemaker endpoint.\n", 28 | "\n", 29 | "In case you want to take the first approach, you can modify the UDF code below to use credentials." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "source": [ 35 | "## Parameters" 36 | ], 37 | "metadata": { 38 | "collapsed": false 39 | } 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "outputs": [], 45 | "source": [ 46 | "EXASOL_HOST = \"\" # change\n", 47 | "EXASOL_PORT = \"8563\" # change if needed\n", 48 | "EXASOL_CONNECTION = \"{host}:{port}\".format(host=EXASOL_HOST, port=EXASOL_PORT)\n", 49 | "EXASOL_USER = \"sys\" # change if needed\n", 50 | "EXASOL_PASSWORD = \"\" # change\n", 51 | "EXASOL_SCHEMA = \"IDA\" # change if needed\n", 52 | "EXASOL_CLUSTER_ROLE = \"\" #change\n", 53 | "EXASOL_REGION = \"eu-central-1\" #change if needed\n", 54 | "ENDPOINT_NAME = \"\" #change" 55 | ], 56 | "metadata": { 57 | "collapsed": false, 58 | "pycharm": { 59 | "name": "#%%\n" 60 | } 61 | } 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "## Setup" 67 | ], 68 | "metadata": { 69 | "collapsed": false, 70 | "pycharm": { 71 | "name": "#%% md\n" 72 | } 73 | } 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "outputs": [], 79 | "source": [ 80 | "!pip install pyexasol\n", 81 | "\n", 82 | "import pyexasol\n", 83 | "import pandas as pd\n", 84 | "exasol = pyexasol.connect(dsn=EXASOL_CONNECTION, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True)" 85 | ], 86 | "metadata": { 87 | "collapsed": false, 88 | "pycharm": { 89 | "name": "#%%\n" 90 | } 91 | } 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "source": [ 96 | "## Install UDF\n", 97 | "\n", 98 | "In order to use the Sagemaker inference Endpoint from within the Exasol database, we will create a Python UDF that does API calls to the endpoint with the data from the query." 99 | ], 100 | "metadata": { 101 | "collapsed": false 102 | } 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "outputs": [], 108 | "source": [ 109 | "# create schema\n", 110 | "exasol.execute(\"CREATE SCHEMA IF NOT EXISTS DATA_SCIENCE\")\n", 111 | "\n", 112 | "#create UDF\n", 113 | "exasol.execute(\"\"\"\n", 114 | "CREATE OR REPLACE PYTHON3 SET SCRIPT DATA_SCIENCE.PREDICT(...) 
EMITS(id DECIMAL(20,0), \"result\" BOOLEAN) AS\n", 115 | "def run(ctx):\n", 116 | " import boto3\n", 117 | " import pandas as pd\n", 118 | " import os\n", 119 | " f = open(\"/tmp/.config\", \"w\")\n", 120 | " f.write(\n", 121 | " \"[default]\\\\nregion = {region!r}\\\\nrole_arn = {role!r}\\\\ncredential_source = Ec2InstanceMetadata\")\n", 122 | " f.close()\n", 123 | " os.environ['AWS_CONFIG_FILE'] = '/tmp/.config'\n", 124 | " while True:\n", 125 | " df = ctx.get_dataframe(1000)\n", 126 | " if df is None:\n", 127 | " break\n", 128 | " id_column = df[\"0\"]\n", 129 | " df = df.drop(\"0\", 1)\n", 130 | " client = boto3.client('sagemaker-runtime')\n", 131 | " endpoint_name = \"{endpoint_name!r}\"\n", 132 | " response = client.invoke_endpoint(\n", 133 | " EndpointName=endpoint_name,\n", 134 | " ContentType='text/csv',\n", 135 | " Body=df.to_csv(header=False, index=False)\n", 136 | " )\n", 137 | " result_list = response['Body'].read().decode('ascii').split(\",\")\n", 138 | " rounded_result = map(lambda x: bool(round(float(x))),result_list)\n", 139 | " result = pd.DataFrame(list(rounded_result))\n", 140 | " ctx.emit(pd.concat([id_column,result],axis=1))\n", 141 | "/\n", 142 | "\"\"\", {\n", 143 | " \"region\": EXASOL_REGION,\n", 144 | " \"role\": EXASOL_CLUSTER_ROLE,\n", 145 | " \"endpoint_name\": ENDPOINT_NAME\n", 146 | "})" 147 | ], 148 | "metadata": { 149 | "collapsed": false, 150 | "pycharm": { 151 | "name": "#%%\n" 152 | } 153 | } 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "source": [ 158 | "## Run Query\n", 159 | "\n", 160 | "So let's run predictions on the test data table in Exasol." 161 | ], 162 | "metadata": { 163 | "collapsed": false 164 | } 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "outputs": [], 170 | "source": [ 171 | "all_columns = exasol.export_to_pandas(\"SELECT * FROM \" + EXASOL_SCHEMA + \".TEST LIMIT 1;\")\n", 172 | "column_names = list(all_columns)\n", 173 | "column_names.remove(\"CLASS\")\n", 174 | "result = exasol.export_to_pandas(\"\"\"SELECT CLASS = 'pos' as \"expected\", \"result\" FROM (\n", 175 | " SELECT DATA_SCIENCE.PREDICT(ROWID, {columns_without_class!q}) FROM IDA.TEST t) r\n", 176 | " JOIN IDA.TEST o ON r.ID = o.ROWID\"\"\", {\"columns_without_class\": column_names})\n", 177 | "pd.crosstab(index=result['expected'], columns=result[\"result\"], rownames=['actuals'], colnames=['predictions'])" 178 | ], 179 | "metadata": { 180 | "collapsed": false, 181 | "pycharm": { 182 | "name": "#%%\n" 183 | } 184 | } 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 2 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython2", 203 | "version": "2.7.6" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 0 208 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/scikit-learn/README.md: -------------------------------------------------------------------------------- 1 | ## Scikit-learn Python Tutorials 2 | 3 | This section contains tutorials with Scikit-learn in Python. We are going to provide examples for different tasks and use cases. 
4 | 
5 | **Currently, this repository is under development and we will add more and more tutorials in the future.**
6 | 
7 | ### Overview
8 | 
9 | * [Classification](classification.ipynb)
10 | 
-------------------------------------------------------------------------------- /tutorials/machine-learning/sagemaker-extension/images/sme_deployment.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/sagemaker-extension/images/sme_deployment.png
-------------------------------------------------------------------------------- /tutorials/machine-learning/sagemaker-extension/images/sme_overview.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/sagemaker-extension/images/sme_overview.png
-------------------------------------------------------------------------------- /tutorials/machine-learning/sagemaker-extension/images/sme_training.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/sagemaker-extension/images/sme_training.png
-------------------------------------------------------------------------------- /tutorials/machine-learning/sagemaker-extension/tutorial.md: --------------------------------------------------------------------------------
1 | # SageMaker Extension Tutorial
2 | 
3 | ## 1. Introduction
4 | This tutorial walks you through the setup of the Exasol SageMaker-Extension
5 | project and presents a use case of how this extension can be used in Exasol.
6 | 
7 | The Exasol Sagemaker Extension enables you to develop an end-to-end machine
8 | learning project on data stored in Exasol using the AWS SageMaker Autopilot service.
9 | 
10 | The use case handles a publicly available real-world dataset provided by a heavy
11 | truck manufacturer (see [Use Case](#use-case)). With the
12 | provided extension, a machine learning model is developed which allows
13 | predicting whether the truck failures are related to a particular component.
14 | 
15 | ### 1.1 AWS Sagemaker Autopilot Service
16 | AWS SageMaker is an AWS public cloud service in which users can build and deploy
17 | machine learning models. SageMaker provides a number of levels of abstraction to
18 | users while developing machine learning models. At one of its highest levels
19 | of abstraction, SageMaker enables users to use an Automated machine learning
20 | (AutoML) service, called Autopilot in AWS, that automates the process of
21 | applying machine learning to real-world problems.
22 | 
23 | Autopilot covers a complete pipeline of developing an end-to-end machine learning
24 | project, from raw data to a deployable model. It is able to automatically build,
25 | train and tune a number of machine learning models by inspecting your data set.
26 | In this way, the following tasks, which are repeatedly applied by ML experts
27 | in machine learning projects, are automated (a minimal sketch of the underlying API call is shown after the list):
28 | - Pre-process and clean the data.
29 | - Perform feature engineering and select the most appropriate features.
30 | - Determine the most appropriate ML algorithm.
31 | - Tune and optimize the hyper-parameters of the model.
32 | - Post-process machine learning models.
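To make this concrete, the sketch below shows roughly what such an automated job submission looks like at the API level. It is a hypothetical, simplified `boto3` example and not part of the tutorial's code: the job name, S3 locations and IAM role are placeholders, and the Exasol extension described in the next section builds a comparable request for you from the parameters of its training script.

```python
import boto3

# Hypothetical sketch of an Autopilot (AutoML) job submission; all names below
# are placeholders. The SageMaker-Extension issues a similar request internally.
sagemaker = boto3.client("sagemaker", region_name="eu-central-1")

sagemaker.create_auto_ml_job(
    AutoMLJobName="example-autopilot-job",              # placeholder job name
    InputDataConfig=[{
        "DataSource": {"S3DataSource": {
            "S3DataType": "S3Prefix",
            "S3Uri": "s3://example-bucket/train/",       # placeholder S3 input
        }},
        "TargetAttributeName": "CLASS",                  # column to predict
    }],
    OutputDataConfig={"S3OutputPath": "s3://example-bucket/output/"},
    AutoMLJobConfig={"CompletionCriteria": {"MaxCandidates": 2}},
    RoleArn="arn:aws:iam::123456789012:role/ExampleSageMakerRole",
)

# Autopilot then pre-processes the data, engineers features, picks an algorithm
# and tunes its hyper-parameters; progress can be polled like this:
job = sagemaker.describe_auto_ml_job(AutoMLJobName="example-autopilot-job")
print(job["AutoMLJobStatus"], job["AutoMLJobSecondaryStatus"])
```

The point of the extension is that you never have to write this call yourself; the sketch only illustrates what Autopilot automates.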
33 | 
34 | The Exasol Sagemaker Extension leverages these advantages of AWS Autopilot and enables
35 | users to easily create effective and efficient machine learning models
36 | without expert knowledge.
37 | 
38 | ### 1.2 Exasol SageMaker Extension
39 | 
40 | The Exasol Sagemaker Extension provides a Python library together with Exasol
41 | Scripts and UDFs that train Machine Learning Models on data stored in Exasol
42 | using the AWS SageMaker Autopilot service.
43 | 
44 | The extension basically exports a given Exasol table into AWS S3, and then
45 | triggers Machine Learning training using the AWS Autopilot service with the
46 | specified parameters. In addition, the training status can be polled using
47 | the auxiliary scripts provided within the scope of the project. In order to
48 | perform prediction on a trained Autopilot model, one of the methods is to
49 | deploy the model to a real-time AWS endpoint. This extension provides Lua
50 | scripts for creating/deleting the real-time endpoint and creates a model-specific
51 | UDF script for making real-time predictions. The following figure
52 | gives an overview of this solution.
53 | 
54 | ![SME Overview](./images/sme_overview.png)
55 | 
56 | ## 2. Setup the Extension
57 | 
58 | ### 2.1 Installation
59 | 
60 | In order to use the Exasol SageMaker Extension, it is necessary to install the Python package of the extension,
61 | upload the given SageMaker-Extension Container into
62 | BucketFS and then activate the uploaded container in Exasol. The pre-packaged
63 | releases are available in the [Releases](https://github.com/exasol/sagemaker-extension/releases) section
64 | of the GitHub repository.
65 | 
66 | Before starting the installation, let's define the variables required for the
67 | installation (Please note that you need to change the variables below to use your
68 | own Exasol Database):
69 | ```python
70 | DATABASE_HOST="127.0.0.1"
71 | DATABASE_PORT=9563
72 | DATABASE_USER="sys"
73 | DATABASE_PASSWORD="exasol"
74 | DATABASE_SCHEMA="IDA"
75 | BUCKETFS_PORT=6666
76 | BUCKETFS_USER="w"
77 | BUCKETFS_PASSWORD="write"
78 | BUCKETFS_NAME="bfsdefault"
79 | BUCKET_NAME="default"
80 | PATH_IN_BUCKET="container"
81 | CONTAINER_NAME="exasol_sagemaker_extension_container-release"
82 | CONTAINER_FILE="exasol_sagemaker_extension_container-release.tar.gz"
83 | ```
84 | 
85 | - The sagemaker-extension Python package provides a command line tool to
86 | deploy the Lua and UDF scripts to the database. It is installed as follows
87 | (Please check [the latest release](https://github.com/exasol/sagemaker-extension/releases/latest)):
88 | ```sh
89 | pip install https://github.com/exasol/sagemaker-extension/releases/download/<version>/exasol_sagemaker_extension-<version>-py3-none-any.whl
90 | ```
91 | 
92 | - The required libraries and dependencies of the Exasol SageMaker Extension are
93 | distributed into Exasol by uploading the pre-built Exasol SageMaker-Extension Language
94 | Container to the BucketFS. You can upload it with any HTTP(S) client that can send
95 | files via HTTP PUT requests. For more details please check
96 | [Access Files in BucketFS](https://docs.exasol.com/database_concepts/bucketfs/file_access.htm).
97 | The following example uploads the pre-built SageMaker-Extension Container to BucketFS with the curl command, a http(s) client: 98 | ```sh 99 | curl -vX PUT -T \ 100 | "" 101 | "http://w:@$bucketfs_host://" 102 | ``` 103 | 104 | - You need to activate the uploaded container for your session or the whole system through 105 | adjusting parameter `SCRIPT_LANGUAGES`. Please keep in mind, that 106 | the name of the language alias is assumed to be `PYTHON_SME` in the 107 | SageMaker-Extension. For more details, please check 108 | [Adding New Packages to Existing Script Languages](https://docs.exasol.com/database_concepts/udf_scripts/adding_new_packages_script_languages.htm). 109 | The following example query activates the container session-wide: 110 | ```sh 111 | ALTER SESSION SET SCRIPT_LANGUAGES=\ 112 | 'PYTHON_SME=localzmq+protobuf://////?\ 113 | lang=python#buckets////\ 114 | exaudf/exaudfclient_py3 PYTHON3=builtin_python3 PYTHON=builtin_python R=builtin_r JAVA=builtin_java' 115 | 116 | ``` 117 | 118 | ### 2.2 Deployment 119 | 120 | The installed SageMaker-extension python package provides a command-line 121 | interface (CLI), enabling you to deploy all necessary Lua and UDF scripts into 122 | the specified `DATABASE_SCHEMA` of Exasol Database. The command line is run 123 | as follows: 124 | 125 | ```sh 126 | python -m exasol_sagemaker_extension.deployment.deploy_cli \ 127 | --host \ 128 | --port \ 129 | --user \ 130 | --pass \ 131 | --schema 132 | ``` 133 | 134 | After running this deployment command, you should be able to find all the 135 | required Lua and UDF scripts in the specified schema. To check this, you can 136 | run the following query: 137 | ```sql 138 | SELECT 139 | SCRIPT_NAME , 140 | SCRIPT_TYPE 141 | FROM 142 | SYS.EXA_ALL_SCRIPTS 143 | WHERE 144 | SCRIPT_SCHEMA='IDA'; 145 | ``` 146 | 147 | |SCRIPT_NAME |SCRIPT_TYPE| 148 | |---------------------------------------|-----------| 149 | |SME_TRAIN_WITH_SAGEMAKER_AUTOPILOT |SCRIPTING | 150 | |SME_AUTOPILOT_TRAINING_UDF |UDF | 151 | |SME_POLL_SAGEMAKER_AUTOPILOT_JOB_STATUS|SCRIPTING | 152 | |SME_AUTOPILOT_JOB_STATUS_POLLING_UDF |UDF | 153 | |SME_DEPLOY_SAGEMAKER_AUTOPILOT_ENDPOINT|SCRIPTING | 154 | |SME_AUTOPILOT_ENDPOINT_DEPLOYMENT_UDF |UDF | 155 | |SME_DELETE_SAGEMAKER_AUTOPILOT_ENDPOINT|SCRIPTING | 156 | |SME_AUTOPILOT_ENDPOINT_DELETION_UDF |UDF | 157 | 158 | ### 2.3 Create Connection to AWS 159 | 160 | The Exasol SageMaker Extension needs to connect to AWS SageMaker and your AWS S3 bucket. 161 | For that, it needs AWS credentials that has AWS Sagemaker Execution permissions. 162 | The required credentials are AWS Access Key (Please check how to 163 | [create an access key](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html#Using_CreateAccessKey)). 164 | 165 | 166 | In order for the SageMaker-Extension to use the Access Key, you need to create 167 | an Exasol `CONNECTION` object which securely stores your keys. For more information, 168 | please check [Create Connection in Exasol](https://docs.exasol.com/sql/create_connection.htm?Highlight=connection): 169 | 170 | 171 | Before creating the connection object, let's define the variables for the 172 | AWS connection (Please note that you need to use your own credentials for 173 | below variables.) 
174 | ```python
175 | AWS_BUCKET="ida_dataset_bucket"
176 | AWS_REGION="eu-central-1"
177 | AWS_KEY_ID="*****"
178 | AWS_ACCESS_KEY="*****"
179 | AWS_CONNECTION_NAME="AWS_CONNECTION"
180 | ```
181 | 
182 | The Exasol `CONNECTION` object is created as follows:
183 | ```sh
184 | CREATE OR REPLACE CONNECTION <AWS_CONNECTION_NAME>
185 | TO 'https://<AWS_BUCKET>.s3.<AWS_REGION>.amazonaws.com/'
186 | USER '<AWS_KEY_ID>'
187 | IDENTIFIED BY '<AWS_ACCESS_KEY>'
188 | ```
189 | 
190 | 
191 | ## 3. Use Case
192 | In this use case, the publicly available [Air pressure system failures in Scania trucks](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge)
193 | dataset is used. The dataset is provided by Scania CV AB as a challenge
194 | dataset in the Industrial Challenge at the [15th International Symposium on
195 | Intelligent Data Analysis (IDA)](https://ida2016.blogs.dsv.su.se/) in 2016.
196 | 
197 | The dataset consists of data collected from heavy Scania trucks in everyday usage. The dataset includes two different classes according to the Air Pressure System (APS): (1) The positive class
198 | consists of component failures for a specific component of the APS. (2) The negative class consists of trucks with failures for components not related to the APS.
199 | 
200 | Using the SageMaker-Extension, we develop a predictive machine learning model that classifies failures according to whether or not they are related to the APS.
201 | 
202 | ### 3.1 Load the Dataset
203 | The following Python script downloads the train and test datasets as CSV files
204 | to the local file system. Then it creates the `TRAIN` and `TEST` tables in the
205 | specified `DATABASE_SCHEMA` of Exasol and imports the downloaded CSV files
206 | into these tables respectively.
207 | 
208 | ```python
209 | import pyexasol
210 | import pandas as pd
211 | from zipfile import ZipFile
212 | from urllib.request import urlopen
213 | 
214 | DATABASE_CONNECTION = "{host}:{port}".format(host=DATABASE_HOST, port=DATABASE_PORT)
215 | exasol = pyexasol.connect(
216 |     dsn=DATABASE_CONNECTION,
217 |     user=DATABASE_USER,
218 |     password=DATABASE_PASSWORD,
219 |     compression=True)
220 | 
221 | DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00414/to_uci.zip"
222 | TRAINING_FILE = "to_uci/aps_failure_training_set.csv"
223 | TEST_FILE = "to_uci/aps_failure_test_set.csv"
224 | 
225 | # Data is preceded with a 20-line header (copyright & license)
226 | NUM_SKIP_ROWS = 20
227 | NA_VALUE = "na"
228 | 
229 | # Download datasets as csv files
230 | resp = urlopen(DATA_URL)
231 | with open('to_uci.zip', 'wb') as f:
232 |     f.write(resp.read())
233 | with ZipFile('to_uci.zip') as z:
234 |     with z.open(TRAINING_FILE, "r") as f:
235 |         train_set = pd.read_csv(f, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)
236 |     with z.open(TEST_FILE, "r") as f:
237 |         test_set = pd.read_csv(f, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)
238 | 
239 | # Create the schema if not exists
240 | exasol.execute(
241 |     query="CREATE SCHEMA IF NOT EXISTS {schema!i}",
242 |     query_params={"schema": DATABASE_SCHEMA})
243 | 
244 | # Define column names and types
245 | column_names = list(train_set.columns)
246 | column_types = ["VARCHAR(3)"] + ["DECIMAL(18,2)"] * (len(column_names) - 1)
247 | column_desc = [" ".join(t) for t in zip(column_names, column_types)]
248 | params = {
249 |     "schema": DATABASE_SCHEMA,
250 |     "column_names": column_names,
251 |     "column_desc": column_desc}
252 | 
253 | # Create tables for data
254 | exasol.execute(
255 |     query="CREATE OR REPLACE TABLE {schema!i}.TRAIN("
256 |           + ", ".join(column_desc) + ")",
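    # Note: {schema!i} is pyexasol's identifier placeholder; the value passed
    # via query_params (DATABASE_SCHEMA) is substituted and quoted as an SQL identifier.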
257 |     query_params=params)
258 | exasol.execute(
259 |     query="CREATE OR REPLACE TABLE {schema!i}.TEST "
260 |           "LIKE {schema!i}.TRAIN",
261 |     query_params=params)
262 | 
263 | # Import data into Exasol
264 | exasol.import_from_pandas(train_set, (DATABASE_SCHEMA, "TRAIN"))
265 | print(f"Imported {exasol.last_statement().rowcount()} rows into TRAIN.")
266 | exasol.import_from_pandas(test_set, (DATABASE_SCHEMA, "TEST"))
267 | print(f"Imported {exasol.last_statement().rowcount()} rows into TEST.")
268 | ```
269 | 
270 | ### 3.2 Train with SageMaker Autopilot
271 | 
272 | When you execute the SQL command to train a model, the Exasol SageMaker-Extension
273 | exports the specified table from the Exasol Database to your specified
274 | AWS S3 bucket. This export operation is highly efficient, as it is performed
275 | in parallel. After that, the execution script calls Amazon SageMaker Autopilot,
276 | which automatically performs end-to-end machine learning development,
277 | to build a model. The following figure illustrates this solution.
278 | 
279 | ![SME Training](./images/sme_training.png)
280 | 
281 | First, let's define the variables required to execute the training SQL command:
282 | ```python
283 | JOB_NAME="APSClassifier"
284 | IAM_SAGEMAKER_ROLE="*****"
285 | S3_BUCKET_URI="s3://"
286 | S3_OUTPUT_PATH="ida_dataset_path"
287 | INPUT_TABLE_NAME="TARGET"
288 | TARGET_COLUMN="CLASS"
289 | MAX_CANDIDATES=2
290 | ```
291 | 
292 | The following command exports the `TRAIN` table in the `DATABASE_SCHEMA` using
293 | the credentials stored in the `AWS_CONNECTION` into the AWS `S3_OUTPUT_PATH` and
294 | tells Autopilot to start a job with the given `JOB_NAME`. Please note that
295 | `JOB_NAME` must be unique to the corresponding account, and it is
296 | case-insensitive. In addition, the maximum number of candidate models is
297 | limited to 2 by an optional parameter called `max_candidates`. On the other hand,
298 | the optional parameters that are not set in this sample SQL command,
299 | such as `problem_type` and `objective`, will be inferred by Autopilot.
300 | For more information please check the [User Guide](https://github.com/exasol/sagemaker-extension/blob/main/doc/user_guide/user_guide.md).
301 | 
302 | 
303 | ```sh
304 | EXECUTE SCRIPT IDA."SME_TRAIN_WITH_SAGEMAKER_AUTOPILOT"(
305 | '{
306 |     "job_name" : "<JOB_NAME>",
307 |     "aws_credentials_connection_name" : "<AWS_CONNECTION_NAME>",
308 |     "aws_region" : "<AWS_REGION>",
309 |     "iam_sagemaker_role" : "<IAM_SAGEMAKER_ROLE>",
310 |     "s3_bucket_uri" : "<S3_BUCKET_URI>",
311 |     "s3_output_path" : "<S3_OUTPUT_PATH>",
312 |     "input_schema_name" : "<DATABASE_SCHEMA>",
313 |     "input_table_or_view_name" : "<INPUT_TABLE_NAME>",
314 |     "target_attribute_name" : "<TARGET_COLUMN>",
315 |     "max_candidates" : <MAX_CANDIDATES>
316 | }');
317 | ```
318 | 
319 | This SQL command does not wait for the job to finish; it completes its execution
320 | right after calling Autopilot. The metadata information of the created Autopilot
321 | job is saved into the `SME_METADATA_AUTOPILOT_JOBS` table. You can query this
322 | table as follows:
323 | ```sql
324 | SELECT
325 |     *
326 | FROM
327 |     IDA."SME_METADATA_AUTOPILOT_JOBS";
328 | ```
329 | 
330 | |DATETIME |JOB_NAME |AWS_CREDENTIALS_CONNECTION_NAME|S3_BUCKET_URI |S3_OUTPUT_PATH |TARGET_ATTRIBUTE_NAME|PROBLEM_TYPE|OBJECTIVE| ... |
331 | |---------------------------|-------------|-------------------------------|-----------------------|----------------|---------------------|------------|---------| --- |
332 | |2021-11-24-13.35.11.569000|APSClassifier|AWS_CONNECTION |s3://ida-dataset-bucket|ida_dataset_path|CLASS | | | ... 
| 333 | 334 | 335 | ### 3.3 Poll Training Status 336 | As mentioned in the above section, the training SQL script runs asynchronously. 337 | Therefore, you don't have to wait the training to finish. However, you can poll 338 | the status of the Autopilot training job with the polling SQL script provided 339 | by Exasol SageMaker-Extension. This SQL command takes the name of the job 340 | whose status will be queried, namely `JOB_NAME`, as input and returns the 341 | current status of the job. For more information please check the 342 | [User Guide](https://github.com/exasol/sagemaker-extension/blob/main/doc/user_guide/user_guide.md). 343 | You can execute the polling SQL command as follows: 344 | 345 | ```sh 346 | EXECUTE SCRIPT IDA."SME_POLL_SAGEMAKER_AUTOPILOT_JOB_STATUS"( 347 | '', 348 | '', 349 | '' 350 | ); 351 | ``` 352 | 353 | You can below see the sample results of this polling SQL command executed 354 | several times while the "APSClassifier" training job is running: 355 | 356 | |JOB_STATUS|JOB_SECONDARY_STATUS| 357 | |----------|--------------------| 358 | |InProgress|AnalyzingData | 359 | 360 | 361 | |JOB_STATUS|JOB_SECONDARY_STATUS| 362 | |----------|--------------------| 363 | |InProgress|FeatureEngineering | 364 | 365 | 366 | |JOB_STATUS|JOB_SECONDARY_STATUS| 367 | |----------|--------------------| 368 | |Completed |Completed | 369 | 370 | ### 3.4 Deploy Sagemaker Endpoint 371 | In order to perform prediction on a trained Autopilot model, one of the methods 372 | is to deploy the model to the real-time AWS SageMaker endpoint. You can use the 373 | deployment SQL command to create a real-time endpoint and deploy the best 374 | candidate model of the trained Autopilot jobs on it. The deployment SQL command 375 | additionally generates the prediction UDF script which is specific to the 376 | deployed endpoint so that you are able to perform real-time predictions. 377 | The following figure indicates this solution. 378 | 379 | ![SME Training](./images/sme_deployment.png) 380 | 381 | First, let's define the variables required to execute the deployment SQL command: 382 | ```python 383 | ENDPOINT_NAME="APSPredictor" 384 | INSTANCE_TYPE="ml.m5.large" 385 | INSTANCE_COUNT=1 386 | DATABASE_PRED_SCHEMA="IDAPrediction" 387 | ``` 388 | 389 | The following deployment SQL command creates a SageMaker endpoint called 390 | `EDNPOINT_NAME` and deploys the best model of `JOB_NAME` on it. Please keep 391 | in mind, that the `ENDPOINT_NAME` is also the name of the UDF script generated 392 | for the prediction. Furthermore, you can specify a different schema 393 | (`DATABASE_PRED_SCHEMA`) for the prediction UDF script to be installed 394 | than the one in which the scripts of the Exasol SageMaker-Extension project 395 | are deployed. For more information please check the 396 | [User Guide](https://github.com/exasol/sagemaker-extension/blob/main/doc/user_guide/user_guide.md). 
397 | You can execute the deployment script with the defined variables as follows: 398 | 399 | ```sh 400 | EXECUTE SCRIPT IDA."SME_DEPLOY_SAGEMAKER_AUTOPILOT_ENDPOINT"( 401 | '', 402 | '', 403 | '', 404 | '', 405 | , 406 | '', 407 | '' 408 | ); 409 | ``` 410 | 411 | You should be able to see the created UDF script for prediction, as follows: 412 | 413 | ```sql 414 | SELECT 415 | SCRIPT_NAME, 416 | SCRIPT_LANGUAGE 417 | FROM 418 | SYS.EXA_ALL_SCRIPTS 419 | WHERE 420 | SCRIPT_SCHEMA = 'IDAPrediction' 421 | AND SCRIPT_TYPE = 'UDF' 422 | 423 | ``` 424 | 425 | |SCRIPT_NAME |SCRIPT_LANGUAGE| 426 | |-------------------------------------|---------------| 427 | |APSPredictor |PYTHON3_SME | 428 | 429 | ### 3.5 Predict via SageMaker Endpoint 430 | 431 | The Exasol SageMaker-Extension generates a prediction UDF for each model, 432 | enabling you to perform prediction on the deployed endpoint. The name of the 433 | prediction script is the same as the name of the endpoint (`ENDPOINT_NAME`) 434 | specified when creating the endpoint. 435 | 436 | The prediction UDF makes a real-time and synchronous call to the SageMaker 437 | endpoint. The prediction SQL command takes all the columns used while 438 | creating the model as inputs, appends the prediction result to these columns and 439 | the response is returned immediately. For more information, please check the 440 | [User Guide](https://github.com/exasol/sagemaker-extension/blob/main/doc/user_guide/user_guide.md). 441 | You can make prediction for this use case as follows: 442 | 443 | ```sql 444 | SELECT IDAPrediction."APSPredictor"( 445 | AA_000,AB_000,AC_000,AD_000,AE_000,AF_000,AG_000, 446 | ... 447 | EE_005,EE_006,EE_007,EE_008,EE_009,EF_000,EG_000 448 | ) FROM IDA.TEST 449 | GROUP BY IPROC(), 450 | MOD(ROWNUM, 6); 451 | ``` 452 | 453 | |AA_000 |AB_000|AC_000 |AD_000 |AE_000 |AF_000 |AG_000| ... |PREDICTIONS | 454 | |----------|------|-------------|-------|-------|-------|------|------|------------| 455 | | 79492.00| | 0.00| | 0.00| 0.00| 0.00| ... | neg| 456 | | 41026.00| | 518.00| 392.00| 0.00| 0.00| 0.00| ... | neg| 457 | | 43728.00| 0.00|2130706432.00| 144.00| 522.00| 142.00| 0.00| ... | neg| 458 | | 55896.00| | 74.00| 70.00| 0.00| 0.00| 0.00| ... | neg| 459 | | 40122.00| | 232.00| 210.00| 0.00| 0.00| 0.00| ... | neg| 460 | | ...| ...| ...| ...| ...| ...| ...| ... | ...| 461 | 462 | Please keep in mind, that you can get high efficiency by executing the prediction 463 | UDF script using the `GROUP BY IPROC()` statement, which allows you to perform 464 | predictions on each node in parallel. 465 | 466 | ### 3.6 Delete Endpoint 467 | It is important to delete the endpoint created, when you are finished with the 468 | endpoint Otherwise, the endpoint will continue to be charged. You can use the 469 | following SQL command to delete the endpoint and associated resources: 470 | 471 | ```sh 472 | EXECUTE SCRIPT IDA."SME_DELETE_SAGEMAKER_AUTOPILOT_ENDPOINT"( 473 | '', 474 | '', 475 | '' 476 | ); 477 | ``` 478 | 479 | Please note, that by the execution of the deletion SQL command, the predicted 480 | UDF script will not be deleted and will not be able to run until the endpoint 481 | is restarted. 482 | 483 | ## 4.Conclusion 484 | In this tutorial, we went through each steps of the installation and deployment 485 | of the Exasol SageMaker-Extension, and examined in detail how it works on 486 | a real-world problem. 
487 | 
488 | The Exasol SageMaker-Extension provides a simple installation via the
489 | pre-packaged releases and a functional deployment with an easy-to-use
490 | CLI tool. The SQL commands which come with the deployment enable you to create
491 | a machine learning model from the table you want using the SageMaker Autopilot
492 | service and make your predictions.
493 | 
-------------------------------------------------------------------------------- /tutorials/script-languages/README.md: --------------------------------------------------------------------------------
1 | ## Script-Language Container Tutorials
2 | 
3 | This section contains tutorials for building and customizing Script-Language Containers. Script-Language Containers are used for adding packages to Exasol UDFs. As such, they are often used in Machine Learning use cases to provide access to additional Machine Learning libraries.
4 | 
5 | * [Building and Customizing Script-Language Containers](script-languages.ipynb)
6 | 
-------------------------------------------------------------------------------- /tutorials/script-languages/bash_runner.py: --------------------------------------------------------------------------------
1 | from pexpect import replwrap, EOF
2 | import pexpect
3 | # Inspired by https://github.com/takluyver/bash_kernel
4 | class IREPLWrapper(replwrap.REPLWrapper):
5 |     """A subclass of REPLWrapper that gives incremental output
6 |     The parameters are the same as for REPLWrapper, except for one
7 |     extra parameter:
8 |     :param line_output_callback: a callback method to receive each batch
9 |       of incremental output. It takes one string parameter.
10 |     """
11 |     def __init__(self, cmd_or_spawn, orig_prompt, prompt_change,
12 |                  extra_init_cmd=None, line_output_callback=None):
13 |         self.line_output_callback = line_output_callback
14 |         replwrap.REPLWrapper.__init__(self, cmd_or_spawn, orig_prompt,
15 |                                       prompt_change, extra_init_cmd=extra_init_cmd)
16 | 
17 |     def _expect_prompt(self, timeout=-1):
18 |         if timeout == None:
19 |             # "None" means we are executing code from a Jupyter cell by way of the run_command
20 |             # in the do_execute() code below, so do incremental output.
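            # The loop below waits for the regular prompt, the continuation
            # prompt, or an end-of-line. Each completed line (match index 2) is
            # forwarded to line_output_callback immediately; any partial line
            # left in child.before is flushed once a prompt arrives.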
21 | while True: 22 | pos = self.child.expect_exact([self.prompt, self.continuation_prompt, u'\r\n'], 23 | timeout=None) 24 | if pos == 2: 25 | # End of line received 26 | self.line_output_callback(self.child.before) 27 | else: 28 | if len(self.child.before) != 0: 29 | # prompt received, but partial line precedes it 30 | self.line_output_callback(self.child.before) 31 | break 32 | else: 33 | # Otherwise, use existing non-incremental code 34 | pos = replwrap.REPLWrapper._expect_prompt(self, timeout=timeout) 35 | 36 | # Prompt received, so return normally 37 | return pos 38 | 39 | def run(code): 40 | child = pexpect.spawn("bash", echo=False, encoding='utf-8', codec_errors='replace') 41 | ps1 = replwrap.PEXPECT_PROMPT[:5] + u'\[\]' + replwrap.PEXPECT_PROMPT[5:] 42 | ps2 = replwrap.PEXPECT_CONTINUATION_PROMPT[:5] + u'\[\]' + replwrap.PEXPECT_CONTINUATION_PROMPT[5:] 43 | prompt_change = u"PS1='{0}' PS2='{1}' PROMPT_COMMAND=''".format(ps1, ps2) 44 | 45 | # Using IREPLWrapper to get incremental output 46 | bashwrapper = IREPLWrapper(child, u'\$', prompt_change, 47 | extra_init_cmd="export PAGER=cat", 48 | line_output_callback=lambda x: print(x)) 49 | bashwrapper.run_command(code.rstrip(), timeout=None) 50 | -------------------------------------------------------------------------------- /tutorials/script-languages/requirements.txt: -------------------------------------------------------------------------------- 1 | pexpect==4.8.0 2 | pyexasol==0.16.1 -------------------------------------------------------------------------------- /tutorials/script-languages/slc_main_build_steps.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
[slc_main_build_steps.svg shows the main build steps of a script-language container: udfclient_deps, language_deps, build_run, flavor_base_deps, flavor_customization, release]
-------------------------------------------------------------------------------- /tutorials/spatial-analysis/README.md: --------------------------------------------------------------------------------
1 | ## Spatial Analysis Tutorials
2 | This section contains tutorials for doing Spatial Analysis within the Exasol database. We are going to provide examples, tasks and use cases.
3 | 
4 | ### Languages:
5 | 
6 | * [Visualizing Spatial Queries](visualizing_spatial_queries)
7 | 
8 | ### Prerequisites
9 | 
10 | For general prerequisites, please refer to [Prerequisites](../../README.md).
11 | 
-------------------------------------------------------------------------------- /tutorials/spatial-analysis/visualizing_spatial_queries/README.md: --------------------------------------------------------------------------------
1 | # Exasol Spatial Demo with Jupyter Notebook
2 | 
3 | [
4 | Geospatial data](https://docs.exasol.com/sql_references/geospatialdata.htm) can be stored and analyzed in the Exasol database using the GEOMETRY datatype. In this solution, we will show you some examples of how to work with geospatial data inside a Jupyter Notebook with the help of SQL inline magic and visualize geospatial data on a map using Python libraries.
5 | 
6 | # Table of contents
7 | 
8 | 
9 | 
10 | - [Prerequisites](#prerequisites)
11 | - [Datasets](#datasets)
12 | - [Use Cases](#use-cases)
13 | - [External Resources](#external-resources)
14 | 
15 | 
16 | 
17 | ### Prerequisites
18 | 
19 | To run this demo, a working [Jupyter notebook](https://jupyter.org/install) installation with Python version 2.7 or greater is required. After installing Python and Jupyter notebook, we need the [ipython-sql library](#ipython-sql-library) to run SQL from a Jupyter notebook, the [SQL Alchemy](https://www.sqlalchemy.org/) dialect to [connect to EXASOL](#connection-to-exasol) and some additional [python libraries](#additional-python-libraries) for data visualization. The GeoJSON files containing spatial data for New York City need to be downloaded into the [geojsonfiles](geojsonfiles) folder.
20 | 
21 | #### GeoJSON files
22 | 
23 | Download the following GeoJSON files into the [geojsonfiles](geojsonfiles) folder:
24 | 
25 | 1. New York City Streets data:
26 | - https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nyc_street_data.geojson
27 | 2. New York City Borough boundaries data:
28 | - https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nycboroughboundaries.geojson
29 | 3. New York City Neighborhood boundaries data:
30 | - https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nycneighborhoods.geojson
31 | 
32 | #### IPython-sql library
33 | 
34 | The [IPython-sql library](https://github.com/catherinedevlin/ipython-sql) enables the use of Jupyter magic functions. With Jupyter magic functions, Jupyter notebooks can be used for data analysis with SQL on a database. Magic functions are pre-defined functions in the Jupyter kernel that execute supplied commands. They are prefaced with the `%` character. Usage and installation instructions can be found [here](https://github.com/catherinedevlin/ipython-sql). After installation, run the following command:
35 | 
36 | ```mysql
37 | %load_ext sql
38 | ```
39 | 
40 | #### Connection to EXASOL
41 | 
42 | To connect to EXASOL, install the [SQLAlchemy](https://www.sqlalchemy.org/) dialect for the EXASOL database.
Installation instructions and project details can be found [here](https://pypi.org/project/sqlalchemy-exasol/) 43 | 44 | After installation, connect to EXASOL using the following command: 45 | 46 | ```mysql 47 | %sql exa+pyodbc://USER:PASSWORD@DSN 48 | ``` 49 | 50 | DSN should point to your ODBC installation. For EXASOL6.2 ODBC download and installation details visit [EXASOL ODBC installation](https://www.exasol.com/portal/display/DOWNLOAD/6.2) 51 | 52 | #### Additional Python libraries 53 | 54 | Additional python libraries are used to process and visualize geospatial data. We make use of the following python libraries for this demo: 55 | 56 | 1. [Folium](https://pypi.org/project/folium/) 57 | 2. [Pandas](https://pandas.pydata.org/pandas-docs/stable/install.html) 58 | 3. [GeoJSON](https://pypi.org/project/geojson/) 59 | 4. [JSON](https://docs.python.org/3/library/json.html) 60 | 5. [Requests](https://pypi.org/project/requests/) 61 | 62 | #### Jupter Notebook extensions 63 | 64 | Extensions allow to enhance features of Jupyter Notebook. They are easy to install and configure using the `Nbextensions configuration` page. We have used two extensions in our demo. Remember that the purpose of these extensions is to help visualize the results and are not required to run the [visualizing_spatial_queries.ipynb](visualizing_spatial_queries.ipynb) demo. 65 | 66 | Installation and configuration details for these extensions can be found [here](https://github.com/ipython-contrib/jupyter_contrib_nbextensions) 67 | 68 | ##### Hide Input 69 | 70 | This extension allows hiding of an individual cell. All the code segments that are not necessary for this particular demo are hidden for better visualization and usability. 71 | 72 | ##### Limit Output 73 | 74 | Limits the output of a cell. This comes in handy as large result outputs can break the notebook. Limiting the output makes it easy to render results. 75 | 76 | ### Datasets 77 | 78 | For the purpose of this demo we use `NYC_UBER` and `NYC_TAXI` schemas from `demodb.exasol.com`. 79 | 80 | Use the following command to open a schema 81 | 82 | ```mysql 83 | %sql open schema SCHEMA_NAME 84 | ``` 85 | 86 | Uber pickups data is stored in `UBER_TAXI_DATA` table in `NYC_UBER` schema. Use `DESCRIBE` to get an overview of this table 87 | 88 | ```mysql 89 | %sql describe NYC_UBER.UBER_TAXI_DATA 90 | ``` 91 | 92 | New York City Taxi pickups data is stored in `TRIPS` table in `NYC_TAXI schema`. Use `DESCRIBE` to get an overview of this table 93 | 94 | ```mysql 95 | %sql describe NYC_TAXI.TRIPS 96 | ``` 97 | 98 | ### Use Cases 99 | 100 | Let's go briefly through the use cases implemented in [visualizing_spatial_queries.ipynb](visualizing_spatial_queries.ipynb) 101 | 102 | #### Uber pickups grouped by New York City Boroughs 103 | 104 | In the first use case, we use New York City data in `NYC_UBER` schema to show Uber pickups per borough in New York City. We use Uber pickups data and NYC borough data to query the total number of Uber pickups per borough using inline SQL magic. To visualize New York City borough boundaries we use [New York City Borough boundaries](#GeoJSON-files) dataset. 105 | 106 | #### Uber pickups grouped by New York City Neighborhoods 107 | 108 | In the second use case, we use New York City data in `NYC_UBER` schema to show Uber pickups per neighborhood in New York City. We use Uber pickups data and NYC neighborhood data to query the total number of Uber pickups per neighborhood using inline SQL magic. 
To visualize New York City neighborhood boundaries we use the [New York City Neighborhood boundaries](#GeoJSON-files) dataset.
109 | 
110 | #### New York City Streets with highest Uber pickups
111 | 
112 | In the third use case, we use NYC street data and NYC Uber pickup data to visualize the top streets according to the number of pickups. This data is stored in our demo database in the `NYC_UBER` schema. To visualize New York City streets we use the [New York City Streets](#GeoJSON-files) dataset. The example query can be parameterized to view different results on the map by providing a value for the variable `NumberOfStreets`.
113 | 
114 | #### Comparison of Yellow Taxi and Uber pickups within a certain radius of a location in New York City
115 | 
116 | In the fourth use case, we make a comparison between the number of Uber and Yellow Taxi pickups. For this example we have selected the **Museum of the City of New York** in Manhattan as a pickup point. We have used geocoding to find the latitude and longitude values of a given location. We have Uber data from April to September 2014. By changing the value for `month` within this range we can visualize different sets of geospatial data on the map. `Radius` defines the radius value around the given lat/long point. For speed purposes it is recommended to keep the radius value small.
117 | 
118 | ### External Resources
119 | 
120 | The GeoJSON files used for this demo were obtained from the following sources:
121 | 
122 | - NYC borough boundary polygons: http://data.beta.nyc/dataset/nyc-borough-boundaries
123 | - NYC neighborhood boundary polygons: http://data.beta.nyc/dataset/nyc-neighborhood-boundaries
124 | - NYC streets multi-line data: https://data.cityofnewyork.us/City-Government/NYC-Street-Centerline-CSCL-/exjm-f27b
125 | 
126 | Currently these external resources are unavailable; therefore you need to download the required [GeoJSON files](#GeoJSON-files) listed above.
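If you prefer to script the download, the short sketch below fetches the three GeoJSON files listed in the Prerequisites section into the [geojsonfiles](geojsonfiles) folder. It is only a convenience example using the `requests` library already required above; the URLs are the Google Storage links from this README.

```python
import os
import requests

# Google Storage mirrors listed in the Prerequisites section of this README.
GEOJSON_URLS = [
    "https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nyc_street_data.geojson",
    "https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nycboroughboundaries.geojson",
    "https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nycneighborhoods.geojson",
]

os.makedirs("geojsonfiles", exist_ok=True)
for url in GEOJSON_URLS:
    target = os.path.join("geojsonfiles", url.rsplit("/", 1)[-1])
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(target, "wb") as geojson_file:
        geojson_file.write(response.content)
    print(f"Downloaded {target}")
```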
127 | 128 | -------------------------------------------------------------------------------- /tutorials/spatial-analysis/visualizing_spatial_queries/geojsonfiles/README.md: -------------------------------------------------------------------------------- 1 | All GeoSJON files required for the spatial demo are downloaded here 2 | -------------------------------------------------------------------------------- /tutorials/spatial-analysis/visualizing_spatial_queries/visualizing_spatial_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Spatial demo with Jupyter Notebook and Exasol" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Prerequsities" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Installing all python libraries required for this demo" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "hide_input": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from IPython import get_ipython\n", 33 | "if get_ipython() is None:\n", 34 | " from IPython.core.interactiveshell import InteractiveShell\n", 35 | " InteractiveShell.instance()\n", 36 | "!pip install ipython-sql sqlalchemy-exasol folium pandas geojson requests jupyter_contrib_nbextensions geopy" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Importing all required installed libraries to Jupyter Notebook" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 1, 49 | "metadata": { 50 | "hide_input": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "import folium\n", 55 | "import pandas as pd\n", 56 | "import os\n", 57 | "import geojson\n", 58 | "import warnings\n", 59 | "import requests as r\n", 60 | "import json\n", 61 | "import geopy.geocoders\n", 62 | "from geopy.geocoders import Nominatim\n", 63 | "warnings.filterwarnings('ignore')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "Load Jupyter magic functions" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": { 77 | "hide_input": false 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "%reload_ext sql" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Enter user credentials" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "EXA_USER = \"\"\n", 98 | "EXA_PWD = \"\"\n", 99 | "DSN = \"exadb\"" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Connect to DSN" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "'Connected: @None'" 118 | ] 119 | }, 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "%sql exa+pyodbc://{EXA_USER}:{EXA_PWD}@{DSN}" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "Set query cache off" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "%sql alter session set query_cache='off';" 
143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Datasets" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "#### Open schema NYC_UBER" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "%sql open schema NYC_UBER" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Overview of UBER_TAXI_DATA table" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "%sql describe NYC_UBER.UBER_TAXI_DATA" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Count of uber pickup records in UBER_TAXI_DATA table" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "%sql select count(*) from NYC_UBER.UBER_TAXI_DATA" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "Date and time range of uber pickups " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": { 211 | "hide_input": false 212 | }, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "1 rows affected.\n" 219 | ] 220 | }, 221 | { 222 | "data": { 223 | "text/html": [ 224 | "\n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | "
start_date | end_date
2014-04-01 00:00:00 | 2014-09-30 22:59:00
" 234 | ], 235 | "text/plain": [ 236 | "[(datetime.datetime(2014, 4, 1, 0, 0), datetime.datetime(2014, 9, 30, 22, 59))]" 237 | ] 238 | }, 239 | "execution_count": 7, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "%sql select min(DATETIME) as START_DATE,max(DATETIME) as END_DATE from NYC_UBER.UBER_TAXI_DATA" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "#### Open schema NYC_TAXI" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "%sql open schema NYC_TAXI" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "Overview of TRIPS table" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "%sql describe NYC_TAXI.TRIPS" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "Count of yellow taxi pickups records in TRIPS table" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "%sql select count(*) from NYC_TAXI.TRIPS where CAB_TYPE_ID=1" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Date and time range of New York City yellow taxi pickups " 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 8, 306 | "metadata": { 307 | "hide_input": false 308 | }, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "1 rows affected.\n" 315 | ] 316 | }, 317 | { 318 | "data": { 319 | "text/html": [ 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | "
start_date | end_date
2009-01-01 00:00:00 | 2017-06-30 23:59:59
" 330 | ], 331 | "text/plain": [ 332 | "[(datetime.datetime(2009, 1, 1, 0, 0), datetime.datetime(2017, 6, 30, 23, 59, 59))]" 333 | ] 334 | }, 335 | "execution_count": 8, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "%sql select min(PICKUP_DATETIME) as START_DATE,max(PICKUP_DATETIME) as END_DATE from NYC_TAXI.TRIPS" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "## Use Cases\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Uber pickups grouped by New York City boroughs" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "In the first use case, we use `DISJUNCT_NIGHBORHOODS` and `UBER_TAXI_DATA` tables from `NYC_UBER` schema to visualize uber pickups grouped by boroughs. The geometry column in `DISJUNCT_NEIGHBORHOODS` table contains polygons for boroughs while the geometry column in `UBER_TAXI_DATA` table from `NYC_UBER` schema contains pickup points " 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "Geometry column of type `Polygon` in `DISJUNCT_NEIGHBORHOODS`table " 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "hide_input": false, 377 | "scrolled": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "%sql select THE_GEOM from NYC_UBER.DISJUNCT_NEIGHBORHOODS limit 1" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "Geometry column of type `POINT` in `UBER_TAXI_DATA` table" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "%sql select THE_GEOM from NYC_UBER.UBER_TAXI_DATA limit 1" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "Exasol automatically creates indices for equality join conditions, even when expressions are used for comparison.\n", 405 | "Exasol 6.1 introduced indices on geospatial data types for joins using geospatial functions like ST_CONTAINS or ST_INTERSECTS. In this use case we use `ST_CONTAINS` function to join table `DISJUNCT_NEIGHBORHOODS` with `UBER_TAXI_DATA` on geometry columns grouped by New York City boroughs " 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "`%%time` is a cell magic function used here to calculate query execution time. `Wall time` gives the total of query runtime and cell rendering time (negligible)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 9, 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "5 rows affected.\n", 425 | "Wall time: 56.7 ms\n" 426 | ] 427 | }, 428 | { 429 | "data": { 430 | "text/html": [ 431 | "\n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | "
borough_id | pickups
Manhattan | 3443402
Brooklyn | 593648
Queens | 342186
Bronx | 31589
Staten Island | 1034
" 457 | ], 458 | "text/plain": [ 459 | "[('Manhattan', 3443402),\n", 460 | " ('Brooklyn', 593648),\n", 461 | " ('Queens', 342186),\n", 462 | " ('Bronx', 31589),\n", 463 | " ('Staten Island', 1034)]" 464 | ] 465 | }, 466 | "execution_count": 9, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "%%time\n", 473 | "%sql select borough_id, count(*) as pickups FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n INNER JOIN NYC_UBER.UBER_TAXI_DATA t ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM) group by borough_id order by pickups desc" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "Visualizing geospatial data of uber pickups grouped by New York City boroughs " 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "hide_input": true, 488 | "scrolled": false 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "#--- to be removed if direct links to http://data.beta.nyc works again---#\n", 493 | "\n", 494 | "nyc_boroughs = 'geojsonfiles/nycboroughboundaries.geojson'\n", 495 | "\n", 496 | "#nyc_boroughs = \"http://data.beta.nyc//dataset/68c0332f-c3bb-4a78-a0c1-32af515892d6/resource/7c164faa-4458-4ff2-9ef0-09db00b509ef/download/42c737fd496f4d6683bba25fb0e86e1dnycboroughboundaries.geojson\"\n", 497 | "\n", 498 | "borough_pickups_sql = %sql select borough_id, count(*) as pickups FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n INNER JOIN NYC_UBER.UBER_TAXI_DATA t ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM) group by borough_id order by pickups desc\n", 499 | "borough_pickups_df = borough_pickups_sql.DataFrame()\n", 500 | "\n", 501 | "#base map\n", 502 | "m1 = folium.Map([40.7586,-73.9706], zoom_start=10)\n", 503 | "\n", 504 | "# Choropleth:\n", 505 | "# geo_data: data of borough polygons\n", 506 | "# Columns: 1st column is key (Borough) and 2nd column is value(total number of pickups)\n", 507 | "# Key_on: Variable in the GeoJSON file to bind the data to\n", 508 | "# bins = width between values\n", 509 | "choropleth = folium.Choropleth(geo_data=nyc_boroughs,name = 'choropleth', data = borough_pickups_df, columns = ['borough_id','pickups'],key_on='feature.properties.borough', fill_color='YlGnBu',bins=[1,100,300000,500000,600000,3500000],fill_opacity = 0.5,nan_fill_color='yellow' ,legend_name='Number of pickups', highlight=True).add_to(m1)\n", 510 | "\n", 511 | "#hover over to view tooltip with borough name \n", 512 | "choropleth.geojson.add_child(\n", 513 | " folium.features.GeoJsonTooltip(['borough'])\n", 514 | ")\n", 515 | "\n", 516 | "# We can also export this interactive map to results/...html file\n", 517 | "#m.save(os.path.join('results', 'GeoJSONWithoutTitles_2.html'))\n", 518 | "\n", 519 | "# display map \n", 520 | "display(m1)" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": { 526 | "hide_input": true 527 | }, 528 | "source": [ 529 | "### Uber pickups grouped by New York City Neighborhoods" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": { 535 | "hide_input": false 536 | }, 537 | "source": [ 538 | "In the second use case, we use `DISJUNCT_NIGHBORHOODS` and `UBER_TAXI_DATA` tables from `NYC_UBER` schema to visualize uber pickups grouped by neighborhoods. 
The geometry column in `DISJUNCT_NEIGHBORHOODS` table contains polygons for neighborhoods while the geometry column in `UBER_TAXI_DATA` table from `NYC_UBER` schema contains pickup points " 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "Similar to the previous use case, we join neighborhood polygons with uber pickup points using `ST_CONTAINS` function to count total uber pickups grouped by New York City neighborhoods" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": { 552 | "hide_input": false, 553 | "scrolled": false 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "%%time\n", 558 | "%sql select neighborhood,count(*) as pickups FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n INNER JOIN NYC_UBER.UBER_TAXI_DATA t ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM) group by neighborhood order by pickups desc limit 10" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "Visualizing geospatial data of uber pickups grouped by New York City neighborhoods" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": { 572 | "hide_input": true, 573 | "scrolled": false 574 | }, 575 | "outputs": [], 576 | "source": [ 577 | "#--- to be removed if direct links to data.beta.nyc works ---#\n", 578 | "\n", 579 | "nyc_neighborhoods = 'geojsonfiles/nycneighborhoods.geojson'\n", 580 | "\n", 581 | "#nyc_neighborhoods = \"http://data.beta.nyc//dataset/0ff93d2d-90ba-457c-9f7e-39e47bf2ac5f/resource/35dd04fb-81b3-479b-a074-a27a37888ce7/download/d085e2f8d0b54d4590b1e7d1f35594c1pediacitiesnycneighborhoods.geojson\"\n", 582 | "neighborhood_pickups_sql = %sql select neighborhood,count(*) as pickups FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n INNER JOIN NYC_UBER.UBER_TAXI_DATA t ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM) group by neighborhood order by pickups desc \n", 583 | "neighborhood_pickups_df = neighborhood_pickups_sql.DataFrame()\n", 584 | "\n", 585 | "#base map\n", 586 | "m2 = folium.Map([40.7586,-73.9706], zoom_start=10)\n", 587 | "\n", 588 | "# Choropleth:\n", 589 | "# geo_data: data of borough polygons\n", 590 | "# Columns: 1st column is key (Neighborhood) and 2nd column is value(total number of pickups). 
\n", 591 | "# Key_on: Variable in the GeoJSON file to bind the data to\n", 592 | "# bins = width between values\n", 593 | "# nan_fill_colors= yellow for neighborhoods with no pickup data\n", 594 | "# For a detailed reference see https://python-visualization.github.io/folium/modules.html#Extra_Features\n", 595 | "\n", 596 | "choropleth = folium.Choropleth(geo_data=nyc_neighborhoods,name = 'choropleth', data = neighborhood_pickups_df, columns = ['neighborhood','pickups'],key_on='feature.properties.neighborhood', fill_color='YlOrRd',fill_opacity = 0.5, legend_name='Number of pickups',nan_fill_color='yellow',nan_fill_opacity=0.4,bins=[1,560,114694,181979,349255,666970],highlight=True).add_to(m2)\n", 597 | "\n", 598 | "choropleth.geojson.add_child(\n", 599 | " folium.features.GeoJsonTooltip(['neighborhood'])\n", 600 | ")\n", 601 | "\n", 602 | "\n", 603 | "# We can also export this interactive map to results/...html file\n", 604 | "#m.save(os.path.join('results', 'GeoJSONWithoutTitles_2.html'))\n", 605 | "\n", 606 | "# display map with choropleth\n", 607 | "display(m2)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "hide_input": true 614 | }, 615 | "source": [ 616 | "### New York City Streets with highest Uber pickups" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "In this use case, we use NYC street data and NYC Uber pickup data to visualize top streets according to number of pickups. Lets have a look at `STREETS` table from `NYC_UBER` schema. `PHYSICALID`, `THE_GEOM` and `ST_NAME` are columns used for this demo" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "%sql describe NYC_UBER.STREETS" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "Before querying for New York City streets with highest Uber pickups, we create a view from `STREETS` table and transform the geometry column from a spherical coordinate system(SRID:4326) to a Mercator cordinate system(SRID:3857) using `ST_TRANSFORM` function. The transformation from 4326 WGS84 (spherical coordinates) to 3857 (Google) Mercator has the advantage that for Mercator, distance is measured in meters (in contrast to 4326 where distance is measured in degrees). Mercator is used by most of the map services including OpenStreetMap (used in this demo). After transformation a buffer of 50 meters is added around the street geometry column using `ST_BUFFER` function to account for positioning inaccuracy. A snapshot of `STREETS_TRANSFORMED` view:" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": { 645 | "hide_input": false 646 | }, 647 | "source": [ 648 | "``` mysql\n", 649 | "CREATE OR REPLACE VIEW \"NYC_UBER\".\"STREETS_TRANSFORMED\" as select\n", 650 | "...\n", 651 | "ST_BUFFER(ST_TRANSFORM(THE_GEOM, 3857),50) as THE_GEOM,\n", 652 | "...\n", 653 | "from NYC_UBER.STREETS;\n", 654 | "```" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "Similary the geometry column from `UBER_TAXI_DATA` is transformed to Mercator using `ST_TRANSFORM` function. 
A snapshot of the transformed `UBER_TAXI_DATA_TRANSFORMED` view: " 662 | ] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "``` mysql\n", 669 | "CREATE OR REPLACE VIEW \"NYC_UBER\".\"UBER_TAXI_DATA_TRANSFORMED\"\n", 670 | "...\n", 671 | "as select DATETIME,LAT,LON, BASE, ST_TRANSFORM(the_geom,3857) as the_geom \n", 672 | "...\n", 673 | "from UBER_TAXI_DATA;\n", 674 | "```\n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "Select the number of streets with highest pickups to view on map" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "NumberOfStreets = 5" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "EXASOL query joins the views based on the geometry columns using `ST_CONTAINS`" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": { 704 | "scrolled": false 705 | }, 706 | "outputs": [], 707 | "source": [ 708 | "%%time\n", 709 | "%sql select s.full_stree as street_name,count(*) as pickups from (select * from \"NYC_UBER\".\"STREETS_TRANSFORMED\" order by false) s INNER JOIN (select * from \"NYC_UBER\".\"UBER_TAXI_DATA_TRANSFORMED\" order by false) t ON ST_CONTAINS(s.the_geom,t.the_geom) group by s.full_stree order by pickups desc limit $NumberOfStreets" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": {}, 715 | "source": [ 716 | "Visualizing geospatial data of uber pickups grouped by New York City streets" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": { 723 | "hide_input": true, 724 | "scrolled": false 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "#---- to be removed it API call gets fixed: atm returns only 1000 rows -----#\n", 729 | "\n", 730 | "#data = open('C:/Users/smha/GeoSpatialViz/geojsonfiles/nyc_street_data.geojson','r')\n", 731 | "#jsondata = json.loads(data)\n", 732 | "path = 'geojsonfiles/nyc_street_data.geojson'\n", 733 | "with open(path) as f:\n", 734 | " data = geojson.load(f)\n", 735 | "features = data['features'][0]\n", 736 | "\n", 737 | "#---------------------------------------------------------------------------#\n", 738 | "#--------------- Instead use API call to data endpoint --------------------#\n", 739 | "\n", 740 | "# url = 'https://data.cityofnewyork.us/resource/gr6w-nsbv.json'\n", 741 | "\n", 742 | "# # get json street data by from NYC city data API\n", 743 | "# req = r.get('https://data.cityofnewyork.us/resource/gr6w-nsbv.json')\n", 744 | "# jsondata = json.loads(req.text)\n", 745 | "\n", 746 | "\n", 747 | "# # convert json to geojson for folium choropleth\n", 748 | "# GeoJSON = []\n", 749 | "# for i in range(0,len(jsondata)-1):\n", 750 | "# GeoJSON.append(\n", 751 | "# {\n", 752 | "# \"type\": \"Feature\", \n", 753 | "# \"properties\":\n", 754 | "# {\n", 755 | "# \"physicalid\": jsondata[i][\"physicalid\"],\n", 756 | "# \"full_stree\": jsondata[i][\"full_stree\"],\n", 757 | "# },\n", 758 | "# \"geometry\": jsondata[i]['the_geom'],\n", 759 | "# } )\n", 760 | " \n", 761 | "# GeoJSON[0]\n", 762 | "# data= {\"type\": \"FeatureCollection\",\"features\": GeoJSON }\n", 763 | "\n", 764 | "#top street SQL inline magic + EXASOL query\n", 765 | "top5streets_sql = %sql select s.full_stree,count(*) as pickups from (select * from \"NYC_UBER\".\"STREETS_TRANSFORMED\" order by false) s 
INNER JOIN (select * from \"NYC_UBER\".\"UBER_TAXI_DATA_TRANSFORMED\" order by false) t ON ST_CONTAINS(s.the_geom,t.the_geom) group by s.full_stree order by pickups desc limit $NumberOfStreets\n", 766 | "top5streets_df = top5streets_sql.DataFrame()\n", 767 | "\n", 768 | "# cast the top street names ('full_stree') to type string\n", 769 | "top5streets_df['full_stree'] = top5streets_df.full_stree.astype(str)\n", 770 | "\n", 771 | "# save the 'full_stree' column from the top streets dataframe for the next steps\n", 772 | "dfList = list(top5streets_df['full_stree'])\n", 773 | "\n", 774 | "# match full street column names with street names from the GeoJSON and save the corresponding coordinates to a list \n", 775 | "l = list() \n", 776 | "for i in range(0,len(data['features'])): \n", 777 | " if data['features'][i]['properties']['full_stree'] in dfList: \n", 778 | " l.append(data['features'][i])\n", 779 | " \n", 780 | "\n", 781 | "# keep only the selected street features in the GeoJSON data\n", 782 | "data['features'] = l\n", 783 | "streetdata = json.dumps(data)\n", 784 | "#base map\n", 785 | "\n", 786 | "m3 = folium.Map([40.7586,-73.9706], zoom_start=12)\n", 787 | "\n", 788 | "# Choropleth\n", 789 | "# geo_data: GeoJSON data of the selected street geometries\n", 790 | "# Key_on: Variable in the GeoJSON file to bind the data to\n", 791 | "# bins = width bins between values\n", 792 | "# For a detailed reference see https://python-visualization.github.io/folium/modules.html#Extra_Features\n", 793 | "\n", 794 | "choropleth = folium.Choropleth(geo_data=streetdata,name = 'choropleth',key_on='feature.properties.full_stree', fill_color='YlGnBu',line_color = 'blue', line_weight= 5 , highlight=True).add_to(m3)\n", 795 | "\n", 796 | "choropleth.geojson.add_child(\n", 797 | " folium.features.GeoJsonTooltip(['full_stree'])\n", 798 | ")\n", 799 | "\n", 800 | "# We can also export this interactive map to results/...html file\n", 801 | "# m3.save(os.path.join('results', 'GeoJSONWithoutTitles_5.html'))\n", 802 | "\n", 803 | "# display map with choropleth\n", 804 | "display(m3)" 805 | ] 806 | }, 807 | { 808 | "cell_type": "markdown", 809 | "metadata": { 810 | "hide_input": true 811 | }, 812 | "source": [ 813 | "### Comparison of Taxi and Uber pickups within a certain radius of a location in New York City " 814 | ] 815 | }, 816 | { 817 | "cell_type": "markdown", 818 | "metadata": { 819 | "hide_input": true 820 | }, 821 | "source": [ 822 | "The following use case compares the number of Uber and Yellow Taxi pickups. For this example we have selected the `Museum of the City of New York` in Manhattan as a pickup point. We have used geocoding to find the latitude and longitude values of a given location. To visualize geospatial data on the map for a different location, change the value of the `pos` variable. The value of `month` can be adjusted to visualize different results. `radius` defines the radius (in meters) around the given lat/long point. For speed purposes it is recommended to keep the radius value small. 
" 823 | ] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "metadata": {}, 828 | "source": [ 829 | "Assigning query parameters " 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [ 838 | "pos = \"Museum of the City of New York\"\n", 839 | "geolocator = Nominatim()\n", 840 | "geo = geolocator.geocode(pos, timeout=None) \n", 841 | "location_latitude = geo.latitude\n", 842 | "location_longitude = geo.longitude\n", 843 | "month = 6\n", 844 | "radius = 100\n", 845 | "geo_point = f\"\\'POINT({location_longitude} {location_latitude})\\'\"" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "metadata": { 851 | "hide_input": true 852 | }, 853 | "source": [ 854 | "`ST_SETSRID` geospatial function is used to set the SRID(Spatial reference system identifier) of the given `geo_point`" 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": {}, 860 | "source": [ 861 | "``` mysql\n", 862 | "st_setsrid($geo_point,4326)\n", 863 | "```" 864 | ] 865 | }, 866 | { 867 | "cell_type": "markdown", 868 | "metadata": {}, 869 | "source": [ 870 | "After setting the SRID, the given `geopoint` is transformed to Mercator using `ST_TRANSFORM` function. To count the number of pickups within the radius of the given `geopoint` we use `ST_DISTANCE` function. `ST_DISTANCE` function calculates the distance between two geospatial points. " 871 | ] 872 | }, 873 | { 874 | "cell_type": "markdown", 875 | "metadata": {}, 876 | "source": [ 877 | "``` mysql\n", 878 | "st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < 100\n", 879 | "```" 880 | ] 881 | }, 882 | { 883 | "cell_type": "markdown", 884 | "metadata": {}, 885 | "source": [ 886 | "Querying EXASOL to list pickup points for New York City yellow taxi given the above parameters" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": null, 892 | "metadata": { 893 | "scrolled": false 894 | }, 895 | "outputs": [], 896 | "source": [ 897 | "%%time\n", 898 | "%sql select pickup_latitude, pickup_longitude from nyc_taxi.trips where id in (select id from nyc_uber.nyc_taxi_with_point where st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < $radius and year(pickup_date)=2014 and month(pickup_date)=$month) and CAB_TYPE_ID=1 " 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "Querying EXASOL to list pickup points for Uber given the above parameters" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "%%time\n", 915 | "%sql select lat,lon from nyc_uber.uber_taxi_data_transformed where st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < $radius and year(datetime)=2014 and month(datetime)=$month" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": { 921 | "hide_input": false 922 | }, 923 | "source": [ 924 | "Visualizing geospatial data comparing Taxi and Uber pickups within a certain radius of a location in New York City" 925 | ] 926 | }, 927 | { 928 | "cell_type": "code", 929 | "execution_count": null, 930 | "metadata": { 931 | "hide_input": true 932 | }, 933 | "outputs": [], 934 | "source": [ 935 | "nyc_jfk = %sql select pickup_latitude, pickup_longitude from nyc_taxi.trips where id in (select id from nyc_uber.nyc_taxi_with_point where st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < $radius and 
year(pickup_date)=2014 and month(pickup_date)=$month)\n", 936 | "taxi_df = nyc_jfk.DataFrame()\n", 937 | "\n", 938 | "uber_JFK = %sql select * from nyc_uber.uber_taxi_data_transformed where st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < $radius and year(datetime)=2014 and month(datetime)=$month\n", 939 | "uber_df = uber_JFK.DataFrame()\n", 940 | "\n", 941 | "#base map\n", 942 | "emp_m = folium.Map([location_latitude,location_longitude], zoom_start=20)\n", 943 | "\n", 944 | "# Add markers for the taxi and Uber pickup points to the map object \n", 945 | "for i in range(0,taxi_df.shape[0]):\n", 946 | " folium.Marker([taxi_df.iloc[i]['pickup_latitude'], taxi_df.iloc[i]['pickup_longitude']],icon=folium.Icon(color='orange', icon='taxi')).add_to(emp_m)\n", 947 | "for i in range(0,uber_df.shape[0]):\n", 948 | " folium.Marker([uber_df.iloc[i]['lat'], uber_df.iloc[i]['lon']]).add_to(emp_m) \n", 949 | "\n", 950 | "display(emp_m)" 951 | ] 952 | } 953 | ], 954 | "metadata": { 955 | "kernelspec": { 956 | "display_name": "Python 3", 957 | "language": "python", 958 | "name": "python3" 959 | }, 960 | "language_info": { 961 | "codemirror_mode": { 962 | "name": "ipython", 963 | "version": 3 964 | }, 965 | "file_extension": ".py", 966 | "mimetype": "text/x-python", 967 | "name": "python", 968 | "nbconvert_exporter": "python", 969 | "pygments_lexer": "ipython3", 970 | "version": "3.6.7" 971 | } 972 | }, 973 | "nbformat": 4, 974 | "nbformat_minor": 2 975 | } 976 | --------------------------------------------------------------------------------
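The notebook above runs every query through the `%sql` cell magic inside Jupyter. As a supplementary sketch of the same `ST_CONTAINS` borough aggregation run outside a notebook, the snippet below uses pyexasol to pull the result straight into a pandas DataFrame. The DSN, user and password are placeholders (not values from the tutorial), and this is only one possible way to wire it up.

```python
# Minimal sketch: run the ST_CONTAINS borough aggregation via pyexasol.
# The connection parameters below are placeholders -- adjust them to your Exasol instance.
import pyexasol

conn = pyexasol.connect(dsn="localhost:8563", user="sys", password="exasol", compression=True)

query = """
    SELECT borough_id, COUNT(*) AS pickups
    FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n
    INNER JOIN NYC_UBER.UBER_TAXI_DATA t
        ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM)
    GROUP BY borough_id
    ORDER BY pickups DESC
"""

# export_to_pandas streams the result set into a pandas DataFrame,
# equivalent to the .DataFrame() conversion used in the notebook cells.
borough_pickups_df = conn.export_to_pandas(query)
print(borough_pickups_df)

conn.close()
```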
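The last use case geocodes the pickup location with `Nominatim()`. Depending on the installed geopy version, `Nominatim` expects an explicit `user_agent`; the sketch below shows the geocoding step with such an agent string set. The agent value is arbitrary and only illustrative.

```python
# Sketch: geocode the pickup location with geopy.
# Recent geopy releases expect a user_agent string; the value here is arbitrary.
from geopy.geocoders import Nominatim

pos = "Museum of the City of New York"
geolocator = Nominatim(user_agent="exasol-spatial-demo")
geo = geolocator.geocode(pos, timeout=None)

location_latitude, location_longitude = geo.latitude, geo.longitude
print(location_latitude, location_longitude)
```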
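The radius comparison builds its filter from the `geo_point`, `radius` and `month` variables via `$` substitution in the `%sql` magic. As a rough illustration of what the fully substituted Uber query looks like when assembled by hand, the sketch below builds the same `ST_SETSRID`/`ST_TRANSFORM`/`ST_DISTANCE` predicate with an f-string; the longitude/latitude values are example placeholders, not the geocoded museum location.

```python
# Sketch: assemble the radius filter by hand instead of relying on %sql's $-substitution.
# The longitude/latitude values below are illustrative placeholders.
location_longitude, location_latitude = -73.9493, 40.7923
radius = 100   # meters, because the geometries were transformed to SRID 3857 (Mercator)
month = 6

geo_point = f"'POINT({location_longitude} {location_latitude})'"

uber_query = (
    "SELECT lat, lon "
    "FROM nyc_uber.uber_taxi_data_transformed "
    f"WHERE ST_DISTANCE(ST_TRANSFORM(ST_SETSRID({geo_point}, 4326), 3857), the_geom) < {radius} "
    f"AND YEAR(datetime) = 2014 AND MONTH(datetime) = {month}"
)
print(uber_query)
```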