├── .gitignore ├── LICENSE ├── README.md ├── doc └── changes │ ├── changelog.md │ └── changes_1.0.0.md ├── error_code_config.yml ├── examples └── tensorflow-with-gpu-preview │ ├── .gitignore │ ├── EXAConf │ ├── fetch_output_redirect_from_last_statement.sh │ ├── gcloud-create-instance.sh │ ├── gcloud-setup.sh │ ├── start_output_redirect_server.sh │ ├── system-status.sh │ ├── tensorflow-gpu-preview.ipynb │ └── tensorflow_udf │ ├── .gitignore │ ├── __init__.py │ ├── column_encoder.py │ ├── dataset_utils.py │ ├── identity_feature_column.py │ ├── keras_layer.py │ ├── requirements.txt │ ├── tensorflow_config.yaml │ ├── tensorflow_udf.py │ └── utils.py └── tutorials ├── README.md ├── machine-learning ├── README.md ├── python │ ├── AzureML │ │ ├── ConnectAzureMLtoExasol.ipynb │ │ ├── Introduction.ipynb │ │ ├── InvokeModelFromExasolDBwithUDF.ipynb │ │ ├── TrainModelInAzureML.ipynb │ │ ├── img_src │ │ │ ├── access_key_azure.png │ │ │ ├── azureML_public_ip.png │ │ │ ├── cluster_creation.png │ │ │ ├── conda_file_artifact.png │ │ │ ├── connection_detail_generate.png │ │ │ ├── connection_details_acess_token.png │ │ │ ├── consume_endpoint.png │ │ │ ├── create_datastore.png │ │ │ ├── data_blobstore.png │ │ │ ├── download_all.png │ │ │ ├── download_file_arifact.png │ │ │ ├── file_path_bucketfs.png │ │ │ ├── get_data_link.png │ │ │ ├── get_data_link_2.png │ │ │ ├── manage_udf_files.png │ │ │ ├── registered_model.png │ │ │ └── resource_group.png │ │ ├── main.py │ │ └── score.py │ ├── README.md │ ├── sagemaker │ │ ├── ConnectSagemakerToExasol.ipynb │ │ ├── LoadExampleDataIntoExasol.ipynb │ │ ├── TrainSagemakerModelWithExasolData.ipynb │ │ └── UseSagemakerModelFromExasol.ipynb │ └── scikit-learn │ │ ├── README.md │ │ └── classification.ipynb └── sagemaker-extension │ ├── images │ ├── sme_deployment.png │ ├── sme_overview.png │ └── sme_training.png │ └── tutorial.md ├── script-languages ├── README.md ├── bash_runner.py ├── requirements.txt ├── script-languages.ipynb └── slc_main_build_steps.svg └── spatial-analysis ├── README.md └── visualizing_spatial_queries ├── README.md ├── geojsonfiles └── README.md └── visualizing_spatial_queries.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | venv/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Exasol 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ⚠ This project has been archived. Please check [Exasol's AI Lab](https://github.com/exasol/ai-lab) for data science examples. 2 | 3 | # Data Science with Exasol 4 | This repository contains a collection of examples and tutorials for Data Science and Machine Learning with Exasol. In these examples and tutorials, you will learn how to explore and prepare your data and how to build, train and deploy your model with and within Exasol. 5 | 6 | **Currently, this repository is under development; we will add more examples and tutorials in the future.** 7 | 8 | ## What's inside: 9 | 10 | * [Tutorials](tutorials): Tutorials show a complete workflow on a realistic use case and data. 11 | * [Examples](examples): Examples show only how to integrate a specific technology, not a whole data science workflow. 12 | 13 | ## Prerequisites: 14 | 15 | In general, you need: 16 | * Exasol, in particular with support for user-defined functions (UDFs). In most cases, version 6.0 or above with [Script Language Container](https://github.com/exasol/script-languages) support is required. We provide a [Community Edition](https://www.exasol.com/portal/display/DOC/EXASOL+Community+Edition+Quick+Start+Guide) and [Docker images](https://github.com/exasol/docker-db). 17 | * Many examples and tutorials are provided as [Jupyter](https://jupyter.org/) notebooks. We recommend installing a Jupyter server with access to the database and the BucketFS (documentation can be found in the [Exasol User Manual](https://www.exasol.com/portal/display/DOC/User+Manual+6.1.0), Section 3.6.4). 18 | * Furthermore, many examples heavily use [pyexasol](https://github.com/badoo/pyexasol) to communicate with the database. We recommend installing it on your Jupyter server. 19 | 20 | Specific prerequisites are stated in each tutorial. 21 | -------------------------------------------------------------------------------- /doc/changes/changelog.md: -------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | * [1.0.0](changes_1.0.0.md) -------------------------------------------------------------------------------- /doc/changes/changes_1.0.0.md: -------------------------------------------------------------------------------- 1 | # data-science-examples 1.0.0, released 2023-10-16 2 | 3 | First release of this collection of examples for integrating the Exasol Database with data science 4 | focused applications and packages. 
5 | 6 | ## Features / Enhancements 7 | 8 | * #6: Added Tensorflow GPU UDF preview 9 | * #8: Added SciKit-learn classification Example 10 | * #21: Added an example for connecting from AWS Sagemaker to an Exasol database 11 | * #23: Added an example for training a Sagemaker model with data from Exasol 12 | * #25: Added an example for using a Sagemaker model from within Exasol 13 | * #29: Add script-languages build and customization tutorial 14 | * #34: Added an example for loading example data into the Exasol database 15 | * #35: Added tutorial for Sagemaker-Extension 16 | * #43: Updated to Python3.8 minimal flavor in script-languages tutorial 17 | * #45: Added error_code_config 18 | * #39-#48: Added tutorial Series for Connection to AzureML 19 | 20 | ## Bugs 21 | 22 | * #53: Fixed error_code_config.yaml 23 | * #38: Fix typo 24 | 25 | -------------------------------------------------------------------------------- /error_code_config.yml: -------------------------------------------------------------------------------- 1 | error-tags: 2 | DSE: 3 | highest-index: 0 -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/.gitignore: -------------------------------------------------------------------------------- 1 | tfhub_modules 2 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/EXAConf: -------------------------------------------------------------------------------- 1 | [Global] 2 | Revision = 15 3 | Checksum = COMMIT 4 | ClusterName = cl4 5 | Platform = Docker 6 | LicenseFile = /exa/etc/license.xml 7 | CoredPort = 10001 8 | SSHPort = 22 9 | XMLRPCPort = 443 10 | # List of networks for this cluster: 'private' is mandatory, 'public' is optional. 11 | Networks = private 12 | # Comma-separated list of nameservers for this cluster. 13 | NameServers = 8.8.8.8 14 | Timezone = Europe/Berlin 15 | # Nr. of hugepages ('0' = disabled, 'host' = manually configured on the host, 'auto' = set automatically based on DB config) 16 | Hugepages = 0 17 | ConfVersion = 6.1.3 18 | OSVersion = 6.1.3 19 | REVersion = 6.1.3 20 | DBVersion = 6.1.3 21 | ImageVersion = 6.1.3-d1 22 | 23 | # SSL options 24 | [SSL] 25 | # The SSL certificate, private key and CA for all EXASOL services 26 | Cert = /path/to/ssl.crt 27 | CertKey = /path/to/ssl.key 28 | CertAuth = /path/to/ssl.ca 29 | 30 | # Docker related options 31 | [Docker] 32 | # The directory that contains all data related to this docker cluster 33 | # (except for mapped devices) 34 | RootDir = /exa/etc 35 | # The EXASOL docker image used for all containers of this cluster 36 | Image = exasol/docker-db:latest 37 | # The type of storage devices for this cluster: 'block' or 'file' 38 | DeviceType = file 39 | # Comma-separated list of volumes to be mounted in all containers (e. g. 
'/mnt/my_data:/exa/my_data:rw' ) 40 | # These user-defined volumes are mounted additionally to the internal ones (like the node root volume) 41 | AdditionalVolumes = 42 | 43 | [Groups] 44 | [[root]] 45 | ID = 0 46 | [[exausers]] 47 | ID = 500 48 | [[exadbadm]] 49 | ID = 1001 50 | [[exastoradm]] 51 | ID = 1002 52 | [[exabfsadm]] 53 | ID = 1003 54 | [[exaadm]] 55 | ID = 1004 56 | 57 | [Users] 58 | [[root]] 59 | ID = 0 60 | Group = root 61 | LoginEnabled = True 62 | AdditionalGroups = exausers, exadbadm, exastoradm, exabfsadm, exaadm 63 | [[exadefusr]] 64 | ID = 500 65 | Group = exausers 66 | LoginEnabled = False 67 | AdditionalGroups = exadbadm, exastoradm, exabfsadm, exaadm 68 | 69 | [Node : 11] 70 | PrivateNet = 172.17.0.2/16 71 | PublicNet = 72 | Name = n11 73 | UUID = ECD384A2153246AA9EFC9E88E5292806CE8451C2 74 | DockerVolume = n11 75 | # Ports to be exposed (container : host) 76 | ExposedPorts = 8888:8899, 6583:6594 77 | [[Disk : disk1]] 78 | Component = exastorage 79 | Devices = dev.1 80 | Mapping = dev.1:/exa/data/storage 81 | 82 | # Global EXAStorage options 83 | [EXAStorage] 84 | # Enable or disable background recovery / data restoration (does not affect on-demand recovery) 85 | BgRecEnabled = True 86 | # Max. throughput for background recovery / data restoration (in MiB/s) 87 | BgRecLimit = 88 | # Space usage threshold (in percent, per node) for sending a warning 89 | SpaceWarnThreshold = 90 90 | 91 | # An EXAStorage volume 92 | [EXAVolume : DataVolume1] 93 | # Type of volume: 'data' | 'archive' 94 | Type = data 95 | # Volume size (e. g. '1 TiB') 96 | Size = 90 GiB 97 | # Name of the disk to be used for this volume. 98 | # This disk must exist on all volume nodes. 99 | Disk = disk1 100 | # Comma-separated list of node IDs to be used for this volume (incl. redundancy nodes) 101 | Nodes = 11 102 | # OPTIONAL: Nr. of master nodes for this volume (default: use all nodes) 103 | NumMasterNodes = 1 104 | # Desired redundancy for this volume 105 | Redundancy = 1 106 | # Volume owner (user and group ID) 107 | Owner = 500 : 500 108 | Permissions = rwx 109 | BlockSize = 4 KiB 110 | StripeSize = 256 KiB 111 | # OPTIONAL: shared volumes can be opened (for writing) by multiple clients simultaneously 112 | Shared = True 113 | # OPTIONAL: I/O priority (0 = highest, 20 = lowest) 114 | Priority = 10 115 | 116 | # An EXASOL database 117 | [DB : DB1] 118 | # The EXASOL version to be used for this database 119 | Version = 6.1.3 120 | # Memory size over all nodes (e. g. '1 TiB') 121 | MemSize = 28 GiB 122 | Port = 8888 123 | Nodes = 11 124 | Owner = 500 : 500 125 | NumMasterNodes = 1 126 | DataVolume = DataVolume1 127 | # JDBC driver configuration 128 | [[JDBC]] 129 | # BucketFS that contains the JDBC driver 130 | BucketFS = bfsdefault 131 | # Bucket that contains the JDBC driver 132 | Bucket = default 133 | # Directory within the bucket that contains the drivers 134 | Dir = drivers/jdbc 135 | # Oracle driver configuration 136 | [[Oracle]] 137 | # BucketFS that contains the JDBC drivers 138 | BucketFS = bfsdefault 139 | # Bucket that contains the JDBC drivers 140 | Bucket = default 141 | # Directory within the bucket that contains the drivers 142 | Dir = drivers/oracle 143 | 144 | # Global BucketFS options 145 | [BucketFS] 146 | # User and group ID of the BucketFS process. 
147 | ServiceOwner = 500 : 500 148 | 149 | # A Bucket filesystem 150 | [BucketFS : bfsdefault] 151 | # HTTP port number (0 = disabled) 152 | HttpPort = 6583 153 | # HTTPS port number (0 = disabled) 154 | HttpsPort = 0 155 | SyncKey = aW5oUzFMdGpUanNyUTdBMXR5ZGlSekdDSXdqNjFiUGQ= 156 | SyncPeriod = 30000 157 | 158 | # The default bucket (auto-generated) 159 | [[Bucket : default]] 160 | ReadPasswd = cmVhZAo= 161 | WritePasswd = d3JpdGU= 162 | Public = True 163 | AdditionalFiles = EXAClusterOS:/usr/opt/EXASuite-6/EXAClusterOS-6.1.3/var/clients/packages/ScriptLanguages-*, EXASolution-6.1.3:/usr/opt/EXASuite-6/EXASolution-6.1.3/bin/udf/* 164 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/fetch_output_redirect_from_last_statement.sh: -------------------------------------------------------------------------------- 1 | gcloud compute ssh $* -- "tac udf.log | grep 'NEW STATEMENT' -B10000 -m1 | tac" 2 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/gcloud-create-instance.sh: -------------------------------------------------------------------------------- 1 | NAME=$1 2 | shift 3 | gcloud compute instances create $NAME \ 4 | --custom-memory=30GB \ 5 | --custom-cpu=8 \ 6 | --boot-disk-auto-delete \ 7 | --boot-disk-size=200GB \ 8 | --image=projects/ubuntu-os-cloud/global/images/ubuntu-1804-bionic-v20190514 \ 9 | --boot-disk-type=pd-standard \ 10 | --maintenance-policy=TERMINATE \ 11 | --scopes=bigquery,storage-ro,storage-rw \ 12 | --metadata=startup-script-url=https://raw.githubusercontent.com/exasol/data-science-examples/master/examples/tensorflow-with-gpu-preview/gcloud-setup.sh $* 13 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/gcloud-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x -e -o pipefail -u 4 | { 5 | ##### Install Nvidia Driver ##### 6 | sudo echo "Install Nvidia Driver" >> /setup.log 7 | 8 | curl -o NVIDIA-Linux-x86_64-410.104.run http://de.download.nvidia.com/tesla/410.104/NVIDIA-Linux-x86_64-410.104.run 9 | chmod +x NVIDIA-Linux-x86_64-410.104.run 10 | sudo apt-get update 11 | sudo DEBIAN_FRONTEND=noninteractive \ 12 | apt-get install -yq --no-install-recommends \ 13 | cpp=4:7.3.0-3ubuntu2 \ 14 | cpp-7=7.3.0-16ubuntu3 \ 15 | g++=4:7.3.0-3ubuntu2 \ 16 | g++-7=7.3.0-16ubuntu3 \ 17 | gcc=4:7.3.0-3ubuntu2 \ 18 | gcc-7=7.3.0-16ubuntu3 \ 19 | gcc-7-base=7.3.0-16ubuntu3 \ 20 | libasan4=7.3.0-16ubuntu3 \ 21 | libcilkrts5=7.3.0-16ubuntu3 \ 22 | libgcc-7-dev=7.3.0-16ubuntu3 \ 23 | libstdc++-7-dev=7.3.0-16ubuntu3 \ 24 | libubsan0=7.3.0-16ubuntu3 25 | sudo apt-mark hold cpp cpp-7 g++ g++-7 gcc gcc-7 gcc-7-base libasan4 \ 26 | libcilkrts5 libgcc-7-dev libstdc++-7-dev libubsan0 27 | sudo dpkg --add-architecture i386 28 | sudo apt-get update 29 | sudo DEBIAN_FRONTEND=noninteractive \ 30 | apt-get install -yq --no-install-recommends \ 31 | apt-utils \ 32 | build-essential \ 33 | ca-certificates \ 34 | curl \ 35 | kmod \ 36 | libc6:i386 \ 37 | libelf-dev 38 | sudo curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey 39 | sudo curl -fsSL -o /usr/local/bin/extract-vmlinux https://raw.githubusercontent.com/torvalds/linux/master/scripts/extract-vmlinux 40 | sudo chmod +x /usr/local/bin/donkey /usr/local/bin/extract-vmlinux 41 | ./NVIDIA-Linux-x86_64-410.104.run --silent 42 
| sudo curl https://raw.githubusercontent.com/NVIDIA/nvidia-persistenced/master/init/systemd/nvidia-persistenced.service.template | sed 's/__USER__/root/' > /etc/systemd/system/nvidia-persistenced.service 43 | sudo systemctl enable nvidia-persistenced 44 | sudo systemctl start nvidia-persistenced 45 | 46 | #### Install Docker ##### 47 | sudo echo "Install Docker" >> /setup.log 48 | 49 | sudo DEBIAN_FRONTEND=noninteractive \ 50 | apt-get install -yq --no-install-recommends \ 51 | apt-transport-https \ 52 | ca-certificates \ 53 | curl \ 54 | gnupg-agent \ 55 | software-properties-common 56 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 57 | sudo apt-key fingerprint 0EBFCD88 58 | sudo add-apt-repository \ 59 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 60 | sudo apt-get update 61 | sudo DEBIAN_FRONTEND=noninteractive \ 62 | apt-get install -yq --no-install-recommends docker-ce docker-ce-cli containerd.io 63 | sudo docker run hello-world 64 | 65 | #### Nvidia Docker ###### 66 | sudo echo "Install Nvidia Docker" >> /setup.log 67 | 68 | # Add the package repositories 69 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 70 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) 71 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \ 72 | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 73 | sudo apt-get update 74 | # Install nvidia-docker2 and reload the Docker daemon configuration 75 | sudo DEBIAN_FRONTEND=noninteractive \ 76 | apt-get install -yq --no-install-recommends nvidia-docker2 77 | sudo pkill -SIGHUP dockerd 78 | # Test nvidia-smi with the latest official CUDA image 79 | sudo docker run --runtime=nvidia --rm nvidia/cuda:9.0-base nvidia-smi 80 | 81 | ##### Install Exasol ##### 82 | sudo echo "Install Exasol" >> /setup.log 83 | 84 | wget https://raw.githubusercontent.com/tkilias/data-science-examples/tensorflow-gpu-preview/examples/tensorflow-with-gpu-preview/EXAConf 85 | sudo mkdir -p /exa/{etc,data/storage} 86 | sudo cp EXAConf /exa/etc/EXAConf 87 | SIZE="$((100*1073741824))" 88 | sudo dd if=/dev/zero of=/exa/data/storage/dev.1 bs=1 count=1 seek=$SIZE 89 | sudo chmod +rw /exa 90 | sudo nvidia-docker run --name exasoldb -p 8888:8888 -p 6583:6583 -v /exa:/exa --detach --privileged --stop-timeout 120 --restart always exasol/docker-db:6.1.3-d1 91 | 92 | ##### Install Python ##### 93 | sudo echo "Install Python" >> /setup.log 94 | 95 | sudo DEBIAN_FRONTEND=noninteractive \ 96 | apt-get install -yq python3-pip 97 | sudo pip3 install pyexasol tensorboard tensorflow 98 | #### Download scripts #### 99 | sudo echo "Download scripts" >> /setup.log 100 | 101 | wget https://raw.githubusercontent.com/tkilias/data-science-examples/tensorflow-gpu-preview/examples/tensorflow-with-gpu-preview/system-status.sh 102 | 103 | #### Finish Setup ##### 104 | sudo echo "Wait for Exasol" >> /setup.log 105 | 106 | sleep 180 # Wait for database to startup 107 | sudo bash -x /system-status.sh &> status.log 108 | sudo cp status.log / 109 | 110 | sudo echo "Finished" >> /setup.log 111 | } &> /tmp/setup_script.log 112 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/start_output_redirect_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | old_pid=$(ps --no-headers -exo "uname:1,pid:1,args:1" | grep "[t]mux new -d python3 -m 
pyexasol_utils.script_output" | cut -f 2 -d " ") 5 | if [ -z "$old_pid" ] 6 | then 7 | tmux new -d "python3 -m pyexasol_utils.script_output --port 9999 &> udf.log" 8 | fi 9 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/system-status.sh: -------------------------------------------------------------------------------- 1 | sudo nvidia-smi 2 | sudo nvidia-docker logs exasoldb 3 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .idea -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/examples/tensorflow-with-gpu-preview/tensorflow_udf/__init__.py -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/column_encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Tuple, List 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.python.keras import metrics 7 | from tensorflow.python.keras import Input 8 | from tensorflow.python.keras.layers import Dense 9 | 10 | from identity_feature_column import identity_column 11 | from keras_layer import TFHubTextLayer 12 | 13 | 14 | class ColumnEncoder: 15 | 16 | def create_categorical_column_with_hash_bucket(self, column, column_config): 17 | hash_bucket_size = column_config["hash_bucket_size"] 18 | embedding_dimensions = column_config["embedding_dimensions"] 19 | feature_column = tf.feature_column.categorical_column_with_hash_bucket( 20 | key=column.name, hash_bucket_size=hash_bucket_size, 21 | dtype=tf.dtypes.as_dtype(np.dtype(column.type)) 22 | ) 23 | return hash_bucket_size, embedding_dimensions, feature_column 24 | 25 | def min_max_scaling(self, x, min_value, max_value): 26 | return (x - min_value) / (max_value - min_value) 27 | 28 | def get_numeric_column(self, column, column_config: Dict): 29 | min_value = column_config["min_value"] 30 | max_value = column_config["max_value"] 31 | feature_column = tf.feature_column.numeric_column( 32 | key=column.name, 33 | normalizer_fn=lambda x: 34 | self.min_max_scaling(x, min_value, max_value)) 35 | return feature_column 36 | 37 | def generate_string_inputs(self, column, column_config: Dict): 38 | os.environ["TFHUB_DOWNLOAD_PROGRESS"] = "1" 39 | keras_input = Input(name=column.name, shape=[1], dtype=tf.string) 40 | hub_layer = TFHubTextLayer("default", column_config["module_url"], trainable=True)(keras_input) 41 | feature_column = identity_column(column.name) 42 | return feature_column, keras_input, hub_layer 43 | 44 | def generate_categorical_input(self, column, column_config: Dict): 45 | hash_bucket_size, embedding_dimensions, feature_column = \ 46 | self.create_categorical_column_with_hash_bucket(column, column_config) 47 | embedding_feature_column = \ 48 | tf.feature_column.embedding_column( 49 | feature_column, dimension=embedding_dimensions) 50 | keras_input = Input(name=column.name, shape=[embedding_dimensions]) 51 | return embedding_feature_column, keras_input, keras_input 52 | 53 | def 
generate_numeric_input(self, column, column_config: Dict): 54 | feature_column = self.get_numeric_column(column, column_config) 55 | keras_input = Input(name=column.name, shape=[1]) 56 | return feature_column, keras_input, keras_input 57 | 58 | def generate_categorical_output(self, column, net, column_config: Dict): 59 | hash_bucket_size, embedding_dimensions, feature_column = \ 60 | self.create_categorical_column_with_hash_bucket(column, column_config) 61 | indicator_feature_column = tf.feature_column.indicator_column(feature_column) 62 | keras_output = Dense(hash_bucket_size, activation='relu', name="output_" + column.name)(net) 63 | loss = ("output_%s" % column.name, 'categorical_crossentropy', 1) 64 | output_metrics = ("output_%s" % column.name, "categorical_accuracy") 65 | return indicator_feature_column, keras_output, loss, output_metrics 66 | 67 | def generate_numeric_output(self, column, net, column_config: Dict): 68 | feature_column = self.get_numeric_column(column, column_config) 69 | keras_output = Dense(1, name="output_" + column.name)(net) 70 | loss = ("output_%s" % column.name, 'mean_squared_error', 1) 71 | output_metrics = ("output_%s" % column.name, 'mae') 72 | return feature_column, keras_output, loss, output_metrics 73 | 74 | def generate_input_feature_columns(self, input_columns, config: Dict): 75 | for column in input_columns: 76 | if column.name in config: 77 | column_config = config[column.name] 78 | if column_config["type"] == "categorical" and \ 79 | (column.type == int or column.type == str): 80 | yield self.generate_categorical_input(column, column_config) 81 | elif column_config["type"] == "float" and column.type == float: 82 | yield self.generate_numeric_input(column, column_config) 83 | elif column_config["type"] == "string" and column.type == str: 84 | yield self.generate_string_inputs(column, column_config) 85 | else: 86 | raise Exception(f"Unsupported Type for column {column.name}") 87 | 88 | def generate_output_feature_columns(self, output_columns, net: tf.keras.Model, config: Dict): 89 | for column in output_columns: 90 | if column.name in config: 91 | column_config = config[column.name] 92 | if column_config["type"] == "categorical" and \ 93 | (column.type == int or column.type == int): 94 | yield self.generate_categorical_output(column, net, column_config) 95 | elif column_config["type"] == "float" and column.type == float: 96 | yield self.generate_numeric_output(column, net, column_config) 97 | yield self.generate_numeric_output(column, net, column_config) 98 | else: 99 | raise Exception("Unsupported Type") 100 | 101 | def generate_inputs(self, input_columns, config: Dict): 102 | inputs = config["input"] 103 | input_columns = \ 104 | [column 105 | for column in input_columns 106 | if column.name in inputs] 107 | input_feature_columns = list(self.generate_input_feature_columns(input_columns, inputs)) 108 | input_columns, keras_inputs, preprocessed_keras_inputs = zip(*input_feature_columns) 109 | return input_columns, keras_inputs, preprocessed_keras_inputs 110 | 111 | def generate_outputs(self, input_columns, net, config: Dict) -> \ 112 | Tuple[List, List, Dict, Dict, Dict]: 113 | outputs = config["output"] 114 | output_columns = [column 115 | for column in input_columns 116 | if column.name in outputs] 117 | output_feature_columns = list(self.generate_output_feature_columns(output_columns, net, outputs)) 118 | output_columns, keras_outputs, losses, output_metrics = zip(*output_feature_columns) 119 | loss_weights = {name: weight for name, loss, weight in 
losses} 120 | losses = {name: loss for name, loss, weight in losses} 121 | output_metrics = {name: metrics for name, metrics in output_metrics} 122 | return output_columns, keras_outputs, losses, loss_weights, output_metrics 123 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/dataset_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.python.feature_column.feature_column import input_layer 4 | 5 | class DatasetUtils: 6 | 7 | def generator(self, ctx, epochs: int, batch_size: int, use_cache: bool): 8 | steps_per_epoch = ctx.size() // batch_size 9 | for epoch in range(epochs): 10 | for batch in range(steps_per_epoch): 11 | df = ctx.get_dataframe(num_rows=batch_size) 12 | if df is not None: 13 | to_dict = df.to_dict(orient="series") 14 | yield to_dict 15 | else: 16 | break 17 | if not use_cache: 18 | ctx.reset() 19 | 20 | def create_generator_dataset(self, ctx, epochs: int, batch_size: int, use_cache: bool, input_columns): 21 | ds = tf.data.Dataset.from_generator( 22 | lambda: self.generator(ctx, epochs, batch_size, use_cache), 23 | {column.name: np.dtype(column.type) for column in input_columns}, 24 | {column.name: tf.TensorShape([None]) for column in input_columns} 25 | ) 26 | return ds 27 | 28 | def add_feature_columns_to_dataset( 29 | self, dataset: tf.data.Dataset, input_columns, output_columns): 30 | dataset = dataset.map( 31 | lambda x: ( 32 | tuple(input_layer(x, column) for column in input_columns), 33 | tuple(input_layer(x, column) for column in output_columns) 34 | ), num_parallel_calls=4 35 | ).apply(tf.data.experimental.unbatch()) 36 | return dataset 37 | 38 | def create_dataset(self, dataset: tf.data.Dataset, 39 | input_columns, output_columns, 40 | batch_size: int, use_cache: bool): 41 | dataset = self.add_feature_columns_to_dataset(dataset, input_columns, output_columns) 42 | if use_cache: 43 | dataset = dataset.cache("cache").repeat() 44 | dataset = dataset.shuffle(1000, reshuffle_each_iteration=True) 45 | dataset = dataset.batch(batch_size, drop_remainder=True) 46 | return dataset 47 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/identity_feature_column.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python import tf_export, dtypes, collections, deprecation, tensor_shape 2 | from tensorflow.python.feature_column import feature_column as fc_old 3 | from tensorflow.python.feature_column.feature_column_v2 import _check_shape, _assert_key_is_string, DenseColumn, \ 4 | _FEATURE_COLUMN_DEPRECATION_DATE, _FEATURE_COLUMN_DEPRECATION, \ 5 | _check_config_keys 6 | from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib 7 | from tensorflow.python.ops import parsing_ops 8 | 9 | 10 | @tf_export('feature_column.identity_column') 11 | def identity_column(key, 12 | shape=(1,), 13 | dtype=dtypes.string, ): 14 | shape = _check_shape(shape, key) 15 | _assert_key_is_string(key) 16 | return IdentityColumn(key, shape=shape, dtype=dtype) 17 | 18 | 19 | class IdentityColumn( 20 | DenseColumn, 21 | fc_old._DenseColumn, 22 | collections.namedtuple( 23 | 'IdentityColumn', 24 | ('key', 'shape', 'dtype'))): 25 | """see `numeric_column`.""" 26 | 27 | @property 28 | def _is_v2_column(self): 29 | return True 30 | 31 | @property 32 | def name(self): 33 
| """See `FeatureColumn` base class.""" 34 | return self.key 35 | 36 | @property 37 | @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 38 | _FEATURE_COLUMN_DEPRECATION) 39 | def _parse_example_spec(self): 40 | return self.parse_example_spec 41 | 42 | @property 43 | def parse_example_spec(self): 44 | """See `FeatureColumn` base class.""" 45 | return { 46 | self.key: 47 | parsing_ops.FixedLenFeature(self.shape, self.dtype, 48 | self.default_value) 49 | } 50 | 51 | def _transform_input_tensor(self, input_tensor): 52 | if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 53 | raise ValueError( 54 | 'The corresponding Tensor of numerical column must be a Tensor. ' 55 | 'SparseTensor is not supported. key: {}'.format(self.key)) 56 | return input_tensor 57 | 58 | @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 59 | _FEATURE_COLUMN_DEPRECATION) 60 | def _transform_feature(self, inputs): 61 | input_tensor = inputs.get(self.key) 62 | return self._transform_input_tensor(input_tensor) 63 | 64 | def transform_feature(self, transformation_cache, state_manager): 65 | input_tensor = transformation_cache.get(self.key, state_manager) 66 | return self._transform_input_tensor(input_tensor) 67 | 68 | @property 69 | def variable_shape(self): 70 | """See `DenseColumn` base class.""" 71 | return tensor_shape.TensorShape(self.shape) 72 | 73 | @property 74 | @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 75 | _FEATURE_COLUMN_DEPRECATION) 76 | def _variable_shape(self): 77 | return self.variable_shape 78 | 79 | def get_dense_tensor(self, transformation_cache, state_manager): 80 | return transformation_cache.get(self, state_manager) 81 | 82 | @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 83 | _FEATURE_COLUMN_DEPRECATION) 84 | def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 85 | del weight_collections 86 | del trainable 87 | return inputs.get(self) 88 | 89 | @property 90 | def parents(self): 91 | """See 'FeatureColumn` base class.""" 92 | return [self.key] 93 | 94 | def _get_config(self): 95 | """See 'FeatureColumn` base class.""" 96 | config = dict(zip(self._fields, self)) 97 | config['dtype'] = self.dtype.name 98 | return config 99 | 100 | @classmethod 101 | def _from_config(cls, config, custom_objects=None, columns_by_name=None): 102 | """See 'FeatureColumn` base class.""" 103 | _check_config_keys(config, cls._fields) 104 | kwargs = config.copy() 105 | kwargs['dtype'] = dtypes.as_dtype(config['dtype']) 106 | return cls(**kwargs) 107 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/keras_layer.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | import tensorflow_hub as tfhub 3 | from tensorflow.python.keras.engine import InputSpec 4 | from tensorflow.python.layers.base import Layer 5 | 6 | 7 | class TFHubTextLayer(Layer): 8 | """ 9 | Layer that encapsulates the following: 10 | - Take full text level input 11 | - Return TFHub model's output according to provided input and output signature 12 | 13 | # Input Shape 14 | 1D string tensor with shape `(batch_size)` 15 | # Output Shape 16 | Determined by the output_key 17 | """ 18 | 19 | def __init__(self, output_key, module_uri, max_strlen=10000, **kwargs): 20 | self._name = "TFHubTextLayer" 21 | super(TFHubTextLayer, self).__init__(**kwargs) 22 | self.input_spec = InputSpec( 23 | ndim=2, dtype=tensorflow.string) 24 | 25 | self.output_key = output_key 
26 |         # TensorFlow Hub cannot handle unicode URIs, so coerce the URI to str 27 |         self.module_uri = str(module_uri) 28 |         self.max_strlen = max_strlen 29 | 30 |     def get_config(self): 31 |         config = { 32 |             'output_key': self.output_key, 33 |             'module_uri': self.module_uri, 34 |             'max_strlen': self.max_strlen, 35 |         } 36 |         base_config = super(TFHubTextLayer, self).get_config() 37 |         config.update(base_config) 38 |         return config 39 | 40 |     def build(self, input_shape): 41 |         self.embedder = tfhub.Module(self.module_uri, trainable=self.trainable) 42 |         self.embedder_spec = tfhub.load_module_spec(self.module_uri) 43 |         variables_ = [v for v in tensorflow.trainable_variables() if v in self.embedder.variables] 44 |         self.trainable_weights.extend(variables_) 45 |         self.weights.extend(variables_) 46 |         self.trainable_variables.extend(variables_) 47 |         super(TFHubTextLayer, self).build(input_shape) 48 | 49 |     def call(self, str_inp): 50 |         # let the TF-Hub module do the space 51 |         # tokenization for us 52 | 53 |         # a Keras Input cannot be defined with an ndim of 1, so squeeze the extra axis 54 |         str_inp_squeezed = tensorflow.squeeze(str_inp, axis=1) 55 | 56 |         # truncate each string to max_strlen to limit memory usage 57 |         str_inp_cutoff = tensorflow.strings.substr(str_inp_squeezed, 0, self.max_strlen) 58 | 59 |         return self.embedder(str_inp_cutoff, as_dict=True)[self.output_key] 60 | 61 |     def compute_output_shape(self, input_shape): 62 |         output_shape_spec = list(map(int, self.embedder_spec.get_output_info_dict()[self.output_key].get_shape()._dims)) 63 |         # change the first dimension to be whatever the batch size is 64 |         output_shape_spec[0] = input_shape[0] 65 |         return output_shape_spec -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/requirements.txt: -------------------------------------------------------------------------------- 1 | Keras 2 | Keras-Applications 3 | Keras-Preprocessing 4 | numpy 5 | pandas 6 | pyexasol 7 | python-dateutil 8 | pytz 9 | PyYAML 10 | scipy 11 | stopwatch.py 12 | tensorboard 13 | tensorflow 14 | tensorflow-hub 15 | requests -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/tensorflow_config.yaml: -------------------------------------------------------------------------------- 1 | columns: 2 |   input: 3 |     f_text_0: 4 |       type: "string" 5 |       module_url: "https://tfhub.dev/google/universal-sentence-encoder-large/3" 6 |       # dan_module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" 7 |     f_text_1: 8 |       type: "string" 9 |       module_url: "https://tfhub.dev/google/universal-sentence-encoder-large/3" 10 |     f_int_0: 11 |       type: "categorical" 12 |       hash_bucket_size: 100 13 |       embedding_dimensions: 100 14 |   output: 15 |     f_float_0: 16 |       type: "float" 17 |       min_value: 0 18 |       max_value: 1 19 |     f_int_1: 20 |       type: "categorical" 21 |       hash_bucket_size: 100 22 |       embedding_dimensions: 100 23 | use_cache: false 24 | batch_size: 100 25 | epochs: 5 26 | profile: true 27 | device: "/device:GPU:0" # "/cpu:0" 28 | model_load_bucketfs_path: 29 | model_save_bucketfs_url: "http://w@write:localhost:6583/default/tensorflow/save" 30 | model_temporary_save_path: "save" # UDFs need to write to /tmp -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/tensorflow_udf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 
import urllib.parse 4 | 5 | import requests 6 | import tensorflow as tf 7 | import yaml 8 | from tensorflow.python.keras.engine.training import Model 9 | from tensorflow.python.keras.layers import Dense, Concatenate 10 | 11 | from column_encoder import ColumnEncoder 12 | from dataset_utils import DatasetUtils 13 | from utils import Utils 14 | 15 | 16 | class TensorflowUDF(): 17 | CONNECTION_NAME = "tensorflow_config" 18 | 19 | def create_table_network(self, preprocessed_keras_inputs): 20 | concat = Concatenate()(list(preprocessed_keras_inputs)) 21 | net = Dense(100, activation='relu')(concat) 22 | net = Dense(100, activation='relu')(net) 23 | return net 24 | 25 | def read_config(self,exa): 26 | config_file_url = exa.get_connection(self.CONNECTION_NAME).address 27 | url_data = urllib.parse.urlparse(config_file_url) 28 | config_file = urllib.parse.unquote(url_data.path) 29 | with open(config_file) as file: 30 | config = yaml.load(file, yaml.Loader) 31 | with open(config_file) as file: 32 | print(file.read()) 33 | return config 34 | 35 | def run(self, ctx, exa, train:bool): 36 | session_config = tf.ConfigProto( 37 | allow_soft_placement=True, 38 | log_device_placement=False) 39 | session = tf.Session(config=session_config) 40 | tf.keras.backend.set_session(session) 41 | 42 | config = self.read_config(exa) 43 | batch_size = config["batch_size"] 44 | epochs = config["epochs"] 45 | steps_per_epoch = ctx.size() // batch_size 46 | use_cache = config["use_cache"] 47 | load_path = None 48 | if "model_load_bucketfs_path" in config: 49 | load_path = config["model_load_bucketfs_path"] 50 | save_url = None 51 | if "model_save_bucketfs_url" in config: 52 | save_url = config["model_save_bucketfs_url"] 53 | save_path = config["model_temporary_save_path"] 54 | dataset = DatasetUtils().create_generator_dataset( 55 | ctx, epochs, batch_size, use_cache, exa.meta.input_columns) 56 | 57 | with tf.device(config["device"]): 58 | input_columns, keras_inputs, preprocessed_keras_inputs = \ 59 | ColumnEncoder().generate_inputs( 60 | exa.meta.input_columns, config["columns"]) 61 | table_network = self.create_table_network(preprocessed_keras_inputs) 62 | output_columns, keras_outputs, losses, loss_weights, output_metrics = \ 63 | ColumnEncoder().generate_outputs( 64 | exa.meta.input_columns, table_network, config["columns"]) 65 | session.run(tf.tables_initializer()) 66 | 67 | dataset = DatasetUtils().create_dataset(dataset, 68 | input_columns, output_columns, 69 | batch_size, use_cache) 70 | 71 | session.run(tf.global_variables_initializer()) 72 | session.run(tf.local_variables_initializer()) 73 | 74 | dataset_iterator = dataset.make_initializable_iterator() 75 | session.run(dataset_iterator.initializer) 76 | 77 | saver = tf.train.Saver(max_to_keep=1,save_relative_paths=True) 78 | print("load_path",load_path,flush=True) 79 | if load_path is not None and load_path != "": 80 | initial_epoch = Utils().restore_model_and_get_inital_epoch(session, saver, load_path+"/checkpoints/tmp/save") 81 | else: 82 | initial_epoch = 0 83 | callbacks = Utils().create_callbacks(session, saver, save_path) 84 | 85 | model = Model(inputs=keras_inputs, outputs=keras_outputs) 86 | profile = config["profile"] 87 | profile_model_options = Utils().add_profiler(callbacks, profile, session, save_path) 88 | print(output_metrics, flush=True) 89 | model.compile(optimizer='rmsprop', loss=losses, loss_weights=loss_weights, metrics=output_metrics, 90 | **profile_model_options) 91 | print(model.summary(),flush=True) 92 | 93 | if train: 94 | print("Starting 
training",flush=True) 95 | history = model.fit(dataset_iterator, steps_per_epoch=steps_per_epoch, 96 | epochs=initial_epoch + epochs, verbose=2, callbacks=callbacks, 97 | initial_epoch=initial_epoch ) 98 | ctx.emit(str(history.history)) 99 | print("save_url", save_url,flush=True) 100 | if save_url != "" and save_url is not None: 101 | tarfile = f"/tmp/save" 102 | os.makedirs(tarfile,exist_ok=True) 103 | self.tar_save(save_path, tarfile) 104 | self.upload_save(save_url, tarfile) 105 | 106 | else: 107 | print("Starting prediction",flush=True) 108 | for i in range(steps_per_epoch): 109 | print(f"Predicting Batch {i}/steps_per_epoch",flush=True) 110 | output = model.predict(dataset_iterator, steps=1) 111 | ctx.emit(output) 112 | 113 | def upload_save(self, save_url, tarfile): 114 | print("Upload save", flush=True) 115 | with open(f"{tarfile}/metrics.tar", "rb") as f: 116 | requests.put(f"{save_url}/metrics.tar", data=f) 117 | with open(f"{tarfile}/checkpoints.tar", "rb") as f: 118 | requests.put(f"{save_url}/checkpoints.tar", data=f) 119 | 120 | def tar_save(self, save_path, tarfile): 121 | print("Tar save",flush=True) 122 | try: 123 | subprocess.check_output(f"tar -czf {tarfile}/metrics.tar {save_path}/metrics", shell=True) 124 | subprocess.check_output(f"tar -czf {tarfile}/checkpoints.tar {save_path}/checkpoints", shell=True) 125 | except subprocess.CalledProcessError as e: 126 | print(e) 127 | print(e.output, flush=True) 128 | -------------------------------------------------------------------------------- /examples/tensorflow-with-gpu-preview/tensorflow_udf/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import tensorflow as tf 4 | from tensorflow.python import keras 5 | from tensorflow.python.profiler import option_builder 6 | from tensorflow.python.profiler.model_analyzer import Profiler 7 | 8 | 9 | class Utils: 10 | 11 | def save_graph(self, epoch, logs, 12 | session: tf.Session, 13 | saver: tf.train.Saver, checkpoint_path: str, 14 | save_summary_writer: tf.summary.FileWriter): 15 | save_summary_writer.add_graph(session.graph) 16 | saver.save(session, save_path=f"{checkpoint_path}/{epoch}") 17 | 18 | def create_callbacks(self, session: tf.Session, 19 | saver: tf.train.Saver, 20 | save_path: str): 21 | checkpoint_path = self.get_checkpoint_path(save_path) 22 | save_summary_writer = tf.summary.FileWriter(checkpoint_path) 23 | save_callback = keras.callbacks.LambdaCallback( 24 | on_epoch_end=lambda epoch, logs: self.save_graph(epoch, logs, session, saver, checkpoint_path, 25 | save_summary_writer)) 26 | log_callback = \ 27 | keras.callbacks.TensorBoard( 28 | log_dir=f'{save_path}/metrics', histogram_freq=0, batch_size=32, 29 | write_graph=True, 30 | write_grads=False, 31 | write_images=False, embeddings_freq=0, 32 | embeddings_layer_names=None, 33 | embeddings_metadata=None, embeddings_data=None, 34 | update_freq='epoch') 35 | callbacks = [log_callback, save_callback] 36 | return callbacks 37 | 38 | def get_checkpoint_path(self, save_path): 39 | checkpoint_path = f"{save_path}/checkpoints" 40 | return checkpoint_path 41 | 42 | def restore_model_and_get_inital_epoch( 43 | self, session: tf.Session, 44 | saver: tf.train.Saver, 45 | load_path: str): 46 | print("load_path", load_path, flush=True) 47 | checkpoint_path = self.get_checkpoint_path(load_path) 48 | print("checkpoint_path",checkpoint_path, flush=True) 49 | latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path) 50 | print("latest_checkpoint", 
latest_checkpoint, flush=True) 51 | if latest_checkpoint is not None: 52 | saver.restore(session, latest_checkpoint) 53 | return int(pathlib.Path(latest_checkpoint).name) 54 | else: 55 | return 0 56 | 57 | def add_profile(self, epoch, logs, 58 | run_metadata: tf.RunMetadata, 59 | profiler: tf.profiler.Profiler, 60 | profile_writer: tf.summary.FileWriter, 61 | save_path: str): 62 | timeline_path = f"{save_path}/timeline" 63 | pathlib.Path(timeline_path).mkdir(exist_ok=True, parents=True) 64 | profiler.add_step(epoch, run_meta=run_metadata) 65 | opts = (option_builder.ProfileOptionBuilder( 66 | option_builder.ProfileOptionBuilder.time_and_memory()) 67 | .with_step(epoch) 68 | .with_timeline_output(f"{timeline_path}/step").build()) 69 | profiler.profile_graph(options=opts) 70 | profile_writer.add_run_metadata(run_metadata, f"step{epoch}") 71 | 72 | def add_profiler(self, callbacks, profile: bool, session: tf.Session, save_path: str): 73 | if profile: 74 | profiler = Profiler(session.graph) 75 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 76 | run_metadata = tf.RunMetadata() 77 | profile_writer = tf.summary.FileWriter(f"{save_path}/profile") 78 | profile_writer.add_graph(session.graph) 79 | profiler_callback = keras.callbacks.LambdaCallback( 80 | on_epoch_end=lambda batch, logs: self.add_profile(batch, logs, run_metadata, 81 | profiler, profile_writer, save_path)) 82 | callbacks.append(profiler_callback) 83 | additional_options = dict(options=options, run_metadata=run_metadata) 84 | else: 85 | additional_options = dict() 86 | return additional_options 87 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | ## Tutorials 2 | This section contains tutorials which show complete data science workflows on a realistic scenario and data. We are going to provide examples for different languages, frameworks, tasks and use cases. 3 | 4 | For general prerequisites, please refer to [Prerequisites](../README.md). 5 | 6 | **Currently, this repository is under development and we will add more and more tutorials in the future.** 7 | 8 | ### Overview 9 | 10 | * [Machine Learning](machine-learning) 11 | * [Python](machine-learning/python): 12 | * [AzureML](machine-learning/python/AzureML/Introduction.ipynb) 13 | * [Scikit-learn](machine-learning/python/scikit-learn) 14 | * [Sagemaker](machine-learning/python/sagemaker) Using AWS sagemaker for machine learning with Exasol 15 | * [SageMaker Extension](machine-learning/sagemaker-extension) 16 | * [Script-Language Container](script-languages) 17 | * [Spatial Analysis](spatial-analysis) 18 | * [Visualizing Spatial Queries](spatial-analysis/visualizing_spatial_queries) 19 | -------------------------------------------------------------------------------- /tutorials/machine-learning/README.md: -------------------------------------------------------------------------------- 1 | ## Machine Learning Tutorials 2 | This section contains tutorials for doing Machine Learning within the Exasol database. We are going to provide examples for different languages and frameworks, tasks and use cases. 3 | 4 | ### Languages: 5 | 6 | * [Python](python) 7 | * [Scikit-learn](python/scikit-learn): 8 | 9 | ### Prerequisites 10 | 11 | For general prerequisites, please refer to [Prerequisites](../README.md). 
However, these tutorials typically need a specific flavor of the [Script Language Container](https://github.com/exasol/script-languages) which has the required dependencies installed. For these purposes, we provide the python3-ds-* and the fancy-r-* flavors which already contain the dependencies for the frameworks used in these tutorials. Prepackaged releases for this flavor can be found on the [release page](https://github.com/exasol/script-languages/releases). 12 | -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/ConnectAzureMLtoExasol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Connect to Exasol from AzureML" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "source": [ 15 | "In this Tutorial we will:\n", 16 | " - Connect to Exasol SaaS from AzureML\n", 17 | " - Preprocess data\n", 18 | " - Export Exasol tables to an Azure Blobstore Container\n", 19 | " - Create a Datastore" 20 | ], 21 | "metadata": { 22 | "collapsed": false 23 | } 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "source": [ 28 | "## Prerequisites\n", 29 | "\n", 30 | "You will need:\n", 31 | " - Your running Exasol Saas Cluster with your data loaded into it\n", 32 | " - Authentication information for your Exasol Saas Cluster\n", 33 | " - An AzureML account and Azure Storage account\n", 34 | " - AzureML set up with a:\n", 35 | " - Workspace\n", 36 | " - Compute instance" 37 | ], 38 | "metadata": { 39 | "collapsed": false 40 | } 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "## Why using Azure blobstorage is necessary\n", 46 | "\n", 47 | "In this tutorial we copy the data from an Exasol Saas Database into an Azure Blob Storage Container. This is necessary because while AzureML has functionality to import directly from SQL databases, the Exasol SQL dialect is not supported by AzureML at the moment of writing.\n" 48 | ], 49 | "metadata": { 50 | "collapsed": false 51 | } 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "source": [ 56 | "## AzureML setup\n", 57 | "\n", 58 | "If you do not know how to set up your AzureML studio, please refer to the [AzureML documentation](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources).\n", 59 | "Once you are set up with a workspace and compute instance, you can copy this notebook into your notebook files. Open it and select your compute instance in the drop-down menu at the top of your notebook. Now we can get started with connecting to the Exasol Saas cluster.\n" 60 | ], 61 | "metadata": { 62 | "collapsed": false 63 | } 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "source": [ 68 | "### Connect to Exasol Saas\n", 69 | "\n", 70 | "\n", 71 | "We are going to use the [PyExasol](https://docs.exasol.com/db/latest/connect_exasol/drivers/python/pyexasol.htm) package in order to connect to the Exasol database and read the data. First we need to install PyExasol using pip in your AzureML Compute." 
72 | ], 73 | "metadata": { 74 | "collapsed": false 75 | } 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "outputs": [], 81 | "source": [ 82 | "!pip install pyexasol" 83 | ], 84 | "metadata": { 85 | "collapsed": false, 86 | "pycharm": { 87 | "name": "#%%\n" 88 | } 89 | } 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "source": [ 94 | "Then we need to connect with PyExasol to our Exasol Saas Cluster with the data. Change these values to reflect your Cluster.\n", 95 | "We ask for 10 lines of our \"IDA.TEST\" table from the [Scania Trucks](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge) to check if our connection is working." 96 | ], 97 | "metadata": { 98 | "collapsed": false 99 | } 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "outputs": [], 105 | "source": [ 106 | "import pyexasol\n", 107 | "import pandas\n", 108 | "\n", 109 | "EXASOL_HOST = \".clusters.exasol.com\" # change\n", 110 | "EXASOL_PORT = \"8563\" # change if needed\n", 111 | "EXASOL_USER = \"\" # change\n", 112 | "EXASOL_PASSWORD = \"exa_pat_\" # change\n", 113 | "EXASOL_SCHEMA = \"IDA\" # change if needed\n", 114 | "\n", 115 | "# get the connection\n", 116 | "EXASOL_CONNECTION = \"{host}:{port}\".format(host=EXASOL_HOST, port=EXASOL_PORT)\n", 117 | "exasol = pyexasol.connect(dsn=EXASOL_CONNECTION, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True)\n", 118 | "\n", 119 | "# check if the connection is working\n", 120 | "exasol.export_to_pandas(\"SELECT * FROM TABLE IDA.TEST LIMIT 10\")" 121 | ], 122 | "metadata": { 123 | "collapsed": false, 124 | "pycharm": { 125 | "name": "#%%\n" 126 | } 127 | } 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "source": [ 132 | "We will also need to get access to the Azure Storage Account, which we will use later to transfer the data. For that, you need to insert your Azure Storage Account Name and Access Key. To find your Access Key, in the Azure portal navigate to your Storage Account, and click on \"Access Keys\" under \"Security + networking\" and copy one of your Access Keys.\n", 133 | "\n", 134 | "![](img_src/access_key_azure.png)\n" 135 | ], 136 | "metadata": { 137 | "collapsed": false 138 | } 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "outputs": [], 144 | "source": [ 145 | "from azure.ai.ml.entities import AccountKeyConfiguration\n", 146 | "\n", 147 | "my_storage_account_name = \"your_storage_account_name\" # change\n", 148 | "account_key=\"your_storage_account_key\" # change\n", 149 | "\n", 150 | "credentials= AccountKeyConfiguration(account_key)" 151 | ], 152 | "metadata": { 153 | "collapsed": false, 154 | "pycharm": { 155 | "name": "#%%\n" 156 | } 157 | } 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "source": [ 162 | "### Data Preprocessing\n", 163 | "\n", 164 | "Now that we are set up for the data transfer, we are first going to preprocess the data in the Exasol Database before pulling the data into Azure. We want to replace the text based \"CLASS\" column all data tables with a boolean column called \"CLASS_POS\" which will make classifying easier.\n", 165 | "\n", 166 | "For your own project, you need to evaluate which preprocessing steps to run in the efficient Exasol Database and which might be easier to accomplish later on the CSV files in Azure Blob Storage.\n", 167 | "\n", 168 | "First, we create a new table \"TRAIN_PREPARED\" which is a copy of the \"TRAIN\" table, with the replaced \"CLASS_POS\" column." 
169 | ], 170 | "metadata": { 171 | "collapsed": false 172 | } 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "outputs": [], 178 | "source": [ 179 | "all_columns = exasol.export_to_pandas(\"SELECT * FROM IDA.TRAIN LIMIT 1;\")\n", 180 | "column_names = list(all_columns)\n", 181 | "column_names.remove(\"CLASS\")\n", 182 | "exasol.execute(\"\"\"CREATE OR REPLACE TABLE IDA.TRAIN_PREPARED AS (\n", 183 | " SELECT\n", 184 | " (CLASS = 'pos') as CLASS_POS, {all_columns_except_class!q} FROM IDA.TRAIN)\"\"\",\n", 185 | " {\"all_columns_except_class\": column_names})\n", 186 | "\n", 187 | "\n", 188 | "\n", 189 | "exasol.export_to_pandas(\"SELECT * FROM IDA.TRAIN_PREPARED LIMIT 4\")" 190 | ], 191 | "metadata": { 192 | "collapsed": false, 193 | "pycharm": { 194 | "name": "#%%\n" 195 | } 196 | } 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "source": [ 201 | "Then we create a new \"TEST_PREPARED\" table as a copy of the \"TEST\" table with replaced \"CLASS_POS\" column." 202 | ], 203 | "metadata": { 204 | "collapsed": false 205 | } 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "outputs": [], 211 | "source": [ 212 | "exasol.execute(\"\"\"CREATE OR REPLACE TABLE IDA.TEST_PREPARED AS (\n", 213 | " SELECT\n", 214 | " (CLASS = 'pos') as CLASS_POS, {all_columns_except_class!q} FROM IDA.TEST)\"\"\",\n", 215 | " {\"all_columns_except_class\": column_names})\n", 216 | "\n", 217 | "\n", 218 | "\n", 219 | "exasol.export_to_pandas(\"SELECT * FROM IDA.TEST_PREPARED LIMIT 4\")" 220 | ], 221 | "metadata": { 222 | "collapsed": false, 223 | "pycharm": { 224 | "name": "#%%\n" 225 | } 226 | } 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "source": [ 231 | "### Load Data into AzureML Blob Storage\n", 232 | "\n", 233 | "Now that our data is prepared and we have access to our Azure Storage Account and our Exasol Saas Cluster, we use an \"EXPORT TABLE\" command for each of our data tables to export them into a CSV file in our Blob Storage using \"INTO CSV AT CLOUD AZURE BLOBSTORAGE\". You can find [the domumentation for this export command](https://docs.exasol.com/db/latest/sql/export.htm) in the Exasol documentation.\n", 234 | "If you choose an existing Azure Blob Storage container, this command will save your files in this container. Otherwise, a new container with the given name will be created automatically.\n", 235 | "When you created your AzureML Workspace, an Azure Blob Container was [created automatically](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-data) and added as a Datastore named \"workspaceblobstore\" to your workspace. You can use it here and then skip the \"Create a Datastore\" step below if you want. For this you would need to find its name (\"azureml-blobstore-some-ID\") in the Datastore Information and insert it here." 236 | ], 237 | "metadata": { 238 | "collapsed": false 239 | } 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "source": [ 244 | "Some of the 170 features of the Scania Trucks dataset do not have a notable influence on the classification or contain a big amount of empty values. Because of this we select only some columns to actually use for the training. Since we only want to use them, we import only these features to Azure.\n", 245 | "\n", 246 | "Once we have selected the column names we want to use, we transfer the \"TEST_PREPARED\" table using the exasol EXPORT command." 
247 | ], 248 | "metadata": { 249 | "collapsed": false 250 | } 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "outputs": [], 256 | "source": [ 257 | "table = \"TEST_PREPARED\"\n", 258 | "column_names = ['CLASS_POS', 'AA_000', 'AG_005', 'AH_000', 'AL_000', 'AM_0', 'AN_000', 'AO_000', 'AP_000', 'AQ_000',\n", 259 | " 'AZ_004', 'BA_002', 'BB_000', 'BC_000', 'BD_000', 'BE_000',\n", 260 | " 'BF_000', 'BG_000', 'BH_000', 'BI_000', 'BJ_000', 'BS_000', 'BT_000', 'BU_000', 'BV_000',\n", 261 | " 'BX_000', 'BY_000', 'BZ_000', 'CA_000', 'CB_000', 'CC_000', 'CI_000', 'CN_004', 'CQ_000',\n", 262 | " 'CS_001', 'DD_000', 'DE_000', 'DN_000', 'DS_000', 'DU_000', 'DV_000', 'EB_000', 'EE_005']\n", 263 | "\n", 264 | "blobstorage_name = \"azureml-tutorial\" # change, remember to you might need to remove the \"_datastore\" suffix\n", 265 | "\n", 266 | "save_path = f'{blobstorage_name}/ida/{table}'\n", 267 | "sql_export = \"\"\"EXPORT (SELECT {column_names!q} FROM IDA.{table!q})\n", 268 | " INTO CSV AT CLOUD AZURE BLOBSTORAGE 'DefaultEndpointsProtocol=https;EndpointSuffix=core.windows.net'\n", 269 | " USER '{my_storage_account_name!q}' IDENTIFIED BY '{account_key!q}'\n", 270 | " FILE '{save_path!q}' WITH COLUMN NAMES REPLACE\"\"\"\n", 271 | "\n", 272 | "\n", 273 | "exasol.execute(sql_export, {\"column_names\": column_names,\n", 274 | " \"table\": table,\n", 275 | " \"my_storage_account_name\": my_storage_account_name,\n", 276 | " \"account_key\": credentials.account_key,\n", 277 | " \"save_path\": save_path})\n", 278 | "print(f\"saved {table} in file {save_path}\")" 279 | ], 280 | "metadata": { 281 | "collapsed": false, 282 | "pycharm": { 283 | "name": "#%%\n" 284 | } 285 | } 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "source": [ 290 | "Then we do the same with the TRAIN_PREPARED table:" 291 | ], 292 | "metadata": { 293 | "collapsed": false 294 | } 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "outputs": [], 300 | "source": [ 301 | "table = \"TRAIN_PREPARED\"\n", 302 | "save_path = f'{blobstorage_name}/ida/{table}'\n", 303 | "\n", 304 | "exasol.execute(sql_export, {\"column_names\": column_names,\n", 305 | " \"table\": table,\n", 306 | " \"my_storage_account_name\": my_storage_account_name,\n", 307 | " \"account_key\": credentials.account_key,\n", 308 | " \"save_path\": save_path})\n", 309 | "print(f\"saved {table} in file {save_path}\")" 310 | ], 311 | "metadata": { 312 | "collapsed": false, 313 | "pycharm": { 314 | "name": "#%%\n" 315 | } 316 | } 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "source": [ 321 | "Delete the temporary tables from the Exasol Saas Database in order to not pollute the database." 322 | ], 323 | "metadata": { 324 | "collapsed": false 325 | } 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "outputs": [], 331 | "source": [ 332 | "for table in [\"TRAIN_PREPARED\", \"TEST_PREPARED\"]:\n", 333 | " exasol.execute(f\"DROP TABLE IDA.{table};\")" 334 | ], 335 | "metadata": { 336 | "collapsed": false, 337 | "pycharm": { 338 | "name": "#%%\n" 339 | } 340 | } 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "source": [ 345 | "You can check the success of the command by navigating to your Container in the Azure portal using your Azure storage account.\n", 346 | "In the menu on the left, you can find \"Containers\" under \"Data Storage\". Find the container named \"your-container-name\" and click on it. 
Your files should be there.\n" 347 | ], 348 | "metadata": { 349 | "collapsed": false 350 | } 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "source": [ 355 | "### Create a Datastore\n", 356 | "\n", 357 | "We recommend that you create a connection between your Azure Storage Container and your AzureML Workspace. For this, enter your workspace in AzureML Studio and select \"Data\" under \"Assets\" in the menu on the left. Now select \"Datastores\" and click on \"+Create\".\n", 358 | "\n", 359 | "![](img_src/create_datastore.png)\n", 360 | "\n", 361 | "In the view that opens you need to enter the info for your datastore. Enter a name and select the type as \"Azure Blob Storage\". Then select your Azure subscription and the blob container we loaded the data into from the drop-down menu. Use Authentication type Account key and enter your Azure storage account access key. Click create." 362 | ], 363 | "metadata": { 364 | "collapsed": false 365 | } 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "source": [ 370 | "![](img_src/data_blobstore.png)\n", 371 | "\n", 372 | "You can now see your data directly in AzureML by navigating to \"Datastores\" and clicking on . If you then change into the \"Browse\" view you can open your files and have a look at them if you want.\n", 373 | "\n", 374 | "\n", 375 | "Great, we successfully connected to our Exasol Saas instance and loaded data from there into our Azure Blob Storage!\n", 376 | "\n", 377 | "Now we move on to [working with the data in AzureML and training a model on it](TrainModelInAzureML.ipynb)." 378 | ], 379 | "metadata": { 380 | "collapsed": false 381 | } 382 | } 383 | ], 384 | "metadata": { 385 | "kernelspec": { 386 | "display_name": "Python 3", 387 | "language": "python", 388 | "name": "python3" 389 | }, 390 | "language_info": { 391 | "codemirror_mode": { 392 | "name": "ipython", 393 | "version": 2 394 | }, 395 | "file_extension": ".py", 396 | "mimetype": "text/x-python", 397 | "name": "python", 398 | "nbconvert_exporter": "python", 399 | "pygments_lexer": "ipython2", 400 | "version": "2.7.6" 401 | } 402 | }, 403 | "nbformat": 4, 404 | "nbformat_minor": 0 405 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Apply Microsoft's AzureML training on Exasol data" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "source": [ 15 | "In this series of tutorials you will learn how to :\n", 16 | "\n", 17 | " - Connect your Exasol (Saas) Database to AzureML ([link](ConnectAzureMLtoExasol.ipynb))\n", 18 | " - Use your data from Exasol to train a Machine learning model in AzureML ([link](TrainModelInAzureML.ipynb))\n", 19 | " - Invoke your trained model from your Exasol Database and receive the results directly into your Database ([link](InvokeModelFromExasolDBwithUDF.ipynb))\n", 20 | " - Export the trained model to Exasol and run it directly in the Database using UDF's ([link](InvokeModelFromExasolDBwithUDF.ipynb))\n" 21 | ], 22 | "metadata": { 23 | "collapsed": false 24 | } 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "source": [ 29 | "## Who this tutorial is for\n", 30 | "\n", 31 | "If you are an Exasol user and want to make more out of your data using Azure Machine Learning, this tutorial shows you how to get started. 
Or maybe you are already using Azure Machine Learning to analyze your data but are interested in hosting the data somewhere else, preferably in an analytic database to get the best out of your data. Then this tutorial might give you an insight into how easy it is to switch out your database while not having to disrupt or rebuild your Machine Learning processes." 32 | ], 33 | "metadata": { 34 | "collapsed": false 35 | } 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "source": [ 40 | "## What this tutorial does not cover\n", 41 | " - Using the exasol data in AzureML directly (not supported by AzureML)\n", 42 | " - Setting training parameters/invoking training automatically from Exasol\n", 43 | " - Check training/prediction progress from Exasol\n", 44 | " - Get training parameters of trained model from Exasol\n", 45 | " - Check if the model is running from Exasol\n", 46 | " - Monitor AzureML node utilisation from Exasol\n", 47 | "\n", 48 | "Many of these things are possible but not covered here. After finishing this tutorial you should have the necessary tools to get started implementing solutions for these tasks though, provided you know your way around Azure and AzureML." 49 | ], 50 | "metadata": { 51 | "collapsed": false 52 | } 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "source": [ 57 | "## Prerequisites\n", 58 | " - For this tutorial we will use an [Exasol Saas](https://www.exasol.com/exasol-saas/) instance. Other versions of Exasol should also work as long as you are able to connect to them via PyExasol.\n", 59 | " - [Microsoft AzureML Studio](https://studio.azureml.net/) access (works in conjunction with [Microsoft Azure](https://azure.microsoft.com/de-de/free/search/?&ef_id=EAIaIQobChMIkZ-J_bzg_QIVCthRCh3Uyga7EAAYASAAEgIK0PD_BwE:G:s&OCID=AIDcmmzzaokddl_SEM_EAIaIQobChMIkZ-J_bzg_QIVCthRCh3Uyga7EAAYASAAEgIK0PD_BwE:G:s&gclid=EAIaIQobChMIkZ-J_bzg_QIVCthRCh3Uyga7EAAYASAAEgIK0PD_BwE))\n", 60 | " - The [PyExasol package](https://pypi.org/project/pyexasol/) ([documentation](https://docs.exasol.com/db/latest/connect_exasol/drivers/python/pyexasol.htm))" 61 | ], 62 | "metadata": { 63 | "collapsed": false 64 | } 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "source": [ 69 | "## Setting up Exasol SaaS for this tutorial\n", 70 | "\n", 71 | "Firstly, if you do not have an accounr aleady, [sign up for Exasol Saas free trail](https://cloud.exasol.com/signup?_gl=1*l5pvjo*_ga*MTAwNTY5MzY5NC4xNjc2Mzc3NzA2*_ga_3M805TBTX9*MTY3Nzc2MDM1MC4yLjAuMTY3Nzc2MDM1MC42MC4wLjA.), or sign in to your existing account.\n", 72 | "\n", 73 | "Once signed in, click the \"Add database\" button on the top left, choose a database name and your preferred region, then click \"next\" define your first database cluster by setting a cluster name and your preferred cluster size. 
If you are on the free trial the price for your cluster will be deducted from your free credits.\n", 74 | "You can also set the automatic shutoff time for your cluster in this view (or change it later).\n", 75 | "\n", 76 | "![](img_src/cluster_creation.png)\n", 77 | "\n", 78 | "We choose the smallest available cluster size (XSmall, 8 vCPU, 64 GB Memory) for this tutorial as it is sufficient.\n", 79 | " When you are ready click \"Add database\" and your cluster will be set up and started (this might take some time).\n" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "pycharm": { 84 | "name": "#%% md\n" 85 | } 86 | } 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "source": [ 91 | "### Allow connection from AzureML\n", 92 | "\n", 93 | "Once you set up your Exasol database, you need to allow incoming connections from AzureML Studio. You also need to get the connection info for your cluster, so we can use it later to set up the PyExasol connection to your cluster.\n", 94 | "\n", 95 | "Firstly you need to find the public IP of your AzureML compute instance. If you do not yet have an AzureML compute instance you will need to [set one up](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources).\n", 96 | "To find the public IP, click the \"Compute\" entry in the menu on the left in AzureML Studio below \"Manage\". Find your compute and open it. Toward the bottom you will find the \"Public IP\".\n", 97 | "\n", 98 | "![](img_src/azureML_public_ip.png)\n", 99 | "\n", 100 | "Now you need to register the IP with your Saas Database to allow incoming requests form your AzureML compute.\n", 101 | "In the Saas portal, navigate to your Cluster and click on \"Connect via tools\" on the right. Enter the IP of your AzureML compute instance\n", 102 | "and \"Add\". Then click \"next\" two times. You will see a screen \"Connection details.\".\n", 103 | "\n", 104 | "![](img_src/connection_detail_generate.png)\n", 105 | "\n", 106 | " Click the bottom column to generate a Personal access token for your Database.\n", 107 | "\n", 108 | "![](img_src/connection_details_acess_token.png)\n", 109 | "\n", 110 | "We will use this token to connect to the database from AzureML.\n", 111 | "**Remember the connection string, port, user-name and access token/password.**\n", 112 | "If you accidentally added a wrong IP you can remove them again under \"Security\"\n", 113 | "\n", 114 | "Now that you got your connection information, run [this notebook](../sagemaker/LoadExampleDataIntoExasol.ipynb) to load the [Scania Trucks](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge) dataset into your Exasol Saas Instance (Don't forget to change the connection info in the first cell).\n", 115 | "You can run this from your AzureML cluster, or from your local machine(remember to add your local IP beforehand like we did the AzureML cluster IP above).\n", 116 | "\n", 117 | "If want to use other tool for your data upload multiple ways are documented [here](https://docs.exasol.com/saas/connect_exasol.htm).\n", 118 | "\n", 119 | "Now we have our Exasol Saas set up with some data to play around with, we can move on to [next part of the tutorial](ConnectAzureMLtoExasol.ipynb)." 
120 | ], 121 | "metadata": { 122 | "collapsed": false, 123 | "pycharm": { 124 | "name": "#%% md\n" 125 | } 126 | } 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 2 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython2", 145 | "version": "2.7.6" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 0 150 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/TrainModelInAzureML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Train an ML model on Exasol Data\n", 7 | "\n", 8 | "In this tutorial, you will load the data from Azure Blob Storage, and run a Python script as an AzureML job to preprocess the data and train a simple scikit-learn model. Then You will register the trained model with AzureML for further use." 9 | ], 10 | "metadata": { 11 | "collapsed": false 12 | } 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "source": [ 17 | "## Prerequisites\n", 18 | "You completed the [previous part of this tutorial series](ConnectAzureMLtoExasol.ipynb) and therefore have:\n", 19 | " - A running AzureML compute instance\n", 20 | " - An Azure Storage account\n", 21 | " - The [Scania Trucks](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge) dataset loaded into Azure Blob Storage\n" 22 | ], 23 | "metadata": { 24 | "collapsed": false 25 | } 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "source": [ 30 | "## Python script for training the model\n", 31 | "\n", 32 | "We will use a Python script to create and train a SciKit-Learn model on the data we loaded from Exasol. You can find the script [here](main.py).\n", 33 | "The script loads the data from the files we saved in the Azure Blob Storage, does data preprocessing to combat the unbalanced nature of the dataset and removes empty values so the training can work properly.\n", 34 | "Then, it creates a simple SciKit-Learn model and trains it on the data. The model is evaluated using the test dataset and registered in the AzureML Workspace using MLflow.\n", 35 | "\n", 36 | "This script creates a model that only uses Python packages available in Exasol Saas UDFs natively. This means you can upload this model directly to your exasol Database and run it using an UDF. If your own models use different packages but you still need to run them on the cluster directly you need to [build and install yout own Script-Language Container](https://docs.exasol.com/db/latest/database_concepts/udf_scripts/adding_new_packages_script_languages.htm). Information on which packages are supported out of the box can be found [here](https://docs.exasol.com/saas/database_concepts/udf_scripts/python3.htm).\n" 37 | ], 38 | "metadata": { 39 | "collapsed": false 40 | } 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "## Prepare AzureML studio to run the Python script\n", 46 | "\n", 47 | "This notebook is meant to be run in AzureML Studio, so upload it to your Notebooks, open it and select your compute instance in the drop-down menu at the top of your notebook. 
The same steps can be achieved by accessing AzureML using remote scripts, but for demonstration purposes we use AzureML Studio here.\n", 48 | "\n", 49 | "First, we install some AzureML functionality." 50 | ], 51 | "metadata": { 52 | "collapsed": false 53 | } 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "outputs": [], 59 | "source": [ 60 | "!pip install azure-identity\n", 61 | "!pip install azure-ai-ml==1.3.0" 62 | ], 63 | "metadata": { 64 | "collapsed": false, 65 | "pycharm": { 66 | "name": "#%%\n" 67 | } 68 | } 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "source": [ 73 | "Then, we create an MLClient for accessing our AzureML jobs programmatically. For this we need our AzureML subscription id, resource group name and workspace name. If you are not sure what your resource group name is, you can find it by clicking your subscription in the top left oft AzureML Studio\n", 74 | "Make sure to use the workspace you set up in the previous tutorial.\n", 75 | "\n", 76 | "![](img_src/resource_group.png)" 77 | ], 78 | "metadata": { 79 | "collapsed": false 80 | } 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "outputs": [], 86 | "source": [ 87 | "# Handle to the workspace\n", 88 | "from azure.ai.ml import MLClient\n", 89 | "\n", 90 | "# Authentication package\n", 91 | "from azure.identity import DefaultAzureCredential\n", 92 | "\n", 93 | "credential = DefaultAzureCredential()\n", 94 | "# Get a handle to the workspace\n", 95 | "ml_client = MLClient(\n", 96 | " credential=credential,\n", 97 | " subscription_id=\"\", # change\n", 98 | " resource_group_name=\"\", # change\n", 99 | " workspace_name=\"\", # change\n", 100 | ")" 101 | ], 102 | "metadata": { 103 | "collapsed": false, 104 | "pycharm": { 105 | "name": "#%%\n" 106 | } 107 | } 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "### Create a new Python Environment\n", 113 | "\n", 114 | "To run our Python script we need to create a new environment and install the required dependencies. For this, we first create a new directory called \"dependencies\"." 115 | ], 116 | "metadata": { 117 | "collapsed": false 118 | } 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "outputs": [], 124 | "source": [ 125 | "#make env\n", 126 | "import os\n", 127 | "\n", 128 | "dependencies_dir = \"./dependencies\"\n", 129 | "os.makedirs(dependencies_dir, exist_ok=True)" 130 | ], 131 | "metadata": { 132 | "collapsed": false, 133 | "pycharm": { 134 | "name": "#%%\n" 135 | } 136 | } 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "source": [ 141 | "In order for our model to be usable in the Exasol Saas Database later, we need to make sure the SciKit-learn version we use matches the version in Saas." 
142 | ], 143 | "metadata": { 144 | "collapsed": false 145 | } 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "outputs": [], 151 | "source": [ 152 | "%%writefile {dependencies_dir}/conda.yml\n", 153 | "name: model-env\n", 154 | "channels:\n", 155 | " - conda-forge\n", 156 | "dependencies:\n", 157 | " - python=3.8\n", 158 | " - numpy=1.21.2\n", 159 | " - scikit-learn=1.0.2\n", 160 | " - pandas>=1.1,<1.2\n", 161 | " - pip:\n", 162 | " - inference-schema[numpy-support]==1.3.0\n", 163 | " - mlflow== 1.26.1\n", 164 | " - azureml-mlflow==1.42.0\n" 165 | ], 166 | "metadata": { 167 | "collapsed": false, 168 | "pycharm": { 169 | "name": "#%%\n" 170 | } 171 | } 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "source": [ 176 | "Next, we will create a new environment to run our job in. We will use the new dependencies file and use an Ubuntu image as the base for our environment. Then we will create the new environment on our *MLClient*." 177 | ], 178 | "metadata": { 179 | "collapsed": false 180 | } 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 1, 185 | "outputs": [ 186 | { 187 | "ename": "ModuleNotFoundError", 188 | "evalue": "No module named 'azure'", 189 | "output_type": "error", 190 | "traceback": [ 191 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 192 | "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", 193 | "Input \u001B[0;32mIn [1]\u001B[0m, in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mazure\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mai\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mml\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mentities\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Environment\n\u001B[1;32m 2\u001B[0m custom_env_name \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;66;03m# change\u001B[39;00m\n\u001B[1;32m 4\u001B[0m pipeline_job_env \u001B[38;5;241m=\u001B[39m Environment(\n\u001B[1;32m 5\u001B[0m name\u001B[38;5;241m=\u001B[39mcustom_env_name,\n\u001B[1;32m 6\u001B[0m description\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCustom environment for azureML tut\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 9\u001B[0m image\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 10\u001B[0m )\n", 194 | "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'azure'" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "from azure.ai.ml.entities import Environment\n", 200 | "custom_env_name = \"\" # change\n", 201 | "\n", 202 | "pipeline_job_env = Environment(\n", 203 | " name=custom_env_name,\n", 204 | " description=\"Custom environment for AzureML tutorial\",\n", 205 | " tags={\"scikit-learn\": \"1.0.2\"},\n", 206 | " conda_file=os.path.join(dependencies_dir, \"conda.yml\"),\n", 207 | " image=\"mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest\",\n", 208 | ")\n", 209 | "pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)\n", 210 | "\n", 211 | "print(\n", 212 | " f\"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}.\"\n", 213 | ")" 214 | ], 
215 | "metadata": { 216 | "collapsed": false, 217 | "pycharm": { 218 | "name": "#%%\n" 219 | } 220 | } 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "source": [ 225 | "## Run the Python script\n", 226 | "\n", 227 | "Now we need to create an AzureML job with the following inputs:\n", 228 | "\n", 229 | " - The path to the Python script\n", 230 | " - A command to run the script\n", 231 | " - Information which AzureML Compute and Environment to use\n", 232 | "\n", 233 | "This job will be used to run our Python script on our Compute using the environment we created in the step before.\n", 234 | "The script takes links to the data files we loaded ino Azure Blob Storage in the previous tutorial as input. You can find these links by naviating to your data files in your data store and clicking the kebab menu besides each file. A drop down menu will open where you can select the \"Copy URI\" option. This opens a pop-up window where you can copy the link to the file.\n", 235 | "![](img_src/get_data_link.png)\n", 236 | "\n", 237 | "This opens a pop-up window where you can copy the link to the file.\n", 238 | "![](img_src/get_data_link_2.png)\n", 239 | "\n", 240 | "Also don't forget to change the variables for your Compute.\n" 241 | ], 242 | "metadata": { 243 | "collapsed": false 244 | } 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "outputs": [], 250 | "source": [ 251 | "from azure.ai.ml import command\n", 252 | "from azure.ai.ml import Input\n", 253 | "from azure.ai.ml.constants import AssetTypes\n", 254 | "\n", 255 | "\n", 256 | "job = command(\n", 257 | " inputs=dict(\n", 258 | " train_data=Input(\n", 259 | " type=AssetTypes.URI_FILE,\n", 260 | " path=\"< link to training data file >\", # change\n", 261 | " ),\n", 262 | " test_data=Input(\n", 263 | " type=AssetTypes.URI_FILE,\n", 264 | " path=\"< link to test data file >\", # change\n", 265 | " ),\n", 266 | " learning_rate=0.05\n", 267 | " ),\n", 268 | " code=\".\", # location of source code\n", 269 | " command=\"python main.py --train_data ${{inputs.train_data}} --test_data ${{inputs.test_data}} --learning_rate ${{inputs.learning_rate}}\",\n", 270 | " environment=pipeline_job_env,\n", 271 | " compute=\"\", # change\n", 272 | " experiment_name=\"\", # change\n", 273 | " display_name=\"\", # change\n", 274 | ")\n" 275 | ], 276 | "metadata": { 277 | "collapsed": false, 278 | "pycharm": { 279 | "name": "#%%\n" 280 | } 281 | } 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "source": [ 286 | "Now, we can run the script on our compute instance. A link will show up below, which you can click on to see the job details and output logs." 287 | ], 288 | "metadata": { 289 | "collapsed": false 290 | } 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "outputs": [], 296 | "source": [ 297 | "ml_client.create_or_update(job)" 298 | ], 299 | "metadata": { 300 | "collapsed": false, 301 | "pycharm": { 302 | "name": "#%%\n" 303 | } 304 | } 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "source": [ 309 | "Here is the Confusion Matrix of our trained model.\n", 310 | "\n", 311 | "\n", 312 | "| | predicted neg | predicted pos |\n", 313 | "|------------|----------------|----------------|\n", 314 | "|actual neg | 14841 | 784 |\n", 315 | "|actual pos | 13 | 362 |\n", 316 | "\n", 317 | "The model has a total cost of 14340 according to the ida-score we implemented in accordance to the problem description of the Scania Trucks dataset." 
318 | ], 319 | "metadata": { 320 | "collapsed": false 321 | } 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "source": [ 326 | "## Save the trained model\n", 327 | "\n", 328 | "The script will directly register the trained model in your AzureML Workspace, so you can use it to run inference in AzureML. It will also save the model in the output of the job. From there, you can extract it to run it in your Exasol cluster. You can find your registered model under the Assets, Model entry in the AzureML Studio menu on the left.\n", 329 | "\n", 330 | "![](img_src/registered_model.png)\n", 331 | "\n", 332 | "Now that we have trained and registered a model on the data we imported from our Exasol Saas instance, we can move on to the\n", 333 | "[next part](InvokeModelFromExasolDBwithUDF.ipynb), where we will use this model from with in our Exasol Cluster to classify some data." 334 | ], 335 | "metadata": { 336 | "collapsed": false 337 | } 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 2 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython2", 356 | "version": "2.7.6" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 0 361 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/access_key_azure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/access_key_azure.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/azureML_public_ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/azureML_public_ip.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/cluster_creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/cluster_creation.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/conda_file_artifact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/conda_file_artifact.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/connection_detail_generate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/connection_detail_generate.png 
-------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/connection_details_acess_token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/connection_details_acess_token.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/consume_endpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/consume_endpoint.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/create_datastore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/create_datastore.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/data_blobstore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/data_blobstore.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/download_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/download_all.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/download_file_arifact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/download_file_arifact.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/file_path_bucketfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/file_path_bucketfs.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/get_data_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/get_data_link.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/get_data_link_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/get_data_link_2.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/manage_udf_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/manage_udf_files.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/registered_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/registered_model.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/img_src/resource_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/python/AzureML/img_src/resource_group.png -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.pipeline import Pipeline 8 | from sklearn.impute import SimpleImputer 9 | from sklearn.preprocessing import StandardScaler 10 | 11 | from sklearn.ensemble import ExtraTreesClassifier 12 | from sklearn.metrics import make_scorer 13 | from sklearn.model_selection import ParameterGrid 14 | from sklearn.model_selection import GridSearchCV 15 | from sklearn.metrics import confusion_matrix 16 | 17 | import mlflow 18 | import mlflow.sklearn 19 | 20 | 21 | def main(): 22 | """Main function of the script.""" 23 | 24 | # input and output arguments 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--train_data", type=str, help="path to input train data") 27 | parser.add_argument("--test_data", type=str, help="path to input test data") 28 | parser.add_argument("--learning_rate", required=False, default=0.1, type=float) 29 | args = parser.parse_args() 30 | print(" ".join(f"{k}={v}" for k, v in vars(args).items())) 31 | 32 | # read the data from the AzureML Blob Storage. This is a good way for the data used for this example, 33 | # but for your own data another approach might be better. 
Check here for more info: 34 | # https://learn.microsoft.com/en-us/azure/machine-learning/how-to-read-write-data-v2?view=azureml-api-2&tabs=cli 35 | train_df_no_scale = pd.read_csv(args.train_data, header=0) 36 | test_df_no_scale = pd.read_csv(args.test_data, header=0) 37 | 38 | train_data_and_labels = get_labels(train_df_no_scale, class_col_name='CLASS_POS') 39 | test_data_and_labels = get_labels(test_df_no_scale, class_col_name='CLASS_POS') 40 | 41 | # get transformer for data preparation: 42 | # normalization, removing nans from dataset(important for back propagation), 43 | _, transformer = get_transformer(train_data_and_labels) 44 | 45 | # build classifier and find best training parameters 46 | clf, grid_search = build_et_classifier(train_data_and_labels, transformer) 47 | print(grid_search.best_params_['n_estimators']) 48 | print(grid_search.best_params_['max_depth']) 49 | print(str(grid_search.best_params_['class_weight'])) 50 | 51 | # Train and evaluate the model. 52 | clf.fit(train_data_and_labels[1], train_data_and_labels[0].ravel()) 53 | 54 | # Evaluate the trained classifier using test data. Output can be found in the logs of the AzureML job run. 55 | y_pred = test_eval(test_data_and_labels, clf) 56 | 57 | # Save the trained model and register it with AzureML Workspace 58 | mlflow.sklearn.log_model( 59 | sk_model=clf, 60 | registered_model_name="registered_model_name_sklearn", 61 | artifact_path="./outputs/model/sklearn_model_sklearn_save" 62 | ) 63 | 64 | # get class labels from dataset 65 | def get_labels(df, class_col_name): 66 | y = df.loc[:, class_col_name] 67 | X_data = df.loc[:, df.columns != class_col_name] 68 | return [y, X_data] 69 | 70 | # get transformer and train for data preprocessing 71 | def get_transformer(data_and_labels): 72 | transformer = Pipeline([ 73 | ('imputer', SimpleImputer(strategy="median")), 74 | ('scaler', StandardScaler()) 75 | ]) 76 | train_df_transformed = transformer.fit_transform(data_and_labels[1]) 77 | return train_df_transformed, transformer 78 | 79 | 80 | def build_et_classifier(data_and_labels, transformer): 81 | y = data_and_labels[0] 82 | X_data = data_and_labels[1] 83 | X_data = transformer.transform(X_data) 84 | 85 | # Create classifier 86 | clf = ExtraTreesClassifier(n_jobs=-1) 87 | 88 | # Specify parameter search grid 89 | # The grid size is kept small to reduce the computation time 90 | # Good values (known from offline grid search) are: 91 | # 'n_estimators': 61 92 | # 'max_depth': 10 93 | # 'class_weight': {{0: 1, 1: 89}} 94 | param_grid = [ 95 | {'n_estimators': [30, 61], 96 | 'max_depth': [5, 10], 97 | 'class_weight': [{0: 1, 1: 30}, {0: 1, 1: 50}, {0: 1, 1: 89}]} 98 | ] 99 | 100 | ida_scorer = make_scorer(ida_score) 101 | 102 | # Search for optimal values in grid using 5-fold cross validation 103 | grid_search = GridSearchCV(clf, param_grid, cv=5, scoring=ida_scorer, n_jobs=-1) 104 | grid_search.fit(X_data, y.values.ravel()) 105 | 106 | # Create new model with optimal parameter values 107 | clf = ExtraTreesClassifier(n_jobs=-1, 108 | n_estimators=grid_search.best_params_['n_estimators'], 109 | max_depth=grid_search.best_params_['max_depth'], 110 | class_weight=grid_search.best_params_['class_weight']) 111 | 112 | # fuse the classifier and the transformer into one pipeline. 113 | # This guarantees the preprocessing stays the same for each use of the model. 
114 | model = Pipeline([ 115 | ('transform', transformer), 116 | ('clf', clf) 117 | ]) 118 | 119 | return model, grid_search 120 | 121 | # Evaluate the trained model 122 | def test_eval(data_and_labels, clf): 123 | y = data_and_labels[0] 124 | X_data = data_and_labels[1] 125 | 126 | # Predict classes of test data 127 | y_pred = clf.predict(X_data) 128 | 129 | # Examine the results 130 | confusion_mat = confusion_matrix(y, y_pred) 131 | confusion_matrix_df = pd.DataFrame(confusion_mat, 132 | index=['actual neg', 'actual pos'], 133 | columns=['predicted neg', 'predicted pos']) 134 | 135 | print("Total Cost:", - ida_score(y, y_pred), "\n") 136 | print("Confusion Matrix:\n", confusion_matrix_df) 137 | 138 | 139 | # Define scoring metric for grid search from problem description of the Scania Trucks dataset 140 | def ida_score(y, y_pred): 141 | false_preds = y - y_pred 142 | num_false_pos = (false_preds < 0).sum() 143 | num_false_neg = (false_preds > 0).sum() 144 | return -(num_false_pos * 10 + num_false_neg * 500) 145 | 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /tutorials/machine-learning/python/AzureML/score.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import json 4 | import numpy 5 | import joblib 6 | import mlflow 7 | 8 | 9 | def init(): 10 | """ 11 | This function is called when the container is initialized/started, typically after create/update of the deployment. 12 | You can write the logic here to perform init operations like caching the model in memory 13 | """ 14 | global model 15 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 16 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 17 | model_path = os.path.join( 18 | os.getenv("AZUREML_MODEL_DIR"), "sklearn_model_sklearn_save/" 19 | ) 20 | # deserialize the model file back into a sklearn model 21 | model = mlflow.sklearn.load_model(model_path) 22 | logging.info("Init complete") 23 | 24 | 25 | def run(raw_data): 26 | """ 27 | This function is called for every invocation of the endpoint to perform the actual scoring/prediction. 28 | In the example we extract the data from the json input and call the scikit-learn model's predict() 29 | method and return the result back. 30 | raw_data: The json input received by the endpoint. 
needs to include a "data" field with a table that holds the 42 31 | features of each item to be classified 32 | these correspond to the following column names from the IDA tables: 33 | ['AA_000', 'AG_005', 'AH_000', 'AL_000', 'AM_0', 'AN_000', 'AO_000', 'AP_000', 'AQ_000', 34 | 'AZ_004', 'BA_002', 'BB_000', 'BC_000', 'BD_000', 'BE_000', 35 | 'BF_000', 'BG_000', 'BH_000', 'BI_000', 'BJ_000', 'BS_000', 'BT_000', 'BU_000', 'BV_000', 36 | 'BX_000', 'BY_000', 'BZ_000', 'CA_000', 'CB_000', 'CC_000', 'CI_000', 'CN_004', 'CQ_000', 37 | 'CS_001', 'DD_000', 'DE_000', 'DN_000', 'DS_000', 'DU_000', 'DV_000', 'EB_000', 'EE_005'] 38 | """ 39 | logging.error(json.loads(raw_data)) 40 | json_in = json.loads(raw_data) 41 | data = json_in["data"] 42 | 43 | data = numpy.array(data) 44 | response = model.predict(data) 45 | return {"result": str(response)} 46 | 47 | 48 | -------------------------------------------------------------------------------- /tutorials/machine-learning/python/README.md: -------------------------------------------------------------------------------- 1 | ## Python Tutorials 2 | This section contains tutorials with the Python Programming Language. We are going to provide examples for different frameworks, tasks and use cases. 3 | 4 | ### AzureML: 5 | [AzureML](https://azure.microsoft.com/de-de/products/machine-learning) is a Microsoft service for the Machine 6 | learning lifecycle in Azure. 7 | 8 | This tutorial will show you: 9 | 10 | * [A general introduction to the topic](AzureML/Introduction.ipynb), we recommend you start here 11 | * [How to connect AzureML to Exasol](AzureML/ConnectAzureMLtoExasol.ipynb) 12 | * [How to Train a model using data from Exasol](AzureML/TrainModelInAzureML.ipynb) 13 | * [How to Invoke the trained model from an Exasol UDF](AzureML/InvokeModelFromExasolDBwithUDF.ipynb) 14 | 15 | 16 | ### Frameworks: 17 | 18 | * [Scikit-learn](scikit-learn): 19 | 20 | [Scikit-learn](https://scikit-learn.org/stable/) is a free software machine learning library for the Python 21 | programming language. It features various classification, regression and clustering algorithms including support 22 | vector machines, random forests, gradient boosting, k-means and DBSCAN, and is designed to interoperate with the 23 | Python numerical and scientific libraries NumPy and SciPy. Its scalability of the training is typically limited. 24 | Out-of-core learning is not for all algorithms available, such that the usage of these algorithms is limited by the 25 | available main memory. Scikit-learn supports parallel execution through python multi-processing and linear algebra 26 | libraries. Distributed training and GPU acceleration is not out of the box available. You can find more details about 27 | scalability [here](https://scikit-learn.org/stable/modules/computing.html). 28 | 29 | * [AWS Sagemaker](sagemaker) 30 | 31 | [AWS Sagemaker](https://aws.amazon.com/de/sagemaker/) is an AWS cloud service for machine learning. In contains 32 | hosted [Jupyter notebooks](https://jupyter.org/) but also 33 | a [SDK for machine learning](https://sagemaker.readthedocs.io/en/stable/). 
34 | 35 | This tutorial will show you: 36 | 37 | * [How to connect from a SageMaker Notebook to Exasol](sagemaker/ConnectSagemakerToExasol.ipynb) 38 | * [How to load example dataset](sagemaker/LoadExampleDataIntoExasol.ipynb) 39 | * [How to train a Sagemaker model with data from Exasol](sagemaker/TrainSagemakerModelWithExasolData.ipynb) 40 | * [How to use a Sagemaker model from inside of Exasol](sagemaker/UseSagemakerModelFromExasol.ipynb) 41 | 42 | ### Prerequisites: 43 | 44 | For general prerequisites, please refer to [Prerequisites](../README.md). However, these tutorials typically need a specific flavor of the [Script Language Container](https://github.com/exasol/script-languages) which has the required dependencies installed. For these purposes, we provide the python3-ds-* flavors which already contain the dependencies for the frameworks used in these tutorials. Prepackaged releases for this flavor can be found on the [release page](https://github.com/exasol/script-languages/releases). 45 | -------------------------------------------------------------------------------- /tutorials/machine-learning/python/sagemaker/ConnectSagemakerToExasol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Connect to Exasol from Sagemaker\n", 8 | "\n", 9 | "This example shows you how to connect from AWS Sagemaker to an Exasol database.\n", 10 | "\n", 11 | "First we install pyexasol, as a driver." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "Requirement already satisfied: pyexasol in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (0.14.3)\n", 24 | "Requirement already satisfied: rsa in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyexasol) (4.5)\n", 25 | "Requirement already satisfied: websocket-client>=0.47.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyexasol) (0.57.0)\n", 26 | "Requirement already satisfied: pyasn1>=0.1.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from rsa->pyexasol) (0.4.8)\n", 27 | "Requirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from websocket-client>=0.47.0->pyexasol) (1.14.0)\n", 28 | "\u001B[33mWARNING: You are using pip version 20.0.2; however, version 20.3.1 is available.\n", 29 | "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001B[0m\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "!pip install pyexasol\n", 35 | "import pyexasol" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Now let's connect:" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import pyexasol\n", 52 | "EXASOL_HOST = \"\" # change\n", 53 | "EXASOL_PORT = \"8563\" # change if needed\n", 54 | "EXASOL_CONNECTION = \"{host}:{port}\".format(host=EXASOL_HOST, port=EXASOL_PORT)\n", 55 | "EXASOL_USER = \"sys\" # change if needed\n", 56 | "EXASOL_PASSWORD = \"\" # change\n", 57 | "EXASOL_SCHEMA = \"IDA\" # change if needed\n", 58 | "exasol = pyexasol.connect(dsn=EXASOL_CONNECTION, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True)" 59 | ] 60 | }, 61 | 
{ 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "... and run an example query:" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/html": [ 76 | "
\n", 77 | "\n", 90 | "\n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | "
USER_NAMECREATEDUSER_CONSUMER_GROUPUSER_COMMENT
0SYSNaNSYS_CONSUMER_GROUPSYS is the system user and possesses universal...
\n", 110 | "
" 111 | ], 112 | "text/plain": [ 113 | " USER_NAME CREATED USER_CONSUMER_GROUP \\\n", 114 | "0 SYS NaN SYS_CONSUMER_GROUP \n", 115 | "\n", 116 | " USER_COMMENT \n", 117 | "0 SYS is the system user and possesses universal... " 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "users = exasol.export_to_pandas(\"SELECT * FROM SYS.EXA_ALL_USERS\")\n", 127 | "users" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "conda_python3", 134 | "language": "python", 135 | "name": "conda_python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.6.10" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 4 152 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/sagemaker/LoadExampleDataIntoExasol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load Example Data Into the Exasol database\n", 8 | "\n", 9 | "In this Notebook we will load the \"Air pressure system failures in Scania trucks\" dataset into the exasol database using Python and Pyexasol. This Scania trucks dataset is a predictive maintenance scenario:\n", 10 | "\n", 11 | "> The dataset consists of data collected from heavy Scania trucks in everyday usage. The system in focus is the Air Pressure system (APS) which generates pressurized air that is utilized in various functions in a truck, such as braking and gear changes. The datasets' positive class consists of component failures for a specific component of the APS system. The negative class consists of trucks with failures for components not related to the APS. The data consists of a subset of all available data, selected by experts.\n", 12 | "\n", 13 | "You can find further information [here](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge)." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "source": [ 19 | "For this we need:\n", 20 | "\n", 21 | " - Connection information of the running Exasol database we want to load the data into.\n", 22 | " - The url of the dataset we want to load (and knowledge of its structure).\n", 23 | "\n", 24 | "\n", 25 | "First we enter the connection details for the Exasol database we want to load the dataset into.\n", 26 | "Then we install pyexasol and import some dependencies." 
27 | ], 28 | "metadata": { 29 | "collapsed": false, 30 | "pycharm": { 31 | "name": "#%% md\n" 32 | } 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "EXASOL_HOST = \"\" # change, in case of Exasol Saas this can be a \"connection string\"\n", 42 | "EXASOL_PORT = \"8563\" # change if needed\n", 43 | "EXASOL_USER = \"sys\" # change if needed\n", 44 | "EXASOL_PASSWORD = \"\" # change, in case of Exasol Saas this can be a personal access token\n", 45 | "EXASOL_SCHEMA = \"IDA\"" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "!pip install pyexasol\n", 55 | "\n", 56 | "import pyexasol\n", 57 | "from io import BytesIO\n", 58 | "from urllib.request import urlopen\n", 59 | "import pandas as pd\n", 60 | "from zipfile import ZipFile" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "Next we can use the pyexasol connection to connect to our Exasol DB." 67 | ], 68 | "metadata": { 69 | "collapsed": false, 70 | "pycharm": { 71 | "name": "#%% md\n" 72 | } 73 | } 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "EXASOL_CONNECTION = \"{host}:{port}\".format(host=EXASOL_HOST, port=EXASOL_PORT)\n", 82 | "exasol = pyexasol.connect(dsn=EXASOL_CONNECTION, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Download Example Data\n", 90 | "\n", 91 | "Now we download the dataset and write it to a zip-file." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "DATA_URL = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00414/to_uci.zip\"\n", 101 | "\n", 102 | "resp = urlopen(DATA_URL)\n", 103 | "with open('to_uci.zip', 'wb') as f: \n", 104 | " f.write(resp.read())\n", 105 | " \n", 106 | "print(\"data downloaded\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "And then we read the contents of the downloaded zip-file into \"train_set\" and \"test_set\" variables respectively, using pandas to load the train- and test-tables from the csv files." 113 | ], 114 | "metadata": { 115 | "collapsed": false, 116 | "pycharm": { 117 | "name": "#%% md\n" 118 | } 119 | } 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "TRAINING_FILE = \"to_uci/aps_failure_training_set.csv\"\n", 128 | "TEST_FILE = \"to_uci/aps_failure_test_set.csv\"\n", 129 | "\n", 130 | "# Data is preceded with a 20-line header (copyright & license)\n", 131 | "NUM_SKIP_ROWS = 20\n", 132 | "NA_VALUE = \"na\"\n", 133 | "\n", 134 | "with ZipFile('to_uci.zip') as z:\n", 135 | " with z.open(TRAINING_FILE, \"r\") as f:\n", 136 | " train_set = pd.read_csv(f, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)\n", 137 | " with z.open(TEST_FILE, \"r\") as f:\n", 138 | " test_set = pd.read_csv(f, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## Import Example Data\n", 146 | "\n", 147 | "In the last step we want to load the dataset into the exasol database. First we need to create a new schema \"EXASOL_SCHEMA\" using the pyexasol connection." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "exasol.execute(query=\"CREATE SCHEMA IF NOT EXISTS {schema!i}\", query_params={\"schema\": EXASOL_SCHEMA})" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "source": [ 162 | "Then we need to create the \"EXASOL_SCHEMA.TRAIN\" and \"EXASOL_SCHEMA.TEST\" tables in the Exasol database with column names and types that match the tables from the data set. We do this by extracting the column names from the pandas table we created in the previous step. The column types for the Scania Trucks data set are VARCHAR(3) for the first column (\"class\"), and DECIMAL(18,2) for all other columns. We use the pyexasol connection we created previously to create these tables." 163 | ], 164 | "metadata": { 165 | "collapsed": false, 166 | "pycharm": { 167 | "name": "#%% md\n" 168 | } 169 | } 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# Define column names and types\n", 178 | "column_names = list(train_set.columns)\n", 179 | "column_types = [\"VARCHAR(3)\"] + [\"DECIMAL(18,2)\"] * (len(column_names) - 1)\n", 180 | "column_desc = [\" \".join(t) for t in zip(column_names, column_types)]\n", 181 | "\n", 182 | "params = {\"schema\": EXASOL_SCHEMA, \"column_names\": column_names, \"column_desc\": column_desc}\n", 183 | "\n", 184 | "# Create tables for data\n", 185 | "exasol.execute(query=\"CREATE OR REPLACE TABLE {schema!i}.TRAIN(\" + \", \".join(column_desc) + \")\", query_params=params)\n", 186 | "exasol.execute(query=\"CREATE OR REPLACE TABLE {schema!i}.TEST LIKE {schema!i}.TRAIN\", query_params=params)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "source": [ 192 | "Finally, we can use pyexasol's \"import_from_pandas\" functionality to import our pandas tables into our newly created Exasol tables using the pyexasol connection." 193 | ], 194 | "metadata": { 195 | "collapsed": false, 196 | "pycharm": { 197 | "name": "#%% md\n" 198 | } 199 | } 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "outputs": [], 205 | "source": [ 206 | "# Import data into Exasol\n", 207 | "exasol.import_from_pandas(train_set, (EXASOL_SCHEMA, \"TRAIN\"))\n", 208 | "print(f\"Imported {exasol.last_statement().rowcount()} rows into TRAIN.\")\n", 209 | "exasol.import_from_pandas(test_set, (EXASOL_SCHEMA, \"TEST\"))\n", 210 | "print(f\"Imported {exasol.last_statement().rowcount()} rows into TEST.\")" 211 | ], 212 | "metadata": { 213 | "collapsed": false, 214 | "pycharm": { 215 | "name": "#%%\n" 216 | } 217 | } 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "source": [ 222 | "Now te Scania Trucks dataset should be available in the Exasol database in the Schema \"EXASOL_SCHEMA\" sorted into the \"TRAIN\" and the \"TEST\" tables." 
223 | ], 224 | "metadata": { 225 | "collapsed": false, 226 | "pycharm": { 227 | "name": "#%% md\n" 228 | } 229 | } 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "conda_python3", 235 | "language": "python", 236 | "name": "conda_python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.6.10" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 4 253 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/sagemaker/UseSagemakerModelFromExasol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true, 7 | "pycharm": { 8 | "name": "#%% md\n" 9 | } 10 | }, 11 | "source": [ 12 | "# Use an AWS Sagemaker model from within Exasol\n", 13 | "\n", 14 | "In this notebook we will use an AWS Sagemaker model for predicitions from within Exasol queries.\n", 15 | "\n", 16 | "For that our exasol database needs permissions to use the Sagemaker inference Notebook.\n", 17 | "For that you can:\n", 18 | "\n", 19 | "* Provide credentials\n", 20 | "* Grant the permissions to the Role of the databases EC2 role.\n", 21 | "\n", 22 | "In this guide we will use the second approach.\n", 23 | "\n", 24 | "Grant the following permissions to your EC2 instance role:\n", 25 | "\n", 26 | "* `sts:AssumeRole` with a resource filter for the EC2 role itself.\n", 27 | "* `sagemaker:InvokeEndpoint` with a resource filter on your Sagemaker endpoint.\n", 28 | "\n", 29 | "In case you want to take the first approach, you can modify the UDF code below to use credentials." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "source": [ 35 | "## Parameters" 36 | ], 37 | "metadata": { 38 | "collapsed": false 39 | } 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "outputs": [], 45 | "source": [ 46 | "EXASOL_HOST = \"\" # change\n", 47 | "EXASOL_PORT = \"8563\" # change if needed\n", 48 | "EXASOL_CONNECTION = \"{host}:{port}\".format(host=EXASOL_HOST, port=EXASOL_PORT)\n", 49 | "EXASOL_USER = \"sys\" # change if needed\n", 50 | "EXASOL_PASSWORD = \"\" # change\n", 51 | "EXASOL_SCHEMA = \"IDA\" # change if needed\n", 52 | "EXASOL_CLUSTER_ROLE = \"\" #change\n", 53 | "EXASOL_REGION = \"eu-central-1\" #change if needed\n", 54 | "ENDPOINT_NAME = \"\" #change" 55 | ], 56 | "metadata": { 57 | "collapsed": false, 58 | "pycharm": { 59 | "name": "#%%\n" 60 | } 61 | } 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "## Setup" 67 | ], 68 | "metadata": { 69 | "collapsed": false, 70 | "pycharm": { 71 | "name": "#%% md\n" 72 | } 73 | } 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "outputs": [], 79 | "source": [ 80 | "!pip install pyexasol\n", 81 | "\n", 82 | "import pyexasol\n", 83 | "import pandas as pd\n", 84 | "exasol = pyexasol.connect(dsn=EXASOL_CONNECTION, user=EXASOL_USER, password=EXASOL_PASSWORD, compression=True)" 85 | ], 86 | "metadata": { 87 | "collapsed": false, 88 | "pycharm": { 89 | "name": "#%%\n" 90 | } 91 | } 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "source": [ 96 | "## Install UDF\n", 97 | "\n", 98 | "In order to use the Sagemaker inference Endpoint from within the Exasol database, we will create a Python UDF that does API calls to the endpoint with the data from the query." 99 | ], 100 | "metadata": { 101 | "collapsed": false 102 | } 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "outputs": [], 108 | "source": [ 109 | "# create schema\n", 110 | "exasol.execute(\"CREATE SCHEMA IF NOT EXISTS DATA_SCIENCE\")\n", 111 | "\n", 112 | "#create UDF\n", 113 | "exasol.execute(\"\"\"\n", 114 | "CREATE OR REPLACE PYTHON3 SET SCRIPT DATA_SCIENCE.PREDICT(...) 
EMITS(id DECIMAL(20,0), \"result\" BOOLEAN) AS\n", 115 | "def run(ctx):\n", 116 | " import boto3\n", 117 | " import pandas as pd\n", 118 | " import os\n", 119 | " f = open(\"/tmp/.config\", \"w\")\n", 120 | " f.write(\n", 121 | " \"[default]\\\\nregion = {region!r}\\\\nrole_arn = {role!r}\\\\ncredential_source = Ec2InstanceMetadata\")\n", 122 | " f.close()\n", 123 | " os.environ['AWS_CONFIG_FILE'] = '/tmp/.config'\n", 124 | " while True:\n", 125 | " df = ctx.get_dataframe(1000)\n", 126 | " if df is None:\n", 127 | " break\n", 128 | " id_column = df[\"0\"]\n", 129 | " df = df.drop(\"0\", 1)\n", 130 | " client = boto3.client('sagemaker-runtime')\n", 131 | " endpoint_name = \"{endpoint_name!r}\"\n", 132 | " response = client.invoke_endpoint(\n", 133 | " EndpointName=endpoint_name,\n", 134 | " ContentType='text/csv',\n", 135 | " Body=df.to_csv(header=False, index=False)\n", 136 | " )\n", 137 | " result_list = response['Body'].read().decode('ascii').split(\",\")\n", 138 | " rounded_result = map(lambda x: bool(round(float(x))),result_list)\n", 139 | " result = pd.DataFrame(list(rounded_result))\n", 140 | " ctx.emit(pd.concat([id_column,result],axis=1))\n", 141 | "/\n", 142 | "\"\"\", {\n", 143 | " \"region\": EXASOL_REGION,\n", 144 | " \"role\": EXASOL_CLUSTER_ROLE,\n", 145 | " \"endpoint_name\": ENDPOINT_NAME\n", 146 | "})" 147 | ], 148 | "metadata": { 149 | "collapsed": false, 150 | "pycharm": { 151 | "name": "#%%\n" 152 | } 153 | } 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "source": [ 158 | "## Run Query\n", 159 | "\n", 160 | "So let's run predictions on the test data table in Exasol." 161 | ], 162 | "metadata": { 163 | "collapsed": false 164 | } 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "outputs": [], 170 | "source": [ 171 | "all_columns = exasol.export_to_pandas(\"SELECT * FROM \" + EXASOL_SCHEMA + \".TEST LIMIT 1;\")\n", 172 | "column_names = list(all_columns)\n", 173 | "column_names.remove(\"CLASS\")\n", 174 | "result = exasol.export_to_pandas(\"\"\"SELECT CLASS = 'pos' as \"expected\", \"result\" FROM (\n", 175 | " SELECT DATA_SCIENCE.PREDICT(ROWID, {columns_without_class!q}) FROM IDA.TEST t) r\n", 176 | " JOIN IDA.TEST o ON r.ID = o.ROWID\"\"\", {\"columns_without_class\": column_names})\n", 177 | "pd.crosstab(index=result['expected'], columns=result[\"result\"], rownames=['actuals'], colnames=['predictions'])" 178 | ], 179 | "metadata": { 180 | "collapsed": false, 181 | "pycharm": { 182 | "name": "#%%\n" 183 | } 184 | } 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 2 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython2", 203 | "version": "2.7.6" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 0 208 | } -------------------------------------------------------------------------------- /tutorials/machine-learning/python/scikit-learn/README.md: -------------------------------------------------------------------------------- 1 | ## Scikit-learn Python Tutorials 2 | 3 | This section contains tutorials with Scikit-learn in Python. We are going to provide examples for different tasks and use cases. 
4 | 
5 | **Currently, this repository is under development and we will add more and more tutorials in the future.**
6 | 
7 | ### Overview
8 | 
9 | * [Classification](classification.ipynb)
10 | 
-------------------------------------------------------------------------------- /tutorials/machine-learning/sagemaker-extension/images/sme_deployment.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/sagemaker-extension/images/sme_deployment.png
-------------------------------------------------------------------------------- /tutorials/machine-learning/sagemaker-extension/images/sme_overview.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/sagemaker-extension/images/sme_overview.png
-------------------------------------------------------------------------------- /tutorials/machine-learning/sagemaker-extension/images/sme_training.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/exasol/data-science-examples/06c75661c3a6a9bead446c87dd5b9fd4bd642614/tutorials/machine-learning/sagemaker-extension/images/sme_training.png
-------------------------------------------------------------------------------- /tutorials/machine-learning/sagemaker-extension/tutorial.md: --------------------------------------------------------------------------------
1 | # SageMaker Extension Tutorial
2 | 
3 | ## 1. Introduction
4 | This tutorial walks you through the setup of the Exasol SageMaker-Extension
5 | project and presents a use case of how this extension can be used in Exasol.
6 | 
7 | The Exasol Sagemaker Extension enables you to develop an end-to-end machine
8 | learning project on data stored in Exasol using the AWS SageMaker Autopilot service.
9 | 
10 | The use case handles a publicly available real-world dataset provided by a heavy
11 | truck manufacturer (see [Use Case](#use-case)). With the
12 | provided extension, a machine learning model is developed which allows
13 | predicting whether the truck failures are related to a particular component.
14 | 
15 | ### 1.1 AWS Sagemaker Autopilot Service
16 | AWS SageMaker is an AWS public cloud service in which users can build and deploy
17 | machine learning models. SageMaker provides a number of levels of abstraction to
18 | users while developing machine learning models. At one of its highest levels
19 | of abstraction, SageMaker enables users to use an Automated machine learning
20 | (AutoML) service, called Autopilot in AWS, that automates the process of
21 | applying machine learning to real-world problems.
22 | 
23 | Autopilot covers a complete pipeline of developing an end-to-end machine learning
24 | project, from raw data to a deployable model. It is able to automatically build,
25 | train and tune a number of machine learning models by inspecting your data set.
26 | In this way, the following tasks, which are repeatedly applied by ML experts
27 | in machine learning projects, are automated (a minimal sketch of the underlying API call is shown after the list):
28 | - Pre-process and clean the data.
29 | - Perform feature engineering and select the most appropriate features.
30 | - Determine the most appropriate ML algorithm.
31 | - Tune and optimize the hyper-parameters of the model.
32 | - Post-process machine learning models.
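To make this concrete, the sketch below shows roughly what such an automated job submission looks like at the API level. It is a hypothetical, simplified `boto3` example and not part of the tutorial's code: the job name, S3 locations and IAM role are placeholders, and the Exasol extension described in the next section builds a comparable request for you from the parameters of its training script.

```python
import boto3

# Hypothetical sketch of an Autopilot (AutoML) job submission; all names below
# are placeholders. The SageMaker-Extension issues a similar request internally.
sagemaker = boto3.client("sagemaker", region_name="eu-central-1")

sagemaker.create_auto_ml_job(
    AutoMLJobName="example-autopilot-job",              # placeholder job name
    InputDataConfig=[{
        "DataSource": {"S3DataSource": {
            "S3DataType": "S3Prefix",
            "S3Uri": "s3://example-bucket/train/",       # placeholder S3 input
        }},
        "TargetAttributeName": "CLASS",                  # column to predict
    }],
    OutputDataConfig={"S3OutputPath": "s3://example-bucket/output/"},
    AutoMLJobConfig={"CompletionCriteria": {"MaxCandidates": 2}},
    RoleArn="arn:aws:iam::123456789012:role/ExampleSageMakerRole",
)

# Autopilot then pre-processes the data, engineers features, picks an algorithm
# and tunes its hyper-parameters; progress can be polled like this:
job = sagemaker.describe_auto_ml_job(AutoMLJobName="example-autopilot-job")
print(job["AutoMLJobStatus"], job["AutoMLJobSecondaryStatus"])
```

The point of the extension is that you never have to write this call yourself; the sketch only illustrates what Autopilot automates.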
33 | 
34 | The Exasol Sagemaker Extension leverages these advantages of AWS Autopilot and enables
35 | users to easily create effective and efficient machine learning models
36 | without expert knowledge.
37 | 
38 | ### 1.2 Exasol SageMaker Extension
39 | 
40 | The Exasol Sagemaker Extension provides a Python library together with Exasol
41 | Scripts and UDFs that train Machine Learning Models on data stored in Exasol
42 | using the AWS SageMaker Autopilot service.
43 | 
44 | The extension basically exports a given Exasol table into AWS S3, and then
45 | triggers Machine Learning training using the AWS Autopilot service with the
46 | specified parameters. In addition, the training status can be polled using
47 | the auxiliary scripts provided within the scope of the project. In order to
48 | perform prediction on a trained Autopilot model, one of the methods is to
49 | deploy the model to a real-time AWS endpoint. This extension provides Lua
50 | scripts for creating/deleting the real-time endpoint and creates a model-specific
51 | UDF script for making real-time predictions. The following figure
52 | gives an overview of this solution.
53 | 
54 | ![SME Overview](./images/sme_overview.png)
55 | 
56 | ## 2. Setup the Extension
57 | 
58 | ### 2.1 Installation
59 | 
60 | In order to use the Exasol SageMaker Extension, it is necessary to install the Python package of the extension,
61 | upload the given SageMaker-Extension Container into
62 | BucketFS and then activate the uploaded container in Exasol. The pre-packaged
63 | releases are available in the [Releases](https://github.com/exasol/sagemaker-extension/releases) section
64 | of the GitHub repository.
65 | 
66 | Before starting the installation, let's define the variables required for the
67 | installation (Please note that you need to change the variables below to use your
68 | own Exasol Database):
69 | ```python
70 | DATABASE_HOST="127.0.0.1"
71 | DATABASE_PORT=9563
72 | DATABASE_USER="sys"
73 | DATABASE_PASSWORD="exasol"
74 | DATABASE_SCHEMA="IDA"
75 | BUCKETFS_PORT=6666
76 | BUCKETFS_USER="w"
77 | BUCKETFS_PASSWORD="write"
78 | BUCKETFS_NAME="bfsdefault"
79 | BUCKET_NAME="default"
80 | PATH_IN_BUCKET="container"
81 | CONTAINER_NAME="exasol_sagemaker_extension_container-release"
82 | CONTAINER_FILE="exasol_sagemaker_extension_container-release.tar.gz"
83 | ```
84 | 
85 | - The sagemaker-extension Python package provides a command line tool to
86 | deploy the Lua and UDF scripts to the database. It is installed as follows
87 | (Please check [the latest release](https://github.com/exasol/sagemaker-extension/releases/latest)):
88 | ```sh
89 | pip install https://github.com/exasol/sagemaker-extension/releases/download/<version>/exasol_sagemaker_extension-<version>-py3-none-any.whl
90 | ```
91 | 
92 | - The required libraries and dependencies of the Exasol SageMaker Extension are
93 | distributed into Exasol by uploading the pre-built Exasol SageMaker-Extension Language
94 | Container to the BucketFS. You can upload it with any HTTP(S) client that can send
95 | files via HTTP PUT requests. For more details please check
96 | [Access Files in BucketFS](https://docs.exasol.com/database_concepts/bucketfs/file_access.htm).
97 | The following example uploads the pre-built SageMaker-Extension Container to BucketFS with the curl command, a http(s) client: 98 | ```sh 99 | curl -vX PUT -T \ 100 | "" 101 | "http://w:@$bucketfs_host://" 102 | ``` 103 | 104 | - You need to activate the uploaded container for your session or the whole system through 105 | adjusting parameter `SCRIPT_LANGUAGES`. Please keep in mind, that 106 | the name of the language alias is assumed to be `PYTHON_SME` in the 107 | SageMaker-Extension. For more details, please check 108 | [Adding New Packages to Existing Script Languages](https://docs.exasol.com/database_concepts/udf_scripts/adding_new_packages_script_languages.htm). 109 | The following example query activates the container session-wide: 110 | ```sh 111 | ALTER SESSION SET SCRIPT_LANGUAGES=\ 112 | 'PYTHON_SME=localzmq+protobuf://////?\ 113 | lang=python#buckets////\ 114 | exaudf/exaudfclient_py3 PYTHON3=builtin_python3 PYTHON=builtin_python R=builtin_r JAVA=builtin_java' 115 | 116 | ``` 117 | 118 | ### 2.2 Deployment 119 | 120 | The installed SageMaker-extension python package provides a command-line 121 | interface (CLI), enabling you to deploy all necessary Lua and UDF scripts into 122 | the specified `DATABASE_SCHEMA` of Exasol Database. The command line is run 123 | as follows: 124 | 125 | ```sh 126 | python -m exasol_sagemaker_extension.deployment.deploy_cli \ 127 | --host \ 128 | --port \ 129 | --user \ 130 | --pass \ 131 | --schema 132 | ``` 133 | 134 | After running this deployment command, you should be able to find all the 135 | required Lua and UDF scripts in the specified schema. To check this, you can 136 | run the following query: 137 | ```sql 138 | SELECT 139 | SCRIPT_NAME , 140 | SCRIPT_TYPE 141 | FROM 142 | SYS.EXA_ALL_SCRIPTS 143 | WHERE 144 | SCRIPT_SCHEMA='IDA'; 145 | ``` 146 | 147 | |SCRIPT_NAME |SCRIPT_TYPE| 148 | |---------------------------------------|-----------| 149 | |SME_TRAIN_WITH_SAGEMAKER_AUTOPILOT |SCRIPTING | 150 | |SME_AUTOPILOT_TRAINING_UDF |UDF | 151 | |SME_POLL_SAGEMAKER_AUTOPILOT_JOB_STATUS|SCRIPTING | 152 | |SME_AUTOPILOT_JOB_STATUS_POLLING_UDF |UDF | 153 | |SME_DEPLOY_SAGEMAKER_AUTOPILOT_ENDPOINT|SCRIPTING | 154 | |SME_AUTOPILOT_ENDPOINT_DEPLOYMENT_UDF |UDF | 155 | |SME_DELETE_SAGEMAKER_AUTOPILOT_ENDPOINT|SCRIPTING | 156 | |SME_AUTOPILOT_ENDPOINT_DELETION_UDF |UDF | 157 | 158 | ### 2.3 Create Connection to AWS 159 | 160 | The Exasol SageMaker Extension needs to connect to AWS SageMaker and your AWS S3 bucket. 161 | For that, it needs AWS credentials that has AWS Sagemaker Execution permissions. 162 | The required credentials are AWS Access Key (Please check how to 163 | [create an access key](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html#Using_CreateAccessKey)). 164 | 165 | 166 | In order for the SageMaker-Extension to use the Access Key, you need to create 167 | an Exasol `CONNECTION` object which securely stores your keys. For more information, 168 | please check [Create Connection in Exasol](https://docs.exasol.com/sql/create_connection.htm?Highlight=connection): 169 | 170 | 171 | Before creating the connection object, let's define the variables for the 172 | AWS connection (Please note that you need to use your own credentials for 173 | below variables.) 
174 | ```python
175 | AWS_BUCKET="ida_dataset_bucket"
176 | AWS_REGION="eu-central-1"
177 | AWS_KEY_ID="*****"
178 | AWS_ACCESS_KEY="*****"
179 | AWS_CONNECTION_NAME="AWS_CONNECTION"
180 | ```
181 | 
182 | The Exasol `CONNECTION` object is created as follows:
183 | ```sh
184 | CREATE OR REPLACE CONNECTION <AWS_CONNECTION_NAME>
185 | TO 'https://<AWS_BUCKET>.s3.<AWS_REGION>.amazonaws.com/'
186 | USER '<AWS_KEY_ID>'
187 | IDENTIFIED BY '<AWS_ACCESS_KEY>'
188 | ```
189 | 
190 | 
191 | ## 3. Use Case
192 | In this use case, the publicly available [Air pressure system failures in Scania trucks](https://archive.ics.uci.edu/ml/datasets/IDA2016Challenge)
193 | dataset is used. The dataset is provided by Scania CV AB as a challenge
194 | dataset in the Industrial Challenge at the [15th International Symposium on
195 | Intelligent Data Analysis (IDA)](https://ida2016.blogs.dsv.su.se/) in 2016.
196 | 
197 | The dataset consists of data collected from heavy Scania trucks in everyday usage. The dataset includes two different classes according to the Air Pressure System (APS): (1) The positive class
198 | consists of component failures for a specific component of the APS. (2) The negative class consists of trucks with failures for components not related to the APS.
199 | 
200 | Using the SageMaker-Extension, we develop a predictive machine learning model that classifies failures according to whether or not they are related to the APS.
201 | 
202 | ### 3.1 Load the Dataset
203 | The following Python script downloads the train and test datasets as CSV files
204 | to the local file system. Then it creates the `TRAIN` and `TEST` tables in the
205 | specified `DATABASE_SCHEMA` of Exasol and imports the downloaded CSV files
206 | into these tables respectively.
207 | 
208 | ```python
209 | import pyexasol
210 | import pandas as pd
211 | from zipfile import ZipFile
212 | from urllib.request import urlopen
213 | 
214 | DATABASE_CONNECTION = "{host}:{port}".format(host=DATABASE_HOST, port=DATABASE_PORT)
215 | exasol = pyexasol.connect(
216 |     dsn=DATABASE_CONNECTION,
217 |     user=DATABASE_USER,
218 |     password=DATABASE_PASSWORD,
219 |     compression=True)
220 | 
221 | DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00414/to_uci.zip"
222 | TRAINING_FILE = "to_uci/aps_failure_training_set.csv"
223 | TEST_FILE = "to_uci/aps_failure_test_set.csv"
224 | 
225 | # Data is preceded with a 20-line header (copyright & license)
226 | NUM_SKIP_ROWS = 20
227 | NA_VALUE = "na"
228 | 
229 | # Download datasets as csv files
230 | resp = urlopen(DATA_URL)
231 | with open('to_uci.zip', 'wb') as f:
232 |     f.write(resp.read())
233 | with ZipFile('to_uci.zip') as z:
234 |     with z.open(TRAINING_FILE, "r") as f:
235 |         train_set = pd.read_csv(f, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)
236 |     with z.open(TEST_FILE, "r") as f:
237 |         test_set = pd.read_csv(f, skiprows=NUM_SKIP_ROWS, na_values=NA_VALUE)
238 | 
239 | # Create the schema if not exists
240 | exasol.execute(
241 |     query="CREATE SCHEMA IF NOT EXISTS {schema!i}",
242 |     query_params={"schema": DATABASE_SCHEMA})
243 | 
244 | # Define column names and types
245 | column_names = list(train_set.columns)
246 | column_types = ["VARCHAR(3)"] + ["DECIMAL(18,2)"] * (len(column_names) - 1)
247 | column_desc = [" ".join(t) for t in zip(column_names, column_types)]
248 | params = {
249 |     "schema": DATABASE_SCHEMA,
250 |     "column_names": column_names,
251 |     "column_desc": column_desc}
252 | 
253 | # Create tables for data
254 | exasol.execute(
255 |     query="CREATE OR REPLACE TABLE {schema!i}.TRAIN("
256 |           + ", ".join(column_desc) + ")",
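    # Note: {schema!i} is pyexasol's identifier placeholder; the value passed
    # via query_params (DATABASE_SCHEMA) is substituted and quoted as an SQL identifier.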
257 |     query_params=params)
258 | exasol.execute(
259 |     query="CREATE OR REPLACE TABLE {schema!i}.TEST "
260 |           "LIKE {schema!i}.TRAIN",
261 |     query_params=params)
262 | 
263 | # Import data into Exasol
264 | exasol.import_from_pandas(train_set, (DATABASE_SCHEMA, "TRAIN"))
265 | print(f"Imported {exasol.last_statement().rowcount()} rows into TRAIN.")
266 | exasol.import_from_pandas(test_set, (DATABASE_SCHEMA, "TEST"))
267 | print(f"Imported {exasol.last_statement().rowcount()} rows into TEST.")
268 | ```
269 | 
270 | ### 3.2 Train with SageMaker Autopilot
271 | 
272 | When you execute the SQL command to train a model, the Exasol SageMaker-Extension
273 | exports the specified table from the Exasol Database to your specified
274 | AWS S3 bucket. This export operation is highly efficient, as it is performed
275 | in parallel. After that, the execution script calls Amazon SageMaker Autopilot,
276 | which automatically performs end-to-end machine learning development,
277 | to build a model. The following figure illustrates this solution.
278 | 
279 | ![SME Training](./images/sme_training.png)
280 | 
281 | First, let's define the variables required to execute the training SQL command:
282 | ```python
283 | JOB_NAME="APSClassifier"
284 | IAM_SAGEMAKER_ROLE="*****"
285 | S3_BUCKET_URI="s3://"
286 | S3_OUTPUT_PATH="ida_dataset_path"
287 | INPUT_TABLE_NAME="TARGET"
288 | TARGET_COLUMN="CLASS"
289 | MAX_CANDIDATES=2
290 | ```
291 | 
292 | The following command exports the `TRAIN` table in the `DATABASE_SCHEMA` using
293 | the credentials stored in the `AWS_CONNECTION` into the AWS `S3_OUTPUT_PATH` and
294 | tells Autopilot to start a job with the given `JOB_NAME`. Please note that
295 | `JOB_NAME` must be unique to the corresponding account, and it is
296 | case-insensitive. In addition, the maximum number of candidate models is
297 | limited to 2 by an optional parameter called `max_candidates`. On the other hand,
298 | the optional parameters that are not set in this sample SQL command,
299 | such as `problem_type` and `objective`, will be inferred by Autopilot.
300 | For more information please check the [User Guide](https://github.com/exasol/sagemaker-extension/blob/main/doc/user_guide/user_guide.md).
301 | 
302 | 
303 | ```sh
304 | EXECUTE SCRIPT IDA."SME_TRAIN_WITH_SAGEMAKER_AUTOPILOT"(
305 | '{
306 |     "job_name" : "<JOB_NAME>",
307 |     "aws_credentials_connection_name" : "<AWS_CONNECTION_NAME>",
308 |     "aws_region" : "<AWS_REGION>",
309 |     "iam_sagemaker_role" : "<IAM_SAGEMAKER_ROLE>",
310 |     "s3_bucket_uri" : "<S3_BUCKET_URI>",
311 |     "s3_output_path" : "<S3_OUTPUT_PATH>",
312 |     "input_schema_name" : "<DATABASE_SCHEMA>",
313 |     "input_table_or_view_name" : "<INPUT_TABLE_NAME>",
314 |     "target_attribute_name" : "<TARGET_COLUMN>",
315 |     "max_candidates" : <MAX_CANDIDATES>
316 | }');
317 | ```
318 | 
319 | This SQL command does not wait for the job to finish; it completes its execution
320 | right after calling Autopilot. The metadata information of the created Autopilot
321 | job is saved into the `SME_METADATA_AUTOPILOT_JOBS` table. You can query this
322 | table as follows:
323 | ```sql
324 | SELECT
325 |     *
326 | FROM
327 |     IDA."SME_METADATA_AUTOPILOT_JOBS";
328 | ```
329 | 
330 | |DATETIME |JOB_NAME |AWS_CREDENTIALS_CONNECTION_NAME|S3_BUCKET_URI |S3_OUTPUT_PATH |TARGET_ATTRIBUTE_NAME|PROBLEM_TYPE|OBJECTIVE| ... |
331 | |---------------------------|-------------|-------------------------------|-----------------------|----------------|---------------------|------------|---------| --- |
332 | |2021-11-24-13.35.11.569000|APSClassifier|AWS_CONNECTION |s3://ida-dataset-bucket|ida_dataset_path|CLASS | | | ... 
| 333 | 334 | 335 | ### 3.3 Poll Training Status 336 | As mentioned in the above section, the training SQL script runs asynchronously. 337 | Therefore, you don't have to wait the training to finish. However, you can poll 338 | the status of the Autopilot training job with the polling SQL script provided 339 | by Exasol SageMaker-Extension. This SQL command takes the name of the job 340 | whose status will be queried, namely `JOB_NAME`, as input and returns the 341 | current status of the job. For more information please check the 342 | [User Guide](https://github.com/exasol/sagemaker-extension/blob/main/doc/user_guide/user_guide.md). 343 | You can execute the polling SQL command as follows: 344 | 345 | ```sh 346 | EXECUTE SCRIPT IDA."SME_POLL_SAGEMAKER_AUTOPILOT_JOB_STATUS"( 347 | '', 348 | '', 349 | '' 350 | ); 351 | ``` 352 | 353 | You can below see the sample results of this polling SQL command executed 354 | several times while the "APSClassifier" training job is running: 355 | 356 | |JOB_STATUS|JOB_SECONDARY_STATUS| 357 | |----------|--------------------| 358 | |InProgress|AnalyzingData | 359 | 360 | 361 | |JOB_STATUS|JOB_SECONDARY_STATUS| 362 | |----------|--------------------| 363 | |InProgress|FeatureEngineering | 364 | 365 | 366 | |JOB_STATUS|JOB_SECONDARY_STATUS| 367 | |----------|--------------------| 368 | |Completed |Completed | 369 | 370 | ### 3.4 Deploy Sagemaker Endpoint 371 | In order to perform prediction on a trained Autopilot model, one of the methods 372 | is to deploy the model to the real-time AWS SageMaker endpoint. You can use the 373 | deployment SQL command to create a real-time endpoint and deploy the best 374 | candidate model of the trained Autopilot jobs on it. The deployment SQL command 375 | additionally generates the prediction UDF script which is specific to the 376 | deployed endpoint so that you are able to perform real-time predictions. 377 | The following figure indicates this solution. 378 | 379 | ![SME Training](./images/sme_deployment.png) 380 | 381 | First, let's define the variables required to execute the deployment SQL command: 382 | ```python 383 | ENDPOINT_NAME="APSPredictor" 384 | INSTANCE_TYPE="ml.m5.large" 385 | INSTANCE_COUNT=1 386 | DATABASE_PRED_SCHEMA="IDAPrediction" 387 | ``` 388 | 389 | The following deployment SQL command creates a SageMaker endpoint called 390 | `EDNPOINT_NAME` and deploys the best model of `JOB_NAME` on it. Please keep 391 | in mind, that the `ENDPOINT_NAME` is also the name of the UDF script generated 392 | for the prediction. Furthermore, you can specify a different schema 393 | (`DATABASE_PRED_SCHEMA`) for the prediction UDF script to be installed 394 | than the one in which the scripts of the Exasol SageMaker-Extension project 395 | are deployed. For more information please check the 396 | [User Guide](https://github.com/exasol/sagemaker-extension/blob/main/doc/user_guide/user_guide.md). 
397 | You can execute the deployment script with the defined variables as follows: 398 | 399 | ```sh 400 | EXECUTE SCRIPT IDA."SME_DEPLOY_SAGEMAKER_AUTOPILOT_ENDPOINT"( 401 | '', 402 | '', 403 | '', 404 | '', 405 | , 406 | '', 407 | '' 408 | ); 409 | ``` 410 | 411 | You should be able to see the created UDF script for prediction, as follows: 412 | 413 | ```sql 414 | SELECT 415 | SCRIPT_NAME, 416 | SCRIPT_LANGUAGE 417 | FROM 418 | SYS.EXA_ALL_SCRIPTS 419 | WHERE 420 | SCRIPT_SCHEMA = 'IDAPrediction' 421 | AND SCRIPT_TYPE = 'UDF' 422 | 423 | ``` 424 | 425 | |SCRIPT_NAME |SCRIPT_LANGUAGE| 426 | |-------------------------------------|---------------| 427 | |APSPredictor |PYTHON3_SME | 428 | 429 | ### 3.5 Predict via SageMaker Endpoint 430 | 431 | The Exasol SageMaker-Extension generates a prediction UDF for each model, 432 | enabling you to perform prediction on the deployed endpoint. The name of the 433 | prediction script is the same as the name of the endpoint (`ENDPOINT_NAME`) 434 | specified when creating the endpoint. 435 | 436 | The prediction UDF makes a real-time and synchronous call to the SageMaker 437 | endpoint. The prediction SQL command takes all the columns used while 438 | creating the model as inputs, appends the prediction result to these columns and 439 | the response is returned immediately. For more information, please check the 440 | [User Guide](https://github.com/exasol/sagemaker-extension/blob/main/doc/user_guide/user_guide.md). 441 | You can make prediction for this use case as follows: 442 | 443 | ```sql 444 | SELECT IDAPrediction."APSPredictor"( 445 | AA_000,AB_000,AC_000,AD_000,AE_000,AF_000,AG_000, 446 | ... 447 | EE_005,EE_006,EE_007,EE_008,EE_009,EF_000,EG_000 448 | ) FROM IDA.TEST 449 | GROUP BY IPROC(), 450 | MOD(ROWNUM, 6); 451 | ``` 452 | 453 | |AA_000 |AB_000|AC_000 |AD_000 |AE_000 |AF_000 |AG_000| ... |PREDICTIONS | 454 | |----------|------|-------------|-------|-------|-------|------|------|------------| 455 | | 79492.00| | 0.00| | 0.00| 0.00| 0.00| ... | neg| 456 | | 41026.00| | 518.00| 392.00| 0.00| 0.00| 0.00| ... | neg| 457 | | 43728.00| 0.00|2130706432.00| 144.00| 522.00| 142.00| 0.00| ... | neg| 458 | | 55896.00| | 74.00| 70.00| 0.00| 0.00| 0.00| ... | neg| 459 | | 40122.00| | 232.00| 210.00| 0.00| 0.00| 0.00| ... | neg| 460 | | ...| ...| ...| ...| ...| ...| ...| ... | ...| 461 | 462 | Please keep in mind, that you can get high efficiency by executing the prediction 463 | UDF script using the `GROUP BY IPROC()` statement, which allows you to perform 464 | predictions on each node in parallel. 465 | 466 | ### 3.6 Delete Endpoint 467 | It is important to delete the endpoint created, when you are finished with the 468 | endpoint Otherwise, the endpoint will continue to be charged. You can use the 469 | following SQL command to delete the endpoint and associated resources: 470 | 471 | ```sh 472 | EXECUTE SCRIPT IDA."SME_DELETE_SAGEMAKER_AUTOPILOT_ENDPOINT"( 473 | '', 474 | '', 475 | '' 476 | ); 477 | ``` 478 | 479 | Please note, that by the execution of the deletion SQL command, the predicted 480 | UDF script will not be deleted and will not be able to run until the endpoint 481 | is restarted. 482 | 483 | ## 4.Conclusion 484 | In this tutorial, we went through each steps of the installation and deployment 485 | of the Exasol SageMaker-Extension, and examined in detail how it works on 486 | a real-world problem. 
487 | 
488 | The Exasol SageMaker-Extension provides a simple installation via the
489 | pre-packaged releases and a functional deployment with an easy-to-use
490 | CLI tool. The SQL commands which come with the deployment enable you to create
491 | a machine learning model from the table you want using the SageMaker Autopilot
492 | service and make your predictions.
493 | 
-------------------------------------------------------------------------------- /tutorials/script-languages/README.md: --------------------------------------------------------------------------------
1 | ## Script-Language Container Tutorials
2 | 
3 | This section contains tutorials for building and customizing Script-Language Containers. Script-Language Containers are used for adding packages to Exasol UDFs. As such, they are often used in Machine Learning use cases to provide access to additional Machine Learning libraries.
4 | 
5 | * [Building and Customizing Script-Language Containers](script-languages.ipynb)
6 | 
-------------------------------------------------------------------------------- /tutorials/script-languages/bash_runner.py: --------------------------------------------------------------------------------
1 | from pexpect import replwrap, EOF
2 | import pexpect
3 | # Inspired by https://github.com/takluyver/bash_kernel
4 | class IREPLWrapper(replwrap.REPLWrapper):
5 |     """A subclass of REPLWrapper that gives incremental output
6 |     The parameters are the same as for REPLWrapper, except for one
7 |     extra parameter:
8 |     :param line_output_callback: a callback method to receive each batch
9 |       of incremental output. It takes one string parameter.
10 |     """
11 |     def __init__(self, cmd_or_spawn, orig_prompt, prompt_change,
12 |                  extra_init_cmd=None, line_output_callback=None):
13 |         self.line_output_callback = line_output_callback
14 |         replwrap.REPLWrapper.__init__(self, cmd_or_spawn, orig_prompt,
15 |                                       prompt_change, extra_init_cmd=extra_init_cmd)
16 | 
17 |     def _expect_prompt(self, timeout=-1):
18 |         if timeout == None:
19 |             # "None" means we are executing code from a Jupyter cell by way of the run_command
20 |             # in the do_execute() code below, so do incremental output.
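            # The loop below waits for the regular prompt, the continuation
            # prompt, or an end-of-line. Each completed line (match index 2) is
            # forwarded to line_output_callback immediately; any partial line
            # left in child.before is flushed once a prompt arrives.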
21 | while True: 22 | pos = self.child.expect_exact([self.prompt, self.continuation_prompt, u'\r\n'], 23 | timeout=None) 24 | if pos == 2: 25 | # End of line received 26 | self.line_output_callback(self.child.before) 27 | else: 28 | if len(self.child.before) != 0: 29 | # prompt received, but partial line precedes it 30 | self.line_output_callback(self.child.before) 31 | break 32 | else: 33 | # Otherwise, use existing non-incremental code 34 | pos = replwrap.REPLWrapper._expect_prompt(self, timeout=timeout) 35 | 36 | # Prompt received, so return normally 37 | return pos 38 | 39 | def run(code): 40 | child = pexpect.spawn("bash", echo=False, encoding='utf-8', codec_errors='replace') 41 | ps1 = replwrap.PEXPECT_PROMPT[:5] + u'\[\]' + replwrap.PEXPECT_PROMPT[5:] 42 | ps2 = replwrap.PEXPECT_CONTINUATION_PROMPT[:5] + u'\[\]' + replwrap.PEXPECT_CONTINUATION_PROMPT[5:] 43 | prompt_change = u"PS1='{0}' PS2='{1}' PROMPT_COMMAND=''".format(ps1, ps2) 44 | 45 | # Using IREPLWrapper to get incremental output 46 | bashwrapper = IREPLWrapper(child, u'\$', prompt_change, 47 | extra_init_cmd="export PAGER=cat", 48 | line_output_callback=lambda x: print(x)) 49 | bashwrapper.run_command(code.rstrip(), timeout=None) 50 | -------------------------------------------------------------------------------- /tutorials/script-languages/requirements.txt: -------------------------------------------------------------------------------- 1 | pexpect==4.8.0 2 | pyexasol==0.16.1 -------------------------------------------------------------------------------- /tutorials/script-languages/slc_main_build_steps.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
[slc_main_build_steps.svg shows the main build steps of a script-language container: udfclient_deps, language_deps, build_run, flavor_base_deps, flavor_customization, release]
-------------------------------------------------------------------------------- /tutorials/spatial-analysis/README.md: --------------------------------------------------------------------------------
1 | ## Spatial Analysis Tutorials
2 | This section contains tutorials for doing Spatial Analysis within the Exasol database. We are going to provide examples, tasks and use cases.
3 | 
4 | ### Languages:
5 | 
6 | * [Visualizing Spatial Queries](visualizing_spatial_queries)
7 | 
8 | ### Prerequisites
9 | 
10 | For general prerequisites, please refer to [Prerequisites](../../README.md).
11 | 
-------------------------------------------------------------------------------- /tutorials/spatial-analysis/visualizing_spatial_queries/README.md: --------------------------------------------------------------------------------
1 | # Exasol Spatial Demo with Jupyter Notebook
2 | 
3 | [
4 | Geospatial data](https://docs.exasol.com/sql_references/geospatialdata.htm) can be stored and analyzed in the Exasol database using the GEOMETRY datatype. In this solution, we will show you some examples of how to work with geospatial data inside a Jupyter Notebook with the help of SQL inline magic and visualize geospatial data on a map using Python libraries.
5 | 
6 | # Table of contents
7 | 
8 | 
9 | 
10 | - [Prerequisites](#prerequisites)
11 | - [Datasets](#datasets)
12 | - [Use Cases](#use-cases)
13 | - [External Resources](#external-resources)
14 | 
15 | 
16 | 
17 | ### Prerequisites
18 | 
19 | To run this demo, a working [Jupyter notebook](https://jupyter.org/install) installation with Python version 2.7 or greater is required. After installing Python and Jupyter notebook, we need the [ipython-sql library](#ipython-sql-library) to run SQL from a Jupyter notebook, the [SQL Alchemy](https://www.sqlalchemy.org/) dialect to [connect to EXASOL](#connection-to-exasol) and some additional [python libraries](#additional-python-libraries) for data visualization. The GeoJSON files containing spatial data for New York City need to be downloaded into the [geojsonfiles](geojsonfiles) folder.
20 | 
21 | #### GeoJSON files
22 | 
23 | Download the following GeoJSON files into the [geojsonfiles](geojsonfiles) folder:
24 | 
25 | 1. New York City Streets data:
26 | - https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nyc_street_data.geojson
27 | 2. New York City Borough boundaries data:
28 | - https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nycboroughboundaries.geojson
29 | 3. New York City Neighborhood boundaries data:
30 | - https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nycneighborhoods.geojson
31 | 
32 | #### IPython-sql library
33 | 
34 | The [IPython-sql library](https://github.com/catherinedevlin/ipython-sql) enables the use of Jupyter magic functions. With Jupyter magic functions, Jupyter notebooks can be used for data analysis with SQL on a database. Magic functions are pre-defined functions in the Jupyter kernel that execute supplied commands. They are prefaced with the `%` character. Usage and installation instructions can be found [here](https://github.com/catherinedevlin/ipython-sql). After installation, run the following command:
35 | 
36 | ```mysql
37 | %load_ext sql
38 | ```
39 | 
40 | #### Connection to EXASOL
41 | 
42 | To connect to EXASOL, install the [SQLAlchemy](https://www.sqlalchemy.org/) dialect for the EXASOL database.
Installation instructions and project details can be found [here](https://pypi.org/project/sqlalchemy-exasol/) 43 | 44 | After installation, connect to EXASOL using the following command: 45 | 46 | ```mysql 47 | %sql exa+pyodbc://USER:PASSWORD@DSN 48 | ``` 49 | 50 | DSN should point to your ODBC installation. For EXASOL6.2 ODBC download and installation details visit [EXASOL ODBC installation](https://www.exasol.com/portal/display/DOWNLOAD/6.2) 51 | 52 | #### Additional Python libraries 53 | 54 | Additional python libraries are used to process and visualize geospatial data. We make use of the following python libraries for this demo: 55 | 56 | 1. [Folium](https://pypi.org/project/folium/) 57 | 2. [Pandas](https://pandas.pydata.org/pandas-docs/stable/install.html) 58 | 3. [GeoJSON](https://pypi.org/project/geojson/) 59 | 4. [JSON](https://docs.python.org/3/library/json.html) 60 | 5. [Requests](https://pypi.org/project/requests/) 61 | 62 | #### Jupter Notebook extensions 63 | 64 | Extensions allow to enhance features of Jupyter Notebook. They are easy to install and configure using the `Nbextensions configuration` page. We have used two extensions in our demo. Remember that the purpose of these extensions is to help visualize the results and are not required to run the [visualizing_spatial_queries.ipynb](visualizing_spatial_queries.ipynb) demo. 65 | 66 | Installation and configuration details for these extensions can be found [here](https://github.com/ipython-contrib/jupyter_contrib_nbextensions) 67 | 68 | ##### Hide Input 69 | 70 | This extension allows hiding of an individual cell. All the code segments that are not necessary for this particular demo are hidden for better visualization and usability. 71 | 72 | ##### Limit Output 73 | 74 | Limits the output of a cell. This comes in handy as large result outputs can break the notebook. Limiting the output makes it easy to render results. 75 | 76 | ### Datasets 77 | 78 | For the purpose of this demo we use `NYC_UBER` and `NYC_TAXI` schemas from `demodb.exasol.com`. 79 | 80 | Use the following command to open a schema 81 | 82 | ```mysql 83 | %sql open schema SCHEMA_NAME 84 | ``` 85 | 86 | Uber pickups data is stored in `UBER_TAXI_DATA` table in `NYC_UBER` schema. Use `DESCRIBE` to get an overview of this table 87 | 88 | ```mysql 89 | %sql describe NYC_UBER.UBER_TAXI_DATA 90 | ``` 91 | 92 | New York City Taxi pickups data is stored in `TRIPS` table in `NYC_TAXI schema`. Use `DESCRIBE` to get an overview of this table 93 | 94 | ```mysql 95 | %sql describe NYC_TAXI.TRIPS 96 | ``` 97 | 98 | ### Use Cases 99 | 100 | Let's go briefly through the use cases implemented in [visualizing_spatial_queries.ipynb](visualizing_spatial_queries.ipynb) 101 | 102 | #### Uber pickups grouped by New York City Boroughs 103 | 104 | In the first use case, we use New York City data in `NYC_UBER` schema to show Uber pickups per borough in New York City. We use Uber pickups data and NYC borough data to query the total number of Uber pickups per borough using inline SQL magic. To visualize New York City borough boundaries we use [New York City Borough boundaries](#GeoJSON-files) dataset. 105 | 106 | #### Uber pickups grouped by New York City Neighborhoods 107 | 108 | In the second use case, we use New York City data in `NYC_UBER` schema to show Uber pickups per neighborhood in New York City. We use Uber pickups data and NYC neighborhood data to query the total number of Uber pickups per neighborhood using inline SQL magic. 
To visualize New York City neighborhood boundaries we use the [New York City Neighborhood boundaries](#GeoJSON-files) dataset.
109 | 
110 | #### New York City Streets with highest Uber pickups
111 | 
112 | In the third use case, we use NYC street data and NYC Uber pickup data to visualize the top streets according to the number of pickups. This data is stored in our demo database in the `NYC_UBER` schema. To visualize New York City streets we use the [New York City Streets](#GeoJSON-files) dataset. The example query can be parameterized to view different results on the map by providing a value for the variable `NumberOfStreets`.
113 | 
114 | #### Comparison of Yellow Taxi and Uber pickups within a certain radius of a location in New York City
115 | 
116 | In the fourth use case, we make a comparison between the number of Uber and Yellow Taxi pickups. For this example we have selected the **Museum of the City of New York** in Manhattan as a pickup point. We have used geocoding to find the latitude and longitude values of a given location. We have Uber data from April to September 2014. By changing the value for `month` within this range we can visualize different sets of geospatial data on the map. `Radius` defines the radius value around the given lat/long point. For speed purposes it is recommended to keep the radius value small.
117 | 
118 | ### External Resources
119 | 
120 | The GeoJSON files used for this demo were obtained from the following sources:
121 | 
122 | - NYC borough boundary polygons: http://data.beta.nyc/dataset/nyc-borough-boundaries
123 | - NYC neighborhood boundary polygons: http://data.beta.nyc/dataset/nyc-neighborhood-boundaries
124 | - NYC streets multi-line data: https://data.cityofnewyork.us/City-Government/NYC-Street-Centerline-CSCL-/exjm-f27b
125 | 
126 | Currently these external resources are unavailable; therefore you need to download the required [GeoJSON files](#GeoJSON-files) listed above.
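If you prefer to script the download, the short sketch below fetches the three GeoJSON files listed in the Prerequisites section into the [geojsonfiles](geojsonfiles) folder. It is only a convenience example using the `requests` library already required above; the URLs are the Google Storage links from this README.

```python
import os
import requests

# Google Storage mirrors listed in the Prerequisites section of this README.
GEOJSON_URLS = [
    "https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nyc_street_data.geojson",
    "https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nycboroughboundaries.geojson",
    "https://storage.googleapis.com/exasol_data_science_examples_data/visualizing_spatial_queries/geojsonfiles/nycneighborhoods.geojson",
]

os.makedirs("geojsonfiles", exist_ok=True)
for url in GEOJSON_URLS:
    target = os.path.join("geojsonfiles", url.rsplit("/", 1)[-1])
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(target, "wb") as geojson_file:
        geojson_file.write(response.content)
    print(f"Downloaded {target}")
```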
127 | 128 | -------------------------------------------------------------------------------- /tutorials/spatial-analysis/visualizing_spatial_queries/geojsonfiles/README.md: -------------------------------------------------------------------------------- 1 | All GeoSJON files required for the spatial demo are downloaded here 2 | -------------------------------------------------------------------------------- /tutorials/spatial-analysis/visualizing_spatial_queries/visualizing_spatial_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Spatial demo with Jupyter Notebook and Exasol" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Prerequsities" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Installing all python libraries required for this demo" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "hide_input": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from IPython import get_ipython\n", 33 | "if get_ipython() is None:\n", 34 | " from IPython.core.interactiveshell import InteractiveShell\n", 35 | " InteractiveShell.instance()\n", 36 | "!pip install ipython-sql sqlalchemy-exasol folium pandas geojson requests jupyter_contrib_nbextensions geopy" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Importing all required installed libraries to Jupyter Notebook" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 1, 49 | "metadata": { 50 | "hide_input": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "import folium\n", 55 | "import pandas as pd\n", 56 | "import os\n", 57 | "import geojson\n", 58 | "import warnings\n", 59 | "import requests as r\n", 60 | "import json\n", 61 | "import geopy.geocoders\n", 62 | "from geopy.geocoders import Nominatim\n", 63 | "warnings.filterwarnings('ignore')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "Load Jupyter magic functions" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": { 77 | "hide_input": false 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "%reload_ext sql" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Enter user credentials" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "EXA_USER = \"\"\n", 98 | "EXA_PWD = \"\"\n", 99 | "DSN = \"exadb\"" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Connect to DSN" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "'Connected: @None'" 118 | ] 119 | }, 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "%sql exa+pyodbc://{EXA_USER}:{EXA_PWD}@{DSN}" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "Set query cache off" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "%sql alter session set query_cache='off';" 
143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Datasets" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "#### Open schema NYC_UBER" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "%sql open schema NYC_UBER" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Overview of UBER_TAXI_DATA table" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "%sql describe NYC_UBER.UBER_TAXI_DATA" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Count of uber pickup records in UBER_TAXI_DATA table" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "%sql select count(*) from NYC_UBER.UBER_TAXI_DATA" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "Date and time range of uber pickups " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": { 211 | "hide_input": false 212 | }, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "1 rows affected.\n" 219 | ] 220 | }, 221 | { 222 | "data": { 223 | "text/html": [ 224 | "\n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | "
start_date | end_date
2014-04-01 00:00:00 | 2014-09-30 22:59:00
" 234 | ], 235 | "text/plain": [ 236 | "[(datetime.datetime(2014, 4, 1, 0, 0), datetime.datetime(2014, 9, 30, 22, 59))]" 237 | ] 238 | }, 239 | "execution_count": 7, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "%sql select min(DATETIME) as START_DATE,max(DATETIME) as END_DATE from NYC_UBER.UBER_TAXI_DATA" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "#### Open schema NYC_TAXI" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "%sql open schema NYC_TAXI" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "Overview of TRIPS table" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "%sql describe NYC_TAXI.TRIPS" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "Count of yellow taxi pickups records in TRIPS table" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "%sql select count(*) from NYC_TAXI.TRIPS where CAB_TYPE_ID=1" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Date and time range of New York City yellow taxi pickups " 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 8, 306 | "metadata": { 307 | "hide_input": false 308 | }, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "1 rows affected.\n" 315 | ] 316 | }, 317 | { 318 | "data": { 319 | "text/html": [ 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | "
start_date | end_date
2009-01-01 00:00:00 | 2017-06-30 23:59:59
" 330 | ], 331 | "text/plain": [ 332 | "[(datetime.datetime(2009, 1, 1, 0, 0), datetime.datetime(2017, 6, 30, 23, 59, 59))]" 333 | ] 334 | }, 335 | "execution_count": 8, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "%sql select min(PICKUP_DATETIME) as START_DATE,max(PICKUP_DATETIME) as END_DATE from NYC_TAXI.TRIPS" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "## Use Cases\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Uber pickups grouped by New York City boroughs" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "In the first use case, we use `DISJUNCT_NIGHBORHOODS` and `UBER_TAXI_DATA` tables from `NYC_UBER` schema to visualize uber pickups grouped by boroughs. The geometry column in `DISJUNCT_NEIGHBORHOODS` table contains polygons for boroughs while the geometry column in `UBER_TAXI_DATA` table from `NYC_UBER` schema contains pickup points " 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "Geometry column of type `Polygon` in `DISJUNCT_NEIGHBORHOODS`table " 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "hide_input": false, 377 | "scrolled": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "%sql select THE_GEOM from NYC_UBER.DISJUNCT_NEIGHBORHOODS limit 1" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "Geometry column of type `POINT` in `UBER_TAXI_DATA` table" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "%sql select THE_GEOM from NYC_UBER.UBER_TAXI_DATA limit 1" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "Exasol automatically creates indices for equality join conditions, even when expressions are used for comparison.\n", 405 | "Exasol 6.1 introduced indices on geospatial data types for joins using geospatial functions like ST_CONTAINS or ST_INTERSECTS. In this use case we use `ST_CONTAINS` function to join table `DISJUNCT_NEIGHBORHOODS` with `UBER_TAXI_DATA` on geometry columns grouped by New York City boroughs " 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "`%%time` is a cell magic function used here to calculate query execution time. `Wall time` gives the total of query runtime and cell rendering time (negligible)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 9, 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "5 rows affected.\n", 425 | "Wall time: 56.7 ms\n" 426 | ] 427 | }, 428 | { 429 | "data": { 430 | "text/html": [ 431 | "\n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | "
borough_id | pickups
Manhattan | 3443402
Brooklyn | 593648
Queens | 342186
Bronx | 31589
Staten Island | 1034
" 457 | ], 458 | "text/plain": [ 459 | "[('Manhattan', 3443402),\n", 460 | " ('Brooklyn', 593648),\n", 461 | " ('Queens', 342186),\n", 462 | " ('Bronx', 31589),\n", 463 | " ('Staten Island', 1034)]" 464 | ] 465 | }, 466 | "execution_count": 9, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "%%time\n", 473 | "%sql select borough_id, count(*) as pickups FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n INNER JOIN NYC_UBER.UBER_TAXI_DATA t ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM) group by borough_id order by pickups desc" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "Visualizing geospatial data of uber pickups grouped by New York City boroughs " 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "hide_input": true, 488 | "scrolled": false 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "#--- to be removed if direct links to http://data.beta.nyc works again---#\n", 493 | "\n", 494 | "nyc_boroughs = 'geojsonfiles/nycboroughboundaries.geojson'\n", 495 | "\n", 496 | "#nyc_boroughs = \"http://data.beta.nyc//dataset/68c0332f-c3bb-4a78-a0c1-32af515892d6/resource/7c164faa-4458-4ff2-9ef0-09db00b509ef/download/42c737fd496f4d6683bba25fb0e86e1dnycboroughboundaries.geojson\"\n", 497 | "\n", 498 | "borough_pickups_sql = %sql select borough_id, count(*) as pickups FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n INNER JOIN NYC_UBER.UBER_TAXI_DATA t ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM) group by borough_id order by pickups desc\n", 499 | "borough_pickups_df = borough_pickups_sql.DataFrame()\n", 500 | "\n", 501 | "#base map\n", 502 | "m1 = folium.Map([40.7586,-73.9706], zoom_start=10)\n", 503 | "\n", 504 | "# Choropleth:\n", 505 | "# geo_data: data of borough polygons\n", 506 | "# Columns: 1st column is key (Borough) and 2nd column is value(total number of pickups)\n", 507 | "# Key_on: Variable in the GeoJSON file to bind the data to\n", 508 | "# bins = width between values\n", 509 | "choropleth = folium.Choropleth(geo_data=nyc_boroughs,name = 'choropleth', data = borough_pickups_df, columns = ['borough_id','pickups'],key_on='feature.properties.borough', fill_color='YlGnBu',bins=[1,100,300000,500000,600000,3500000],fill_opacity = 0.5,nan_fill_color='yellow' ,legend_name='Number of pickups', highlight=True).add_to(m1)\n", 510 | "\n", 511 | "#hover over to view tooltip with borough name \n", 512 | "choropleth.geojson.add_child(\n", 513 | " folium.features.GeoJsonTooltip(['borough'])\n", 514 | ")\n", 515 | "\n", 516 | "# We can also export this interactive map to results/...html file\n", 517 | "#m.save(os.path.join('results', 'GeoJSONWithoutTitles_2.html'))\n", 518 | "\n", 519 | "# display map \n", 520 | "display(m1)" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": { 526 | "hide_input": true 527 | }, 528 | "source": [ 529 | "### Uber pickups grouped by New York City Neighborhoods" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": { 535 | "hide_input": false 536 | }, 537 | "source": [ 538 | "In the second use case, we use `DISJUNCT_NIGHBORHOODS` and `UBER_TAXI_DATA` tables from `NYC_UBER` schema to visualize uber pickups grouped by neighborhoods. 
The geometry column in `DISJUNCT_NEIGHBORHOODS` table contains polygons for neighborhoods while the geometry column in `UBER_TAXI_DATA` table from `NYC_UBER` schema contains pickup points " 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "Similar to the previous use case, we join neighborhood polygons with uber pickup points using `ST_CONTAINS` function to count total uber pickups grouped by New York City neighborhoods" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": { 552 | "hide_input": false, 553 | "scrolled": false 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "%%time\n", 558 | "%sql select neighborhood,count(*) as pickups FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n INNER JOIN NYC_UBER.UBER_TAXI_DATA t ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM) group by neighborhood order by pickups desc limit 10" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "Visualizing geospatial data of uber pickups grouped by New York City neighborhoods" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": { 572 | "hide_input": true, 573 | "scrolled": false 574 | }, 575 | "outputs": [], 576 | "source": [ 577 | "#--- to be removed if direct links to data.beta.nyc works ---#\n", 578 | "\n", 579 | "nyc_neighborhoods = 'geojsonfiles/nycneighborhoods.geojson'\n", 580 | "\n", 581 | "#nyc_neighborhoods = \"http://data.beta.nyc//dataset/0ff93d2d-90ba-457c-9f7e-39e47bf2ac5f/resource/35dd04fb-81b3-479b-a074-a27a37888ce7/download/d085e2f8d0b54d4590b1e7d1f35594c1pediacitiesnycneighborhoods.geojson\"\n", 582 | "neighborhood_pickups_sql = %sql select neighborhood,count(*) as pickups FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n INNER JOIN NYC_UBER.UBER_TAXI_DATA t ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM) group by neighborhood order by pickups desc \n", 583 | "neighborhood_pickups_df = neighborhood_pickups_sql.DataFrame()\n", 584 | "\n", 585 | "#base map\n", 586 | "m2 = folium.Map([40.7586,-73.9706], zoom_start=10)\n", 587 | "\n", 588 | "# Choropleth:\n", 589 | "# geo_data: data of borough polygons\n", 590 | "# Columns: 1st column is key (Neighborhood) and 2nd column is value(total number of pickups). 
\n", 591 | "# Key_on: Variable in the GeoJSON file to bind the data to\n", 592 | "# bins = width between values\n", 593 | "# nan_fill_colors= yellow for neighborhoods with no pickup data\n", 594 | "# For a detailed reference see https://python-visualization.github.io/folium/modules.html#Extra_Features\n", 595 | "\n", 596 | "choropleth = folium.Choropleth(geo_data=nyc_neighborhoods,name = 'choropleth', data = neighborhood_pickups_df, columns = ['neighborhood','pickups'],key_on='feature.properties.neighborhood', fill_color='YlOrRd',fill_opacity = 0.5, legend_name='Number of pickups',nan_fill_color='yellow',nan_fill_opacity=0.4,bins=[1,560,114694,181979,349255,666970],highlight=True).add_to(m2)\n", 597 | "\n", 598 | "choropleth.geojson.add_child(\n", 599 | " folium.features.GeoJsonTooltip(['neighborhood'])\n", 600 | ")\n", 601 | "\n", 602 | "\n", 603 | "# We can also export this interactive map to results/...html file\n", 604 | "#m.save(os.path.join('results', 'GeoJSONWithoutTitles_2.html'))\n", 605 | "\n", 606 | "# display map with choropleth\n", 607 | "display(m2)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "hide_input": true 614 | }, 615 | "source": [ 616 | "### New York City Streets with highest Uber pickups" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "In this use case, we use NYC street data and NYC Uber pickup data to visualize top streets according to number of pickups. Lets have a look at `STREETS` table from `NYC_UBER` schema. `PHYSICALID`, `THE_GEOM` and `ST_NAME` are columns used for this demo" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "%sql describe NYC_UBER.STREETS" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "Before querying for New York City streets with highest Uber pickups, we create a view from `STREETS` table and transform the geometry column from a spherical coordinate system(SRID:4326) to a Mercator cordinate system(SRID:3857) using `ST_TRANSFORM` function. The transformation from 4326 WGS84 (spherical coordinates) to 3857 (Google) Mercator has the advantage that for Mercator, distance is measured in meters (in contrast to 4326 where distance is measured in degrees). Mercator is used by most of the map services including OpenStreetMap (used in this demo). After transformation a buffer of 50 meters is added around the street geometry column using `ST_BUFFER` function to account for positioning inaccuracy. A snapshot of `STREETS_TRANSFORMED` view:" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": { 645 | "hide_input": false 646 | }, 647 | "source": [ 648 | "``` mysql\n", 649 | "CREATE OR REPLACE VIEW \"NYC_UBER\".\"STREETS_TRANSFORMED\" as select\n", 650 | "...\n", 651 | "ST_BUFFER(ST_TRANSFORM(THE_GEOM, 3857),50) as THE_GEOM,\n", 652 | "...\n", 653 | "from NYC_UBER.STREETS;\n", 654 | "```" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "Similary the geometry column from `UBER_TAXI_DATA` is transformed to Mercator using `ST_TRANSFORM` function. 
A snapshot of the transformed `UBER_TAXI_DATA_TRANSFORMED` view: " 662 | ] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "``` mysql\n", 669 | "CREATE OR REPLACE VIEW \"NYC_UBER\".\"UBER_TAXI_DATA_TRANSFORMED\"\n", 670 | "...\n", 671 | "as select DATETIME,LAT,LON, BASE, ST_TRANSFORM(the_geom,3857) as the_geom \n", 672 | "...\n", 673 | "from UBER_TAXI_DATA;\n", 674 | "```\n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "Select the number of streets with highest pickups to view on map" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "NumberOfStreets = 5" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "EXASOL query joins the views based on the geometry columns using `ST_CONTAINS`" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": { 704 | "scrolled": false 705 | }, 706 | "outputs": [], 707 | "source": [ 708 | "%%time\n", 709 | "%sql select s.full_stree as street_name,count(*) as pickups from (select * from \"NYC_UBER\".\"STREETS_TRANSFORMED\" order by false) s INNER JOIN (select * from \"NYC_UBER\".\"UBER_TAXI_DATA_TRANSFORMED\" order by false) t ON ST_CONTAINS(s.the_geom,t.the_geom) group by s.full_stree order by pickups desc limit $NumberOfStreets" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": {}, 715 | "source": [ 716 | "Visualizing geospatial data of uber pickups grouped by New York City streets" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": { 723 | "hide_input": true, 724 | "scrolled": false 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "#---- to be removed it API call gets fixed: atm returns only 1000 rows -----#\n", 729 | "\n", 730 | "#data = open('C:/Users/smha/GeoSpatialViz/geojsonfiles/nyc_street_data.geojson','r')\n", 731 | "#jsondata = json.loads(data)\n", 732 | "path = 'geojsonfiles/nyc_street_data.geojson'\n", 733 | "with open(path) as f:\n", 734 | " data = geojson.load(f)\n", 735 | "features = data['features'][0]\n", 736 | "\n", 737 | "#---------------------------------------------------------------------------#\n", 738 | "#--------------- Instead use API call to data endpoint --------------------#\n", 739 | "\n", 740 | "# url = 'https://data.cityofnewyork.us/resource/gr6w-nsbv.json'\n", 741 | "\n", 742 | "# # get json street data by from NYC city data API\n", 743 | "# req = r.get('https://data.cityofnewyork.us/resource/gr6w-nsbv.json')\n", 744 | "# jsondata = json.loads(req.text)\n", 745 | "\n", 746 | "\n", 747 | "# # convert json to geojson for folium choropleth\n", 748 | "# GeoJSON = []\n", 749 | "# for i in range(0,len(jsondata)-1):\n", 750 | "# GeoJSON.append(\n", 751 | "# {\n", 752 | "# \"type\": \"Feature\", \n", 753 | "# \"properties\":\n", 754 | "# {\n", 755 | "# \"physicalid\": jsondata[i][\"physicalid\"],\n", 756 | "# \"full_stree\": jsondata[i][\"full_stree\"],\n", 757 | "# },\n", 758 | "# \"geometry\": jsondata[i]['the_geom'],\n", 759 | "# } )\n", 760 | " \n", 761 | "# GeoJSON[0]\n", 762 | "# data= {\"type\": \"FeatureCollection\",\"features\": GeoJSON }\n", 763 | "\n", 764 | "#top street SQL inline magic + EXASOL query\n", 765 | "top5streets_sql = %sql select s.full_stree,count(*) as pickups from (select * from \"NYC_UBER\".\"STREETS_TRANSFORMED\" order by false) s 
INNER JOIN (select * from \"NYC_UBER\".\"UBER_TAXI_DATA_TRANSFORMED\" order by false) t ON ST_CONTAINS(s.the_geom,t.the_geom) group by s.full_stree order by pickups desc limit $NumberOfStreets\n", 766 | "top5streets_df = top5streets_sql.DataFrame()\n", 767 | "\n", 768 | "# cast the top street names ('full_stree') to type string\n", 769 | "top5streets_df['full_stree'] = top5streets_df.full_stree.astype(str)\n", 770 | "\n", 771 | "# save the 'full_stree' column from the top streets dataframe for the next steps\n", 772 | "dfList = list(top5streets_df['full_stree'])\n", 773 | "\n", 774 | "# match full street column names with street names from the GeoJSON and save the corresponding coordinates to a list \n", 775 | "l = list() \n", 776 | "for i in range(0,len(data['features'])): \n", 777 | " if data['features'][i]['properties']['full_stree'] in dfList: \n", 778 | " l.append(data['features'][i])\n", 779 | " \n", 780 | "\n", 781 | "# keep only the selected street features in the GeoJSON data\n", 782 | "data['features'] = l\n", 783 | "streetdata = json.dumps(data)\n", 784 | "#base map\n", 785 | "\n", 786 | "m3 = folium.Map([40.7586,-73.9706], zoom_start=12)\n", 787 | "\n", 788 | "# Choropleth\n", 789 | "# geo_data: GeoJSON data of the selected street geometries\n", 790 | "# Key_on: Variable in the GeoJSON file to bind the data to\n", 791 | "# bins = width bins between values\n", 792 | "# For a detailed reference see https://python-visualization.github.io/folium/modules.html#Extra_Features\n", 793 | "\n", 794 | "choropleth = folium.Choropleth(geo_data=streetdata,name = 'choropleth',key_on='feature.properties.full_stree', fill_color='YlGnBu',line_color = 'blue', line_weight= 5 , highlight=True).add_to(m3)\n", 795 | "\n", 796 | "choropleth.geojson.add_child(\n", 797 | " folium.features.GeoJsonTooltip(['full_stree'])\n", 798 | ")\n", 799 | "\n", 800 | "# We can also export this interactive map to results/...html file\n", 801 | "# m3.save(os.path.join('results', 'GeoJSONWithoutTitles_5.html'))\n", 802 | "\n", 803 | "# display map with choropleth\n", 804 | "display(m3)" 805 | ] 806 | }, 807 | { 808 | "cell_type": "markdown", 809 | "metadata": { 810 | "hide_input": true 811 | }, 812 | "source": [ 813 | "### Comparison of Taxi and Uber pickups within a certain radius of a location in New York City " 814 | ] 815 | }, 816 | { 817 | "cell_type": "markdown", 818 | "metadata": { 819 | "hide_input": true 820 | }, 821 | "source": [ 822 | "The following use case compares the number of Uber and Yellow Taxi pickups. For this example we have selected the `Museum of the City of New York` in Manhattan as a pickup point. We have used geocoding to find the latitude and longitude values of a given location. To visualize geospatial data on the map for a different location, change the value of the `pos` variable. The value of `month` can be adjusted to visualize different results. `radius` defines the radius (in meters) around the given lat/long point. For speed purposes it is recommended to keep the radius value small. 
" 823 | ] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "metadata": {}, 828 | "source": [ 829 | "Assigning query parameters " 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [ 838 | "pos = \"Museum of the City of New York\"\n", 839 | "geolocator = Nominatim()\n", 840 | "geo = geolocator.geocode(pos, timeout=None) \n", 841 | "location_latitude = geo.latitude\n", 842 | "location_longitude = geo.longitude\n", 843 | "month = 6\n", 844 | "radius = 100\n", 845 | "geo_point = f\"\\'POINT({location_longitude} {location_latitude})\\'\"" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "metadata": { 851 | "hide_input": true 852 | }, 853 | "source": [ 854 | "`ST_SETSRID` geospatial function is used to set the SRID(Spatial reference system identifier) of the given `geo_point`" 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": {}, 860 | "source": [ 861 | "``` mysql\n", 862 | "st_setsrid($geo_point,4326)\n", 863 | "```" 864 | ] 865 | }, 866 | { 867 | "cell_type": "markdown", 868 | "metadata": {}, 869 | "source": [ 870 | "After setting the SRID, the given `geopoint` is transformed to Mercator using `ST_TRANSFORM` function. To count the number of pickups within the radius of the given `geopoint` we use `ST_DISTANCE` function. `ST_DISTANCE` function calculates the distance between two geospatial points. " 871 | ] 872 | }, 873 | { 874 | "cell_type": "markdown", 875 | "metadata": {}, 876 | "source": [ 877 | "``` mysql\n", 878 | "st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < 100\n", 879 | "```" 880 | ] 881 | }, 882 | { 883 | "cell_type": "markdown", 884 | "metadata": {}, 885 | "source": [ 886 | "Querying EXASOL to list pickup points for New York City yellow taxi given the above parameters" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": null, 892 | "metadata": { 893 | "scrolled": false 894 | }, 895 | "outputs": [], 896 | "source": [ 897 | "%%time\n", 898 | "%sql select pickup_latitude, pickup_longitude from nyc_taxi.trips where id in (select id from nyc_uber.nyc_taxi_with_point where st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < $radius and year(pickup_date)=2014 and month(pickup_date)=$month) and CAB_TYPE_ID=1 " 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "Querying EXASOL to list pickup points for Uber given the above parameters" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "%%time\n", 915 | "%sql select lat,lon from nyc_uber.uber_taxi_data_transformed where st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < $radius and year(datetime)=2014 and month(datetime)=$month" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": { 921 | "hide_input": false 922 | }, 923 | "source": [ 924 | "Visualizing geospatial data comparing Taxi and Uber pickups within a certain radius of a location in New York City" 925 | ] 926 | }, 927 | { 928 | "cell_type": "code", 929 | "execution_count": null, 930 | "metadata": { 931 | "hide_input": true 932 | }, 933 | "outputs": [], 934 | "source": [ 935 | "nyc_jfk = %sql select pickup_latitude, pickup_longitude from nyc_taxi.trips where id in (select id from nyc_uber.nyc_taxi_with_point where st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < $radius and 
year(pickup_date)=2014 and month(pickup_date)=$month)\n", 936 | "taxi_df = nyc_jfk.DataFrame()\n", 937 | "\n", 938 | "uber_JFK = %sql select * from nyc_uber.uber_taxi_data_transformed where st_distance(st_transform(st_setsrid($geo_point,4326),3857),the_geom) < $radius and year(datetime)=2014 and month(datetime)=$month\n", 939 | "uber_df = uber_JFK.DataFrame()\n", 940 | "\n", 941 | "#base map\n", 942 | "emp_m = folium.Map([location_latitude,location_longitude], zoom_start=20)\n", 943 | "\n", 944 | "# Add markers for the taxi and Uber pickup points to the map object \n", 945 | "for i in range(0,taxi_df.shape[0]):\n", 946 | " folium.Marker([taxi_df.iloc[i]['pickup_latitude'], taxi_df.iloc[i]['pickup_longitude']],icon=folium.Icon(color='orange', icon='taxi')).add_to(emp_m)\n", 947 | "for i in range(0,uber_df.shape[0]):\n", 948 | " folium.Marker([uber_df.iloc[i]['lat'], uber_df.iloc[i]['lon']]).add_to(emp_m) \n", 949 | "\n", 950 | "display(emp_m)" 951 | ] 952 | } 953 | ], 954 | "metadata": { 955 | "kernelspec": { 956 | "display_name": "Python 3", 957 | "language": "python", 958 | "name": "python3" 959 | }, 960 | "language_info": { 961 | "codemirror_mode": { 962 | "name": "ipython", 963 | "version": 3 964 | }, 965 | "file_extension": ".py", 966 | "mimetype": "text/x-python", 967 | "name": "python", 968 | "nbconvert_exporter": "python", 969 | "pygments_lexer": "ipython3", 970 | "version": "3.6.7" 971 | } 972 | }, 973 | "nbformat": 4, 974 | "nbformat_minor": 2 975 | } 976 | --------------------------------------------------------------------------------
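The notebook above runs every query through the `%sql` cell magic inside Jupyter. As a supplementary sketch of the same `ST_CONTAINS` borough aggregation run outside a notebook, the snippet below uses pyexasol to pull the result straight into a pandas DataFrame. The DSN, user and password are placeholders (not values from the tutorial), and this is only one possible way to wire it up.

```python
# Minimal sketch: run the ST_CONTAINS borough aggregation via pyexasol.
# The connection parameters below are placeholders -- adjust them to your Exasol instance.
import pyexasol

conn = pyexasol.connect(dsn="localhost:8563", user="sys", password="exasol", compression=True)

query = """
    SELECT borough_id, COUNT(*) AS pickups
    FROM NYC_UBER.DISJUNCT_NEIGHBORHOODS n
    INNER JOIN NYC_UBER.UBER_TAXI_DATA t
        ON ST_CONTAINS(n.THE_GEOM, t.THE_GEOM)
    GROUP BY borough_id
    ORDER BY pickups DESC
"""

# export_to_pandas streams the result set into a pandas DataFrame,
# equivalent to the .DataFrame() conversion used in the notebook cells.
borough_pickups_df = conn.export_to_pandas(query)
print(borough_pickups_df)

conn.close()
```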
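The last use case geocodes the pickup location with `Nominatim()`. Depending on the installed geopy version, `Nominatim` expects an explicit `user_agent`; the sketch below shows the geocoding step with such an agent string set. The agent value is arbitrary and only illustrative.

```python
# Sketch: geocode the pickup location with geopy.
# Recent geopy releases expect a user_agent string; the value here is arbitrary.
from geopy.geocoders import Nominatim

pos = "Museum of the City of New York"
geolocator = Nominatim(user_agent="exasol-spatial-demo")
geo = geolocator.geocode(pos, timeout=None)

location_latitude, location_longitude = geo.latitude, geo.longitude
print(location_latitude, location_longitude)
```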
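The radius comparison builds its filter from the `geo_point`, `radius` and `month` variables via `$` substitution in the `%sql` magic. As a rough illustration of what the fully substituted Uber query looks like when assembled by hand, the sketch below builds the same `ST_SETSRID`/`ST_TRANSFORM`/`ST_DISTANCE` predicate with an f-string; the longitude/latitude values are example placeholders, not the geocoded museum location.

```python
# Sketch: assemble the radius filter by hand instead of relying on %sql's $-substitution.
# The longitude/latitude values below are illustrative placeholders.
location_longitude, location_latitude = -73.9493, 40.7923
radius = 100   # meters, because the geometries were transformed to SRID 3857 (Mercator)
month = 6

geo_point = f"'POINT({location_longitude} {location_latitude})'"

uber_query = (
    "SELECT lat, lon "
    "FROM nyc_uber.uber_taxi_data_transformed "
    f"WHERE ST_DISTANCE(ST_TRANSFORM(ST_SETSRID({geo_point}, 4326), 3857), the_geom) < {radius} "
    f"AND YEAR(datetime) = 2014 AND MONTH(datetime) = {month}"
)
print(uber_query)
```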