├── 00-env-setup ├── main.tf ├── variables.tf └── versions.tf ├── 01-datasets ├── customer_churn_score_data.csv └── customer_churn_train_data.csv ├── 02-scripts ├── airflow │ └── pipeline.py ├── bash │ ├── build-container-image.sh │ └── mnbs-exec-post-startup.sh ├── cloud-functions │ ├── function-source.zip │ ├── main.py │ └── requirements.txt └── pyspark │ ├── batch_scoring.py │ ├── common_utils.py │ ├── hyperparameter_tuning.py │ ├── model_training.py │ └── preprocessing.py ├── 03-notebooks ├── pyspark │ ├── Dataproc-Spark-Servereless-Batch-PythonSDK-Sample.ipynb │ ├── batch_scoring.ipynb │ ├── chicago-crimes-analytics.ipynb │ ├── hyperparameter_tuning.ipynb │ ├── model_training.ipynb │ ├── preprocessing.ipynb │ └── vertex_scoring_preprocessor.ipynb └── vai-pipelines │ └── customer_churn_training_pipeline.ipynb ├── 04-templates ├── batch_scoring.ipynb ├── customer_churn_training_pipeline.ipynb ├── customer_churn_vai_pipeline_template.json ├── hyperparameter_tuning.ipynb ├── mnbs-exec-post-startup.sh ├── model_training.ipynb ├── preprocessing.ipynb └── umnbs-exec-post-startup.sh ├── 05-lab-guide ├── Module-01-Environment-Provisioning.md ├── Module-02-Spark-IDE-on-GCP.md ├── Module-03-Author-ML-Experiments-With-Spark-Notebooks.md ├── Module-04-Author-ML-PySpark-Scripts.md ├── Module-05-Author-Vertex-AI-Pipeline.md ├── Module-06-Author-CloudFunction-For-Vertex-AI-Pipeline.md ├── Module-07-Schedule-VertexAI-Pipeline.md ├── Module-08-Orchestrate-Batch-Scoring.md └── Services-Created.md ├── 06-images ├── .DS_Store ├── landing-page-01.png ├── landing-page-02.png ├── landing-page-03.png ├── landing-page-04.png ├── module-1-bq-01.png ├── module-1-bq-02.png ├── module-1-cloud-function-01.png ├── module-1-cloud-function-02.png ├── module-1-cloud-function-03.png ├── module-1-cloud-function-04.png ├── module-1-cloud-function-05.png ├── module-1-cloud-function-06.png ├── module-1-cloud-scheduler-01.png ├── module-1-cloud-scheduler-02.png ├── module-1-cloud-scheduler-03.png ├── module-1-cloud-scheduler-04.png ├── module-1-cloud-scheduler-05.png ├── module-1-composer-01.png ├── module-1-composer-02.png ├── module-1-composer-03.png ├── module-1-composer-04.png ├── module-1-composer-05.png ├── module-1-composer-06.png ├── module-1-composer-07.png ├── module-1-composer-08.png ├── module-1-gcr-01.png ├── module-1-gcr-02.png ├── module-1-gcr-03.png ├── module-1-iam-01.png ├── module-1-iam-02.png ├── module-1-iam-03.png ├── module-1-iam-04.png ├── module-1-networking-01.png ├── module-1-networking-02.png ├── module-1-networking-03.png ├── module-1-networking-04.png ├── module-1-networking-05.png ├── module-1-networking-06.png ├── module-1-networking-07.png ├── module-1-networking-08.png ├── module-1-phs-01.png ├── module-1-phs-02.png ├── module-1-phs-03.png ├── module-1-phs-04.png ├── module-1-phs-05.png ├── module-1-pictorial-01.png ├── module-1-pictorial-02.png ├── module-1-pictorial-03.png ├── module-1-pipeline-json-01.png ├── module-1-pipeline-json-02.png ├── module-1-pipeline-json-03.png ├── module-1-pipeline-json-04.png ├── module-1-storage-01.png ├── module-1-storage-02.png ├── module-1-vai-wb-01.png ├── module-1-vai-wb-mnb-01.png ├── module-1-vai-wb-mnbs-02.png ├── module-1-vai-wb-umnb-01.png ├── module-1-vai-wb-umnb-02.png ├── module-1-vai-wb-umnb-03.png ├── module-2-01.png ├── module-2-02.png ├── module-2-03.png ├── module-2-04.png ├── module-2-05.png ├── module-2-06.png ├── module-2-07.png ├── module-2-08.png ├── module-2-09.png ├── module-2-10.png ├── module-2-11.png ├── module-2-12.png ├── 
module-2-13.png ├── module-2-14.png ├── module-2-15.png ├── module-2-16.png ├── module-2-17.png ├── module-2-18.png ├── module-2-19.png ├── module-2-summary-01.png ├── module-2-summary-02.png ├── module-2-summary-03.png ├── module-2-summary-04.png ├── module-3-01.png ├── module-3-02.png ├── module-3-03.png ├── module-3-04.png ├── module-3-05.png ├── module-3-06.png ├── module-3-07.png ├── module-3-08.png ├── module-3-09.png ├── module-3-10.png ├── module-3-11.png ├── module-3-12.png ├── module-3-13.png ├── module-3-14.png ├── module-3-15.png ├── module-3-16.png ├── module-3-17.png ├── module-3-18.png ├── module-3-19.png ├── module-3-20.png ├── module-3-21.png ├── module-3-22.png ├── module-3-23.png ├── module-3-24.png ├── module-3-25.png ├── module-3-26.png ├── module-3-27.png ├── module-3-28.png ├── module-3-29.png ├── module-3-30.png ├── module-3-31.png ├── module-3-32.png ├── module-3-33.png ├── module-3-34.png ├── module-3-35.png ├── module-3-36.png ├── module-3-37.png ├── module-3-38.png ├── module-4-01.png ├── module-4-02.png ├── module-4-03.png ├── module-4-04.png ├── module-4-05.png ├── module-4-06.png ├── module-4-07.png ├── module-4-08.png ├── module-4-09.png ├── module-4-10.png ├── module-4-100.png ├── module-4-101.png ├── module-4-102.png ├── module-4-11.png ├── module-4-12.png ├── module-4-13.png ├── module-4-14.png ├── module-4-15.png ├── module-4-16.png ├── module-4-17.png ├── module-5-01.png ├── module-5-02.png ├── module-5-03.png ├── module-5-04.png ├── module-5-05.png ├── module-5-06.png ├── module-5-07.png ├── module-5-08.png ├── module-5-09.png ├── module-5-10.png ├── module-5-11.png ├── module-5-12.png ├── module-5-13.png ├── module-5-14.png ├── module-5-15.png ├── module-5-16.png ├── module-5-17.png ├── module-5-18.png ├── module-5-19.png ├── module-5-20.png ├── module-5-21.png ├── module-5-22.png ├── module-5-23.png ├── module-5-24.png ├── module-5-25.png ├── module-5-26.png ├── module-5-27.png ├── module-5-28.png ├── module-5-29.png ├── module-5-30.png ├── module-5-31.png ├── module-5-32a.png ├── module-5-32b.png ├── module-5-33.png ├── module-5-34.png ├── module-5-35.png ├── module-5-36.png ├── module-6-01.png ├── module-6-02.png ├── module-6-03.png ├── module-6-04.png ├── module-6-05.png ├── module-6-06.png ├── module-6-07.png ├── module-6-08.png ├── module-6-09.png ├── module-6-10.png ├── module-6-11.png ├── module-6-12.png ├── module-6-13.png ├── module-6-14.png ├── module-6-15.png ├── module-6-16.png ├── module-6-17.png ├── module-6-18.png ├── module-6-19.png ├── module-6-20.png ├── module-7-01.png ├── module-7-02.png ├── module-7-03.png ├── module-7-04.png ├── module-7-05.png ├── module-7-06.png ├── module-7-07.png ├── module-8-01.png ├── module-8-02.png ├── module-8-03.png ├── module-8-04.png ├── module-8-05.png ├── module-8-06.png ├── module-8-07.png ├── module-8-08.png ├── module-8-09.png ├── module-8-10.png ├── module-8-11.png ├── module-8-12.png ├── module-8-13.png ├── module-8-14.png ├── module-8-15.png ├── module-8-16.png ├── module-8-17.png ├── module-8-18.png └── module-8-19.png └── README.md /00-env-setup/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project_id" { 2 | type = string 3 | description = "The project ID (required)" 4 | } 5 | variable "project_name" { 6 | type = string 7 | description = "The project name in which the demo is deployed" 8 | } 9 | variable "project_number" { 10 | type = string 11 | description = "The project number in which the demo is deployed" 12 | } 13 | variable "gcp_account_name" { 14 |
description = "user performing the demo" 15 | } 16 | variable "org_id" { 17 | description = "Organization ID in which project created" 18 | } 19 | variable "cloud_composer_image_version" { 20 | description = "Version of Cloud Composer 2 image to use" 21 | } 22 | variable "spark_container_image_tag" { 23 | description = "Tag number to assign to container image" 24 | } 25 | variable "gcp_region" { 26 | description = "GCP region" 27 | } 28 | variable "gcp_zone" { 29 | description = "GCP zone" 30 | } 31 | variable "gcp_multi_region" { 32 | description = "GCP multi-region" 33 | } 34 | variable "bq_connector_jar_gcs_uri" { 35 | description = "BQ connector jar to use" 36 | } 37 | variable "cloud_scheduler_time_zone" { 38 | description = "Cloud Scheduler Time Zone e.g. America/Chicago" 39 | } 40 | variable "dataproc_runtime_version" { 41 | description = "Version of Dataproc Serverless Runtime" 42 | } 43 | -------------------------------------------------------------------------------- /00-env-setup/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | required_providers { 4 | google = { 5 | source = "hashicorp/google" 6 | } 7 | } 8 | provider_meta "google" { 9 | module_name = "blueprints/terraform/test/v0.0.1" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /02-scripts/airflow/pipeline.py: -------------------------------------------------------------------------------- 1 | # ====================================================================================== 2 | # ABOUT 3 | # This script orchestrates batch scoring 4 | # ====================================================================================== 5 | 6 | import os 7 | from airflow.models import Variable 8 | from datetime import datetime 9 | from airflow import models 10 | from airflow.providers.google.cloud.operators.dataproc import (DataprocCreateBatchOperator,DataprocGetBatchOperator) 11 | from datetime import datetime 12 | from airflow.utils.dates import days_ago 13 | import string 14 | import random 15 | 16 | # ....................................................... 17 | # Variables 18 | # ....................................................... 
19 | 20 | # {{ 21 | # a) General 22 | randomizerCharLength = 10 23 | randomVal = ''.join(random.choices(string.digits, k = randomizerCharLength)) 24 | airflowDAGName= "customer-churn-prediction" 25 | batchIDPrefix = f"{airflowDAGName}-edo-{randomVal}" 26 | # + 27 | # b) Capture from Airflow variables 28 | region = models.Variable.get("region") 29 | subnet=models.Variable.get("subnet") 30 | phsServer=Variable.get("phs_server") 31 | containerImageUri=Variable.get("container_image_uri") 32 | bqDataset=Variable.get("bq_dataset") 33 | umsaFQN=Variable.get("umsa_fqn") 34 | bqConnectorJarUri=Variable.get("bq_connector_jar_uri") 35 | dataprocRunTimeVersion=Variable.get("dataproc_runtime_version") 36 | # + 37 | # c) For the Spark application 38 | pipelineID = randomVal 39 | projectID = models.Variable.get("project_id") 40 | projectNbr = models.Variable.get("project_nbr") 41 | displayPrintStatements=Variable.get("display_print_statements") 42 | # + 43 | # d) Arguments array 44 | batchScoringArguments = [f"--pipelineID={pipelineID}", \ 45 | f"--projectID={projectID}", \ 46 | f"--projectNbr={projectNbr}", \ 47 | f"--displayPrintStatements={displayPrintStatements}" ] 48 | # + 49 | # e) PySpark script to execute 50 | scoringScript= "gs://s8s_code_bucket-"+projectNbr+"/pyspark/batch_scoring.py" 51 | commonUtilsScript= "gs://s8s_code_bucket-"+projectNbr+"/pyspark/common_utils.py" 52 | # }} 53 | 54 | # ....................................................... 55 | # s8s Spark batch config 56 | # ....................................................... 57 | 58 | s8sSparkBatchConfig = { 59 | "pyspark_batch": { 60 | "main_python_file_uri": scoringScript, 61 | "python_file_uris": [ commonUtilsScript ], 62 | "args": batchScoringArguments, 63 | "jar_file_uris": [ bqConnectorJarUri ] 64 | }, 65 | "runtime_config": { 66 | "container_image": containerImageUri, 67 | "version": dataprocRunTimeVersion 68 | }, 69 | "environment_config":{ 70 | "execution_config":{ 71 | "service_account": umsaFQN, 72 | "subnetwork_uri": subnet 73 | }, 74 | "peripherals_config": { 75 | "spark_history_server_config": { 76 | "dataproc_cluster": f"projects/{projectID}/regions/{region}/clusters/{phsServer}" 77 | } 78 | } 79 | } 80 | } 81 | 82 | 83 | # ....................................................... 84 | # DAG 85 | # ....................................................... 86 | 87 | with models.DAG( 88 | airflowDAGName, 89 | schedule_interval=None, 90 | start_date = days_ago(2), 91 | catchup=False, 92 | ) as scoringDAG: 93 | customerChurnPredictionStep = DataprocCreateBatchOperator( 94 | task_id="Predict-Customer-Churn", 95 | project_id=projectID, 96 | region=region, 97 | batch=s8sSparkBatchConfig, 98 | batch_id=batchIDPrefix 99 | ) 100 | customerChurnPredictionStep 101 | -------------------------------------------------------------------------------- /02-scripts/bash/build-container-image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #........................................................................ 4 | # Purpose: Build custom container image for serverless spark 5 | # Parameters: (1) Docker image tag (2) gs URI of BQ connector jar (3) GCP region 6 | # e.g. ./build-container-image.sh 1.0.0 gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar us-central1 7 | #........................................................................ 
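# (Illustrative addition, not part of the original script) A defensive argument
# check that could sit ahead of the variable assignments below; it relies only on
# standard bash ($#, $0) and mirrors the example invocation in the header above:
#
#   if [ $# -ne 3 ]; then
#     echo "Usage: $0 <docker-image-tag> <bq-connector-jar-gcs-uri> <gcp-region>"
#     exit 1
#   fi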
8 | 9 | # Variables 10 | PROJECT_ID=`gcloud config list --format 'value(core.project)'` 11 | LOCAL_SCRATCH_DIR=~/build 12 | DOCKER_IMAGE_TAG=$1 13 | DOCKER_IMAGE_NM="customer_churn_image" 14 | DOCKER_IMAGE_FQN="gcr.io/$PROJECT_ID/$DOCKER_IMAGE_NM:$DOCKER_IMAGE_TAG" 15 | BQ_CONNECTOR_JAR_URI=$2 16 | GCP_REGION=$3 17 | 18 | # Create local directory 19 | cd ~ 20 | mkdir build 21 | cd build 22 | rm -rf * 23 | echo "Created local directory for the Docker image build" 24 | 25 | # Create Dockerfile in local directory 26 | cd $LOCAL_SCRATCH_DIR 27 | 28 | cat << 'EOF' > Dockerfile 29 | # Debian 11 is recommended. 30 | FROM debian:11-slim 31 | 32 | # Suppress interactive prompts 33 | ENV DEBIAN_FRONTEND=noninteractive 34 | 35 | # (Required) Install utilities required by Spark scripts. 36 | RUN apt update && apt install -y procps tini 37 | 38 | # (Optional) Add extra jars. 49 | ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/ 50 | ENV SPARK_EXTRA_CLASSPATH='/opt/spark/jars/*' 51 | RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}" 52 | COPY spark-bigquery-with-dependencies_2.12-0.22.2.jar "${SPARK_EXTRA_JARS_DIR}" 53 | 54 | # (Optional) Install and configure Miniconda3. 55 | ENV CONDA_HOME=/opt/miniconda3 56 | ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python 57 | ENV PATH=${CONDA_HOME}/bin:${PATH} 58 | COPY Miniconda3-py39_4.10.3-Linux-x86_64.sh . 59 | RUN bash Miniconda3-py39_4.10.3-Linux-x86_64.sh -b -p /opt/miniconda3 \ 60 | && ${CONDA_HOME}/bin/conda config --system --set always_yes True \ 61 | && ${CONDA_HOME}/bin/conda config --system --set auto_update_conda False \ 62 | && ${CONDA_HOME}/bin/conda config --system --prepend channels conda-forge \ 63 | && ${CONDA_HOME}/bin/conda config --system --set channel_priority strict 64 | 65 | # (Optional) Install Conda packages. 66 | # 67 | # The following packages are installed in the default image; it is strongly 68 | # recommended to include all of them. 69 | # 70 | # Use mamba to install packages quickly. 71 | RUN ${CONDA_HOME}/bin/conda install mamba -n base -c conda-forge \ 72 | && ${CONDA_HOME}/bin/mamba install \ 73 | conda \ 74 | cython \ 75 | fastavro \ 76 | fastparquet \ 77 | gcsfs \ 78 | google-cloud-bigquery-storage \ 79 | google-cloud-bigquery[pandas] \ 80 | google-cloud-bigtable \ 81 | google-cloud-container \ 82 | google-cloud-datacatalog \ 83 | google-cloud-dataproc \ 84 | google-cloud-datastore \ 85 | google-cloud-language \ 86 | google-cloud-logging \ 87 | google-cloud-monitoring \ 88 | google-cloud-pubsub \ 89 | google-cloud-redis \ 90 | google-cloud-spanner \ 91 | google-cloud-speech \ 92 | google-cloud-storage \ 93 | google-cloud-texttospeech \ 94 | google-cloud-translate \ 95 | google-cloud-vision \ 96 | koalas \ 97 | matplotlib \ 98 | mleap \ 99 | nltk \ 100 | numba \ 101 | numpy \ 102 | openblas \ 103 | orc \ 104 | pandas \ 105 | pyarrow \ 106 | pysal \ 107 | pytables \ 108 | python \ 109 | regex \ 110 | requests \ 111 | rtree \ 112 | scikit-image \ 113 | scikit-learn \ 114 | scipy \ 115 | seaborn \ 116 | sqlalchemy \ 117 | sympy \ 118 | virtualenv 119 | 120 | 121 | # (Optional) Install R and R libraries.
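# (Added note, illustrative) The R toolchain installed below follows the Dataproc
# Serverless custom-image sample; it is only exercised by SparkR workloads, so for
# this PySpark-only lab it could plausibly be dropped to slim the image.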
122 | RUN apt update \ 123 | && apt install -y gnupg \ 124 | && apt-key adv --no-tty \ 125 | --keyserver "hkp://keyserver.ubuntu.com:80" \ 126 | --recv-keys 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 \ 127 | && echo "deb http://cloud.r-project.org/bin/linux/debian bullseye-cran40/" \ 128 | >/etc/apt/sources.list.d/cran-r.list \ 129 | && apt update \ 130 | && apt install -y \ 131 | libopenblas-base \ 132 | libssl-dev \ 133 | r-base \ 134 | r-base-dev \ 135 | r-recommended \ 136 | r-cran-blob 137 | 138 | ENV R_HOME=/usr/lib/R 139 | 140 | # (Required) Create the 'spark' group/user. 141 | # The GID and UID must be 1099. Home directory is required. 142 | RUN groupadd -g 1099 spark 143 | RUN useradd -u 1099 -g 1099 -d /home/spark -m spark 144 | USER spark 145 | 146 | EOF 147 | 148 | echo "Completed Dockerfile creation" 149 | 150 | # Download dependencies to be baked into image 151 | cd $LOCAL_SCRATCH_DIR 152 | gsutil cp $BQ_CONNECTOR_JAR_URI . 153 | wget -P . https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh 154 | echo "Completed downloading dependencies" 155 | 156 | # Authenticate Docker for gcr.io, the registry the image is pushed to below 157 | gcloud auth configure-docker -q 158 | 159 | # Build image 160 | docker build . --progress=tty -f Dockerfile -t $DOCKER_IMAGE_FQN 161 | echo "Completed docker image build" 162 | 163 | # Push to GCR 164 | docker push $DOCKER_IMAGE_FQN 165 | echo "Completed docker image push to GCR" 166 | -------------------------------------------------------------------------------- /02-scripts/bash/mnbs-exec-post-startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #........................................................................ 4 | # Purpose: Copy existing notebooks to Workbench server Jupyter home dir 5 | # (Managed notebook server) 6 | #........................................................................ 7 | 8 | gsutil cp gs://s8s_notebook_bucket-PROJECT_NBR/pyspark/*.ipynb /home/jupyter/ 9 | chown jupyter:jupyter /home/jupyter/* -------------------------------------------------------------------------------- /02-scripts/cloud-functions/function-source.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/02-scripts/cloud-functions/function-source.zip -------------------------------------------------------------------------------- /02-scripts/cloud-functions/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | from google.cloud import aiplatform as vertex_ai 3 | import functions_framework 4 | import os, random 5 | from os import path 6 | from google.cloud import storage 7 | from urllib.parse import urlparse, urljoin 8 | 9 | 10 | def process_request(request): 11 | """Processes the incoming HTTP request. 12 | 13 | Args: 14 | request (flask.Request): HTTP request object. 15 | 16 | Returns: 17 | The response text or any set of values that can be turned into a Response 18 | object using `make_response 19 | <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`. 20 | """ 21 | 22 | # decode http request payload and translate into JSON object 23 | request_str = request.data.decode('utf-8') 24 | request_json = json.loads(request_str) 25 | 26 | # ........................................ 27 | # Capture and print environment variables 28 | # ........................................
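# (Illustrative addition, not part of the original function) The os.environ.get()
# calls below expect these variables to be attached to the Cloud Function at deploy
# time. A hedged sketch of such a deployment -- the function name and values are
# placeholders, not taken from this repo:
#
#   gcloud functions deploy customer-churn-pipeline-trigger \
#       --runtime python39 --entry-point process_request --trigger-http \
#       --set-env-vars PROJECT_ID=YOUR_PROJECT,GCP_LOCATION=us-central1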
29 | 30 | # a) Pipeline template file in GCS 31 | VAI_PIPELINE_JSON_TEMPLATE_GCS_FILE_FQN = os.environ.get("VAI_PIPELINE_JSON_TEMPLATE_GCS_FILE_FQN") 32 | print("VAI_PIPELINE_JSON_TEMPLATE_GCS_FILE_FQN is {}".format(VAI_PIPELINE_JSON_TEMPLATE_GCS_FILE_FQN)) 33 | 34 | # b) Pipeline execution directory in GCS 35 | VAI_PIPELINE_JSON_EXEC_DIR_URI = os.environ.get("VAI_PIPELINE_JSON_EXEC_DIR_URI") 36 | print("VAI_PIPELINE_JSON_EXEC_DIR_URI is {}".format(VAI_PIPELINE_JSON_EXEC_DIR_URI)) 37 | 38 | # c) Project ID 39 | PROJECT_ID = os.environ.get("PROJECT_ID") 40 | print("PROJECT_ID is {}".format(PROJECT_ID)) 41 | 42 | # d) GCP location 43 | GCP_LOCATION = os.environ.get("GCP_LOCATION") 44 | print("GCP_LOCATION is {}".format(GCP_LOCATION)) 45 | 46 | # e) VAI pipeline root for logs 47 | VAI_PIPELINE_ROOT_LOG_DIR = os.environ.get("VAI_PIPELINE_ROOT_LOG_DIR") 48 | print("VAI_PIPELINE_ROOT_LOG_DIR is {}".format(VAI_PIPELINE_ROOT_LOG_DIR)) 49 | 50 | # f) DATAPROC SERVERLESS RUNTIME VERSION 51 | DATAPROC_RUNTIME_VERSION = os.environ.get("DATAPROC_RUNTIME_VERSION") 52 | print("DATAPROC_RUNTIME_VERSION is {}".format(DATAPROC_RUNTIME_VERSION)) 53 | 54 | # ........................................ 55 | # Create local scratch directory in /tmp 56 | # ........................................ 57 | 58 | LOCAL_SCRATCH_DIR = "/tmp/scratch" 59 | if not os.path.exists(LOCAL_SCRATCH_DIR): 60 | os.makedirs(LOCAL_SCRATCH_DIR) 61 | 62 | # ........................................ 63 | # Variables 64 | # ........................................ 65 | 66 | # a) Generate custom job ID for Vertex AI pipeline run 67 | vaiPipelineExecutionInstanceID = random.randint(1, 10000) 68 | print("VAI_PIPELINE_EXECUTION_INSTANCE_ID is {}".format(vaiPipelineExecutionInstanceID)) 69 | 70 | # b) Customized pipeline JSON filename 71 | pipelineFileName = "pipeline_{}.json".format(vaiPipelineExecutionInstanceID) 72 | print("PIPELINE_FILE_NM is {}".format(pipelineFileName)) 73 | 74 | # c) Local path to customized pipeline JSON 75 | localCustomPipelineJsonFileFQN = LOCAL_SCRATCH_DIR + "/" + pipelineFileName 76 | print("VAI_PIPELINE_JSON_TO_EXECUTE is locally at {}".format(localCustomPipelineJsonFileFQN)) 77 | 78 | # d) Local (download) path for template pipeline JSON 79 | localTemplatePipelineJsonFileFQN = LOCAL_SCRATCH_DIR + "/customer_churn_template.json" 80 | 81 | # e) GCS URI for customized pipeline JSON 82 | PIPELINE_JSON_GCS_URI = VAI_PIPELINE_JSON_EXEC_DIR_URI + "/executions/{}".format(pipelineFileName) 83 | 84 | # ........................................ 85 | # Create custom VAI pipeline JSON 86 | # ........................................ 87 | 88 | # a) Download the template VAI pipeline JSON 89 | downloadVaiPipelineTemplateInGCS(VAI_PIPELINE_JSON_TEMPLATE_GCS_FILE_FQN,localTemplatePipelineJsonFileFQN) 90 | 91 | # b) Create custom VAI pipeline JSON 92 | createCustomVaiPipelineJson(vaiPipelineExecutionInstanceID,localTemplatePipelineJsonFileFQN,localCustomPipelineJsonFileFQN,DATAPROC_RUNTIME_VERSION) 93 | 94 | # c) Push custom VAI pipeline JSON to GCS execution directory 95 | pushLocalFileToGCS(urlparse(VAI_PIPELINE_JSON_EXEC_DIR_URI).netloc, localCustomPipelineJsonFileFQN, "executions/{}".format(pipelineFileName)) 96 | 97 | # ........................................ 98 | # Vertex AI Pipeline execution 99 | # ........................................ 
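# (Added note for clarity) vertex_ai.init() below sets SDK-wide defaults (project,
# location, staging bucket), and PipelineJob points at the customized pipeline JSON
# that was just pushed to GCS. job.submit() returns as soon as the pipeline run is
# accepted; job.run() would instead block until completion, which is a poor fit for
# a short-lived Cloud Function invocation.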
100 | 101 | vertex_ai.init( 102 | project=PROJECT_ID, 103 | location=GCP_LOCATION, 104 | staging_bucket=VAI_PIPELINE_ROOT_LOG_DIR 105 | ) 106 | 107 | job = vertex_ai.PipelineJob( 108 | display_name='customer-churn-prediction-pipeline', 109 | template_path=PIPELINE_JSON_GCS_URI, 110 | pipeline_root=VAI_PIPELINE_ROOT_LOG_DIR, 111 | enable_caching=False 112 | ) 113 | 114 | job.submit() 115 | return "Job submitted" 116 | 117 | #}} End of entry point 118 | 119 | def downloadVaiPipelineTemplateInGCS(gcsFQVaiPipelineTemplateJsonFileUri, fileToDownloadToLocally): 120 | #{{ 121 | googleCloudStorageClient = storage.Client() 122 | with open(fileToDownloadToLocally, 'wb') as fileObject: 123 | googleCloudStorageClient.download_blob_to_file( 124 | gcsFQVaiPipelineTemplateJsonFileUri, fileObject) 125 | 126 | print("Downloaded template to {}".format(fileToDownloadToLocally)) 127 | #}} 128 | 129 | def createCustomVaiPipelineJson(pipelineID, templatePipelineJsonLocalFile, customPipelineJsonLocalFile, dataprocRuntimeVersion): 130 | #{{ 131 | searchTextPipelineID = "YOUR_USER_DEFINED_EXECUTION_ID" 132 | replaceTextPipelineID = str(pipelineID) 133 | searchTextDataprocRuntimeVersion = "YOUR_DATAPROC_RUNTIME_VERSION" 134 | replaceTextDataprocRuntimeVersion = str(dataprocRuntimeVersion) 135 | 136 | with open(templatePipelineJsonLocalFile, 'r') as templateFileHandle: 137 | templateContent = templateFileHandle.read() 138 | customContent = templateContent.replace(searchTextPipelineID, replaceTextPipelineID) 139 | customContent = customContent.replace(searchTextDataprocRuntimeVersion, replaceTextDataprocRuntimeVersion) 140 | 141 | with open(customPipelineJsonLocalFile, 'w') as customFileHandle: 142 | customFileHandle.write(customContent) 143 | 144 | print("Created customPipelineJsonLocalFile at {}".format(customPipelineJsonLocalFile)) 145 | #}} 146 | 147 | def pushLocalFileToGCS(executionPipelineGCSDirUri, customPipelineJsonLocalFilePath, customPipelineFileName): 148 | #{{ 149 | googleCloudStorageClient = storage.Client() 150 | googleCloudStorageBucket = googleCloudStorageClient.bucket(executionPipelineGCSDirUri) 151 | blob = googleCloudStorageBucket.blob(customPipelineFileName) 152 | blob.upload_from_filename(customPipelineJsonLocalFilePath) 153 | #}} 154 | -------------------------------------------------------------------------------- /02-scripts/cloud-functions/requirements.txt: -------------------------------------------------------------------------------- 1 | functions-framework==3.* 2 | google-cloud-storage == 2.5.0 3 | google-api-python-client>=1.7.8,<2 4 | google-cloud-aiplatform[pipelines] -------------------------------------------------------------------------------- /02-scripts/pyspark/batch_scoring.py: -------------------------------------------------------------------------------- 1 | # ............................................................ 2 | # Batch Scoring 3 | # ............................................................ 4 | # This script does batch scoring. 5 | # 1. It loads the model in GCS 6 | # 2. Parses, transforms data to be scored 7 | # 3. Uses the model to predict 8 | # 4. Persists predictions to BigQuery 9 | # ............................................................ 
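# (Illustrative addition, not part of the original script) This script is meant to
# run as a Dataproc Serverless batch -- the Airflow DAG in 02-scripts/airflow/pipeline.py
# submits it via DataprocCreateBatchOperator. A hedged sketch of an equivalent manual
# submission; the bucket names follow this repo's conventions, everything else is a
# placeholder:
#
#   gcloud dataproc batches submit pyspark \
#       gs://s8s_code_bucket-YOUR_PROJECT_NBR/pyspark/batch_scoring.py \
#       --region=YOUR_REGION \
#       --subnet=YOUR_SUBNET \
#       --py-files=gs://s8s_code_bucket-YOUR_PROJECT_NBR/pyspark/common_utils.py \
#       -- --pipelineID=1234 --projectID=YOUR_PROJECT_ID \
#          --projectNbr=YOUR_PROJECT_NBR --displayPrintStatements=True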
10 | 11 | from pyspark.sql import SparkSession 12 | from pyspark.sql.functions import * 13 | from pyspark.ml import PipelineModel 14 | import common_utils 15 | import sys, logging, argparse 16 | from datetime import datetime 17 | 18 | 19 | 20 | def fnParseArguments(): 21 | # {{ Start 22 | """ 23 | Purpose: 24 | Parse arguments received by script 25 | Returns: 26 | args 27 | """ 28 | argsParser = argparse.ArgumentParser() 29 | argsParser.add_argument( 30 | '--pipelineID', 31 | help='Unique ID for the pipeline stages for traceability', 32 | type=str, 33 | required=True) 34 | argsParser.add_argument( 35 | '--projectNbr', 36 | help='The project number', 37 | type=str, 38 | required=True) 39 | argsParser.add_argument( 40 | '--projectID', 41 | help='The project id', 42 | type=str, 43 | required=True) 44 | argsParser.add_argument( 45 | '--displayPrintStatements', 46 | help='Boolean - print to screen or not', 47 | type=bool, 48 | required=True) 49 | return argsParser.parse_args() 50 | # }} End fnParseArguments() 51 | 52 | def fnMain(logger, args): 53 | # {{ Start main 54 | 55 | # 1a. Arguments 56 | pipelineID = args.pipelineID 57 | projectNbr = args.projectNbr 58 | projectID = args.projectID 59 | displayPrintStatements = args.displayPrintStatements 60 | 61 | # 1b. Variables 62 | appBaseName = "customer-churn-model" 63 | appNameSuffix = "batch-scoring" 64 | appName = f"{appBaseName}-{appNameSuffix}" 65 | modelBaseNm = appBaseName 66 | bqDatasetNm = f"{projectID}.customer_churn_ds" 67 | scoreDatasetBucketFQN = f"gs://s8s_data_bucket-{projectNbr}/customer_churn_score_data.csv" 68 | bigQueryOutputTableFQN = f"{bqDatasetNm}.batch_predictions" 69 | bigQueryModelAssetTrackerTableFQN = f"{bqDatasetNm}.model_asset_tracker" 70 | scratchBucketUri = f"s8s-spark-bucket-{projectNbr}/{appBaseName}/pipelineId-{pipelineID}/{appNameSuffix}/" 71 | pipelineExecutionDt = datetime.now().strftime("%Y%m%d%H%M%S") 72 | 73 | # 1c. Display input and output 74 | if displayPrintStatements: 75 | print("Starting batch_scoring for Customer Churn Predictions") 76 | print(".....................................................") 77 | print(f"The datetime now is - {pipelineExecutionDt}") 78 | print(" ") 79 | print("INPUT-") 80 | print(f"....pipelineID={pipelineID}") 81 | print(f"....projectNbr={projectNbr}") 82 | print(f"....projectID={projectID}") 83 | print(f"....displayPrintStatements={displayPrintStatements}") 84 | print(" ") 85 | 86 | try: 87 | # 2. Spark Session creation 88 | print('....Initializing spark & spark configs') 89 | spark = SparkSession.builder.appName(appName).getOrCreate() 90 | 91 | # Spark configuration setting for writes to BigQuery 92 | spark.conf.set("parentProject", projectID) 93 | spark.conf.set("temporaryGcsBucket", scratchBucketUri) 94 | 95 | # 3. Read data to be scored from GCS 96 | print('....Read batch scoring input and profile') 97 | scoreRawDF = spark.read.options(inferSchema = True, header= True).csv(scoreDatasetBucketFQN) 98 | if displayPrintStatements: 99 | print(scoreRawDF.count()) 100 | 101 | # 4. Display data, display summary stats 102 | if displayPrintStatements: 103 | scoreRawDF.show(2) 104 | scoreRawDF.describe().show() 105 | 106 | # 5. Replace spaces with null (None) values in the TotalCharges and MonthlyCharges columns 107 | print('....Data pre-process: fnReplaceSpaceWithNone in TotalCharges and MonthlyCharges') 108 | spaceReplacedDF = common_utils.fnReplaceSpaceWithNone(scoreRawDF) 109 | if displayPrintStatements: 110 | print(spaceReplacedDF.count()) 111 | 112 | # 6.
Replace non-numeric values in the TotalCharges and MonthlyCharges columns 113 | print('....Data pre-process: ReplaceNotANumberWithNone in TotalCharges and MonthlyCharges') 114 | nanReplacedDF = common_utils.fnReplaceNotANumberWithNone(spaceReplacedDF) 115 | if displayPrintStatements: 116 | print(nanReplacedDF.count()) 117 | 118 | # 7. Drop rows with null in columns 119 | print('....Data pre-process: Drop rows with none') 120 | nullDroppedDF = nanReplacedDF.na.drop() 121 | 122 | if displayPrintStatements: 123 | print(nullDroppedDF.count()) 124 | 125 | # 8. Replace 'No internet service' across columns to 'No' 126 | print('....Data pre-process: Replace -No internet service- across columns with -No-') 127 | partiallyProcessedDF = common_utils.fnReplaceWithNoForInternetService(nullDroppedDF) 128 | if displayPrintStatements: 129 | print(partiallyProcessedDF.count()) 130 | 131 | # 9. Add a bin/bucket category for tenure range using Spark SQL and write transformed to dataframe 132 | print('....Data pre-process: Add a bin/bucket category for tenure range') 133 | scoreTargetDF = common_utils.fnAddBinForTenure(partiallyProcessedDF, True, spark) 134 | if displayPrintStatements: 135 | print(scoreTargetDF.count()) 136 | scoreTargetDF.show(2) 137 | 138 | # 10. Format dataframe names for column name format consistency 139 | scorableDF = scoreTargetDF.select("customerID", "gender", "SeniorCitizen", "Partner", "Dependents", "tenure", "Tenure_Group", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "MonthlyCharges", "TotalCharges") \ 140 | .toDF("customer_id", "gender", "senior_citizen", "partner", "dependents", "tenure", "tenure_group", "phone_service", "multiple_lines", "internet_service", "online_security", "online_backup", "device_protection", "tech_support", "streaming_tv", "streaming_movies", "contract", "paperless_billing", "payment_method", "monthly_charges", "total_charges") 141 | 142 | if displayPrintStatements: 143 | print(scorableDF.count()) 144 | scorableDF.show(2) 145 | 146 | # 11a. Determine the version of the model available in the Asset Tracker table 147 | modelAssetSpecsDF=spark.read \ 148 | .format('bigquery') \ 149 | .load(bigQueryModelAssetTrackerTableFQN) 150 | 151 | modelVersion=modelAssetSpecsDF.first()["model_version"] 152 | modelGcsUriFromAssetTracker=modelAssetSpecsDF.first()["model_gcs_uri"] 153 | print(f"The model version is: {modelVersion}") 154 | print(f"The model GCS URI is: {modelGcsUriFromAssetTracker}") 155 | 156 | # 11b. Load the pre-trained, persisted model in GCS 157 | print(f'....Scoring: Load model out of bucket at {modelGcsUriFromAssetTracker} into memory') 158 | model = PipelineModel.load(f"{modelGcsUriFromAssetTracker}/bestModel/") 159 | 160 | # 12. Batch scoring 161 | print('....Scoring: Execute model.transform') 162 | batchScoreResultsDF = model.transform(scorableDF) \ 163 | .withColumn("model_version", lit(modelVersion).cast("string")) \ 164 | .withColumn("pipeline_id", lit(pipelineID).cast("string")) \ 165 | .withColumn("pipeline_execution_dt", lit(pipelineExecutionDt)) 166 | 167 | if displayPrintStatements: 168 | batchScoreResultsDF.show(2) 169 | 170 | # 13.
Persist to BigQuery 171 | print('....Persisting: Batch scoring results to BigQuery') 172 | batchScoreResultsDF.select("customer_id", "gender", "senior_citizen", "partner", "dependents", "tenure", "tenure_group", "phone_service", "multiple_lines", "internet_service", "online_security", "online_backup", "device_protection", "tech_support", "streaming_tv", "streaming_movies", "contract", "paperless_billing", "payment_method", "monthly_charges", "total_charges","prediction","model_version","pipeline_id","pipeline_execution_dt") \ 173 | .write.format('bigquery') \ 174 | .mode("append")\ 175 | .option('table', bigQueryOutputTableFQN) \ 176 | .save() 177 | 178 | print("VALIDATE RESULTS AT-") 179 | print(f"....BigQuery Table={bigQueryOutputTableFQN}") 180 | print(f"SELECT * FROM {bigQueryOutputTableFQN} WHERE model_version='{modelVersion}' AND pipeline_id='{pipelineID}' AND pipeline_execution_dt='{pipelineExecutionDt}' LIMIT 10" ) 181 | 182 | except RuntimeError as coreError: 183 | logger.error(coreError) 184 | else: 185 | logger.info('Successfully completed batch scoring!') 186 | # }} End fn_main() 187 | 188 | def fnConfigureLogger(): 189 | # {{ Start 190 | """ 191 | Purpose: 192 | Configure a logger for the script 193 | Returns: 194 | Logger object 195 | """ 196 | logFormatter = logging.Formatter('%(asctime)s - %(filename)s - %(levelname)s - %(message)s') 197 | logger = logging.getLogger("data_engineering") 198 | logger.setLevel(logging.INFO) 199 | logger.propagate = False 200 | logStreamHandler = logging.StreamHandler(sys.stdout) 201 | logStreamHandler.setFormatter(logFormatter) 202 | logger.addHandler(logStreamHandler) 203 | return logger 204 | # }} End fn_configureLogger() 205 | 206 | if __name__ == "__main__": 207 | arguments = fnParseArguments() 208 | logger = fnConfigureLogger() 209 | fnMain(logger, arguments) -------------------------------------------------------------------------------- /02-scripts/pyspark/preprocessing.py: -------------------------------------------------------------------------------- 1 | # ............................................................ 2 | # Preprocessing 3 | # ............................................................ 4 | # This script performs data preprocessing on raw data in GCS 5 | # and persists to BigQuery 6 | # ............................................................ 7 | 8 | import sys,logging,argparse 9 | import pyspark 10 | from pyspark.sql import SparkSession 11 | from pyspark.sql.functions import * 12 | from datetime import datetime 13 | import common_utils 14 | 15 | 16 | def fnParseArguments(): 17 | # {{ Start 18 | """ 19 | Purpose: 20 | Parse arguments received by script 21 | Returns: 22 | args 23 | """ 24 | argsParser = argparse.ArgumentParser() 25 | argsParser.add_argument( 26 | '--pipelineID', 27 | help='Unique ID for the pipeline stages for traceability', 28 | type=str, 29 | required=True) 30 | argsParser.add_argument( 31 | '--projectNbr', 32 | help='The project number', 33 | type=str, 34 | required=True) 35 | argsParser.add_argument( 36 | '--projectID', 37 | help='The project id', 38 | type=str, 39 | required=True) 40 | argsParser.add_argument( 41 | '--displayPrintStatements', 42 | help='Boolean - print to screen or not', 43 | type=bool, 44 | required=True) 45 | return argsParser.parse_args() 46 | # }} End fnParseArguments() 47 | 48 | def fnMain(logger, args): 49 | # {{ Start main 50 | 51 | # 1. 
Capture Spark application input 52 | pipelineID = args.pipelineID 53 | projectNbr = args.projectNbr 54 | projectID = args.projectID 55 | displayPrintStatements = args.displayPrintStatements 56 | 57 | # 1b. Variables 58 | bqDatasetNm = f"{projectID}.customer_churn_ds" 59 | appBaseName = "customer-churn-model" 60 | appNameSuffix = "preprocessing" 61 | appName = f"{appBaseName}-{appNameSuffix}" 62 | scratchBucketUri = f"s8s-spark-bucket-{projectNbr}/{appBaseName}/pipelineId-{pipelineID}/{appNameSuffix}" 63 | sourceBucketUri = f"gs://s8s_data_bucket-{projectNbr}/customer_churn_train_data.csv" 64 | bigQueryTargetTableFQN = f"{bqDatasetNm}.training_data" 65 | pipelineExecutionDt = datetime.now().strftime("%Y%m%d%H%M%S") 66 | 67 | # 1c. Display input and output 68 | if displayPrintStatements: 69 | logger.info("Starting preprocessing for the *Customer Churn* experiment") 70 | logger.info(".....................................................") 71 | logger.info(f"The datetime now is - {pipelineExecutionDt}") 72 | logger.info(" ") 73 | logger.info("INPUT PARAMETERS-") 74 | logger.info(f"....pipelineID={pipelineID}") 75 | logger.info(f"....projectID={projectID}") 76 | logger.info(f"....projectNbr={projectNbr}") 77 | logger.info(f"....displayPrintStatements={displayPrintStatements}") 78 | logger.info(" ") 79 | logger.info("EXPECTED SETUP-") 80 | logger.info(f"....BQ Dataset={bqDatasetNm}") 81 | logger.info(f"....Source Data={sourceBucketUri}") 82 | logger.info(f"....Scratch Bucket for BQ connector=gs://s8s-spark-bucket-{projectNbr}") 83 | logger.info("OUTPUT-") 84 | logger.info(f"....BigQuery Table={bigQueryTargetTableFQN}") 85 | logger.info(f"....Sample query-") 86 | logger.info(f"....SELECT * FROM {bigQueryTargetTableFQN} WHERE pipeline_id='{pipelineID}' LIMIT 10" ) 87 | 88 | try: 89 | # 2. Spark Session creation 90 | logger.info('....Initializing spark & spark configs') 91 | spark = SparkSession.builder.appName(appName).getOrCreate() 92 | 93 | # Spark configuration setting for writes to BigQuery 94 | spark.conf.set("parentProject", projectID) 95 | spark.conf.set("temporaryGcsBucket", scratchBucketUri) 96 | 97 | # 3. Read raw data in GCS into a Spark Dataframe 98 | logger.info('....Read source data') 99 | rawChurnDF = spark.read.options(inferSchema = True, header= True).csv(sourceBucketUri) 100 | 101 | # 4. View the data 102 | if displayPrintStatements: 103 | logger.info(rawChurnDF.count()) 104 | rawChurnDF.show(2) 105 | 106 | # 5. Profile the data 107 | if displayPrintStatements: 108 | rawChurnDF.describe().show() 109 | 110 | # 6. Check for spaces, nulls in monthly & total charges 111 | logger.info('....Exploratory Data Analysis') 112 | if displayPrintStatements: 113 | rawChurnDF.createOrReplaceTempView("base_customer_churn") 114 | spark.sql("select count(*) from base_customer_churn where MonthlyCharges is null or MonthlyCharges=' '").show(5) 115 | spark.sql("select count(*) from base_customer_churn where TotalCharges is null or TotalCharges=' '").show(5) 116 | 117 | # 7. Replace spaces with null (None) values in the TotalCharges and MonthlyCharges columns 118 | logger.info('....Replace spaces with None') 119 | spaceReplacedDF = common_utils.fnReplaceSpaceWithNone(rawChurnDF) 120 | if displayPrintStatements: 121 | logger.info(spaceReplacedDF.count()) 122 | 123 | # 8.
Replace non-numeric values in the TotalCharges and MonthlyCharges columns 124 | logger.info('....Replace non-numeric values in numeric columns with null') 125 | nanReplacedDF = common_utils.fnReplaceNotANumberWithNone(spaceReplacedDF) 126 | if displayPrintStatements: 127 | logger.info(nanReplacedDF.count()) 128 | 129 | # 9. Drop rows with null in columns 130 | logger.info('....Drop nulls') 131 | nullDroppedDF = nanReplacedDF.na.drop() 132 | if displayPrintStatements: 133 | logger.info(nullDroppedDF.count()) 134 | 135 | # 10. Replace 'No internet service' across columns to 'No' 136 | logger.info('....Replace -No internet service- across columns with -No-') 137 | partiallyProcessedDF = common_utils.fnReplaceWithNoForInternetService(nullDroppedDF) 138 | if displayPrintStatements: 139 | logger.info(partiallyProcessedDF.count()) 140 | 141 | # 11. Add a bin/bucket category for tenure range using Spark SQL and write transformed to dataframe 142 | logger.info('....Add a bin for tenure') 143 | modelTrainingReadyDF = common_utils.fnAddBinForTenure(partiallyProcessedDF, False, spark) 144 | if displayPrintStatements: 145 | logger.info(modelTrainingReadyDF.count()) 146 | 147 | # 12. Run summary statistics 148 | if displayPrintStatements: 149 | modelTrainingReadyDF.describe().show() 150 | 151 | # 13. Print schema 152 | modelTrainingReadyDF.printSchema() 153 | 154 | # 14. Format column names for consistency (title case to DB style & lowercase) 155 | logger.info('....Format column names for consistency') 156 | persistDF = modelTrainingReadyDF.select("customerID", "gender", "SeniorCitizen", "Partner", "Dependents", "tenure", "Tenure_Group", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "MonthlyCharges", "TotalCharges","Churn") \ 157 | .toDF("customer_id", "gender", "senior_citizen", "partner", "dependents", "tenure", "tenure_group", "phone_service", "multiple_lines", "internet_service", "online_security", "online_backup", "device_protection", "tech_support", "streaming_tv", "streaming_movies", "contract", "paperless_billing", "payment_method", "monthly_charges", "total_charges","churn") \ 158 | .withColumn("pipeline_id", lit(pipelineID).cast("string")) \ 159 | .withColumn("pipeline_execution_dt", lit(pipelineExecutionDt)) 160 | 161 | persistDF.printSchema() 162 | 163 | # 15.
Persist training dataset to a table in BQ with the pipeline ID and execution date for traceability 164 | logger.info('....Persist to BQ') 165 | persistDF.write.format('bigquery') \ 166 | .mode("overwrite")\ 167 | .option('table', bigQueryTargetTableFQN) \ 168 | .save() 169 | 170 | except RuntimeError as coreError: 171 | logger.error(coreError) 172 | else: 173 | logger.info('Successfully completed preprocessing!') 174 | # }} End fnMain() 175 | 176 | def fnConfigureLogger(): 177 | # {{ Start 178 | """ 179 | Purpose: 180 | Configure a logger for the script 181 | Returns: 182 | Logger object 183 | """ 184 | logFormatter = logging.Formatter('%(asctime)s - %(filename)s - %(levelname)s - %(message)s') 185 | logger = logging.getLogger("data_engineering") 186 | logger.setLevel(logging.INFO) 187 | logger.propagate = False 188 | logStreamHandler = logging.StreamHandler(sys.stdout) 189 | logStreamHandler.setFormatter(logFormatter) 190 | logger.addHandler(logStreamHandler) 191 | return logger 192 | # }} End fnConfigureLogger() 193 | 194 | if __name__ == "__main__": 195 | arguments = fnParseArguments() 196 | logger = fnConfigureLogger() 197 | fnMain(logger, arguments) -------------------------------------------------------------------------------- /03-notebooks/pyspark/preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "inputWidgets": {}, 8 | "nuid": "e8aeb0e7-0c23-471d-ba1a-d3d733a118e0", 9 | "showTitle": false, 10 | "title": "" 11 | } 12 | }, 13 | "source": [ 14 | "### Preprocessing\n", 15 | "This module performs data transformation in preparation for the customer churn model training.\n", 16 | "\n", 17 | "1. It reads raw data in CSV from GCS\n", 18 | "2. Performs some basic transformations and\n", 19 | "3. Persists to BigQuery" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "spark" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "application/vnd.databricks.v1+cell": { 36 | "inputWidgets": {}, 37 | "nuid": "8023dc7c-ec1c-41a7-a049-c8e2a44a5beb", 38 | "showTitle": false, 39 | "title": "" 40 | } 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "import sys\n", 45 | "import pyspark\n", 46 | "from pyspark.sql import SparkSession\n", 47 | "from pyspark.sql.functions import *\n", 48 | "from datetime import datetime\n", 49 | "import random" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# 1a. Arguments\n", 59 | "pipelineID = random.randint(1, 10000)\n", 60 | "projectNbr = \"YOUR_PROJECT_NBR\"\n", 61 | "projectID = \"YOUR_PROJECT_ID\"\n", 62 | "displayPrintStatements = True" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# 1b. 
Variables \n", 72 | "bqDatasetNm = f\"{projectID}.customer_churn_ds\"\n", 73 | "appBaseName = \"customer-churn-model\"\n", 74 | "appNameSuffix = \"preprocessing\"\n", 75 | "appName = f\"{appBaseName}-{appNameSuffix}\"\n", 76 | "scratchBucketUri = f\"s8s-spark-bucket-{projectNbr}/{appBaseName}/pipelineId-{pipelineID}/{appNameSuffix}\"\n", 77 | "sourceBucketUri = f\"gs://s8s_data_bucket-{projectNbr}/customer_churn_train_data.csv\"\n", 78 | "bigQueryTargetTableFQN = f\"{bqDatasetNm}.training_data\"\n", 79 | "pipelineExecutionDt = datetime.now().strftime(\"%Y%m%d%H%M%S\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# 1c. Display input and output\n", 89 | "if displayPrintStatements:\n", 90 | " print(\"Starting preprocessing for the *Customer Churn* experiment\")\n", 91 | " print(\".....................................................\")\n", 92 | " print(f\"The datetime now is - {pipelineExecutionDt}\")\n", 93 | " print(\" \")\n", 94 | " print(\"INPUT PARAMETERS-\")\n", 95 | " print(f\"....pipelineID={pipelineID}\")\n", 96 | " print(f\"....projectID={projectID}\")\n", 97 | " print(f\"....projectNbr={projectNbr}\")\n", 98 | " print(f\"....displayPrintStatements={displayPrintStatements}\")\n", 99 | " print(\" \")\n", 100 | " print(\"EXPECTED SETUP-\") \n", 101 | " print(f\"....BQ Dataset={bqDatasetNm}\")\n", 102 | " print(f\"....Source Data={sourceBucketUri}\")\n", 103 | " print(f\"....Scratch Bucket for BQ connector=gs://s8s-spark-bucket-{projectNbr}\") \n", 104 | " print(\"OUTPUT-\")\n", 105 | " print(f\"....BigQuery Table={bigQueryTargetTableFQN}\")\n", 106 | " print(f\"....Sample query-\")\n", 107 | " print(f\"....SELECT * FROM {bigQueryTargetTableFQN} WHERE pipeline_id='{pipelineID}' LIMIT 10\" )\n", 108 | " " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# 2. Spark Session creation\n", 118 | "print('....Initializing spark & spark configs')\n", 119 | "spark = SparkSession.builder.appName(appName).getOrCreate()\n", 120 | "\n", 121 | "# Spark configuration setting for writes to BigQuery\n", 122 | "spark.conf.set(\"parentProject\", projectID)\n", 123 | "spark.conf.set(\"temporaryGcsBucket\", scratchBucketUri)\n", 124 | "\n", 125 | "# Add Python modules\n", 126 | "sc.addPyFile(f\"gs://s8s_code_bucket-{projectNbr}/pyspark/common_utils.py\")\n", 127 | "import common_utils" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# 3. Read raw data in GCS into a Spark Dataframe\n", 137 | "print('....Read source data')\n", 138 | "rawChurnDF = spark.read.options(inferSchema = True, header= True).csv(sourceBucketUri)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# 4. View the data\n", 148 | "if displayPrintStatements:\n", 149 | " print(rawChurnDF.count())\n", 150 | " rawChurnDF.show(2)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# 5. Profile the data\n", 160 | "if displayPrintStatements:\n", 161 | " rawChurnDF.describe().show()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# 6. 
Check for spaces, nulls in monthly & total charges\n", 171 | "print('....Exploratory Data Analysis')\n", 172 | "if displayPrintStatements:\n", 173 | "    rawChurnDF.createOrReplaceTempView(\"base_customer_churn\")\n", 174 | "    spark.sql(\"select count(*) from base_customer_churn where MonthlyCharges is null or MonthlyCharges=' '\").show(5)\n", 175 | "    spark.sql(\"select count(*) from base_customer_churn where TotalCharges is null or TotalCharges=' '\").show(5)\n" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# 7. Replace spaces with null (None) values in the TotalCharges and MonthlyCharges columns\n", 185 | "print('....Replace spaces with None')\n", 186 | "spaceReplacedDF = common_utils.fnReplaceSpaceWithNone(rawChurnDF)\n", 187 | "if displayPrintStatements:\n", 188 | "    print(spaceReplacedDF.count())" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# 8. Replace non-numeric values in the TotalCharges and MonthlyCharges columns\n", 198 | "print('....Replace non-numeric values in numeric columns with null')\n", 199 | "nanReplacedDF = common_utils.fnReplaceNotANumberWithNone(spaceReplacedDF)\n", 200 | "if displayPrintStatements:\n", 201 | "    print(nanReplacedDF.count())" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# 9. Drop rows with null in columns\n", 211 | "print('....Drop nulls')\n", 212 | "nullDroppedDF = nanReplacedDF.na.drop()\n", 213 | "if displayPrintStatements:\n", 214 | "    print(nullDroppedDF.count())" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# 10. Replace 'No internet service' across columns to 'No'\n", 224 | "print('....Replace -No internet service- across columns with -No-')\n", 225 | "partiallyProcessedDF = common_utils.fnReplaceWithNoForInternetService(nullDroppedDF)\n", 226 | "if displayPrintStatements:\n", 227 | "    print(partiallyProcessedDF.count())" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# 11. Add a bin/bucket category for tenure range using Spark SQL and write transformed to dataframe\n", 237 | "print('....Add a bin for tenure')\n", 238 | "modelTrainingReadyDF = common_utils.fnAddBinForTenure(partiallyProcessedDF, False, spark)\n", 239 | "if displayPrintStatements:\n", 240 | "    print(modelTrainingReadyDF.count())" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# 12. Run summary statistics\n", 250 | "if displayPrintStatements:\n", 251 | "    modelTrainingReadyDF.describe().show()" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "# 13. Print schema\n", 261 | "modelTrainingReadyDF.printSchema()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "# 14.
Format column names for consistency (title case to DB style & lowercase)\n", 271 | "print('....Format column names for consistency')\n", 272 | "persistDF = modelTrainingReadyDF.select(\"customerID\", \"gender\", \"SeniorCitizen\", \"Partner\", \"Dependents\", \"tenure\", \"Tenure_Group\", \"PhoneService\", \"MultipleLines\", \"InternetService\", \"OnlineSecurity\", \"OnlineBackup\", \"DeviceProtection\", \"TechSupport\", \"StreamingTV\", \"StreamingMovies\", \"Contract\", \"PaperlessBilling\", \"PaymentMethod\", \"MonthlyCharges\", \"TotalCharges\",\"Churn\") \\\n", 273 | " .toDF(\"customer_id\", \"gender\", \"senior_citizen\", \"partner\", \"dependents\", \"tenure\", \"tenure_group\", \"phone_service\", \"multiple_lines\", \"internet_service\", \"online_security\", \"online_backup\", \"device_protection\", \"tech_support\", \"streaming_tv\", \"streaming_movies\", \"contract\", \"paperless_billing\", \"payment_method\", \"monthly_charges\", \"total_charges\",\"churn\") \\\n", 274 | " .withColumn(\"pipeline_id\", lit(pipelineID).cast(\"string\")) \\\n", 275 | " .withColumn(\"pipeline_execution_dt\", lit(pipelineExecutionDt)) \n", 276 | "\n", 277 | "persistDF.printSchema()\n" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "# 15. Persist training dataset to a table in BQ with the pipeline ID and execution date for traceability\n", 287 | "print('....Persist to BQ') \n", 288 | "persistDF.write.format('bigquery') \\\n", 289 | ".mode(\"append\")\\\n", 290 | ".option('table', bigQueryTargetTableFQN) \\\n", 291 | ".save()" 292 | ] 293 | } 294 | ], 295 | "metadata": { 296 | "application/vnd.databricks.v1+notebook": { 297 | "dashboards": [], 298 | "language": "python", 299 | "notebookMetadata": { 300 | "pythonIndentUnit": 4 301 | }, 302 | "notebookName": "01-data-engineering", 303 | "notebookOrigID": 1914343434663113, 304 | "widgets": {} 305 | }, 306 | "kernelspec": { 307 | "display_name": "", 308 | "name": "" 309 | }, 310 | "language_info": { 311 | "codemirror_mode": { 312 | "name": "ipython", 313 | "version": 3 314 | }, 315 | "file_extension": ".py", 316 | "mimetype": "text/x-python", 317 | "name": "python", 318 | "nbconvert_exporter": "python", 319 | "pygments_lexer": "ipython3", 320 | "version": "3.9.13" 321 | }, 322 | "serverless_spark": 
"{\"name\":\"projects/s8s-spark-ml-mlops/locations/us-central1/sessions/agni-6\",\"uuid\":\"35fda7e3-be7b-4913-99c5-83e97b677386\",\"createTime\":\"2022-08-04T02:37:17.836903Z\",\"jupyterSession\":{},\"spark\":{},\"runtimeInfo\":{},\"state\":\"ACTIVE\",\"stateTime\":\"2022-08-04T02:38:37.084371Z\",\"creator\":\"s8s-lab-sa@s8s-spark-ml-mlops.iam.gserviceaccount.com\",\"runtimeConfig\":{\"containerImage\":\"gcr.io/s8s-spark-ml-mlops/dataproc_serverless_custom_runtime:1.0.3\",\"properties\":{\"spark:spark.executor.instances\":\"2\",\"spark:spark.driver.cores\":\"4\",\"spark:spark.executor.cores\":\"4\",\"spark:spark.eventLog.dir\":\"gs://s8s-sphs-974925525028/35fda7e3-be7b-4913-99c5-83e97b677386/spark-job-history\"}},\"environmentConfig\":{\"executionConfig\":{\"serviceAccount\":\"s8s-lab-sa@s8s-spark-ml-mlops.iam.gserviceaccount.com\",\"subnetworkUri\":\"https://www.googleapis.com/compute/v1/projects/s8s-spark-ml-mlops/regions/us-central1/subnetworks/spark-snet\"},\"peripheralsConfig\":{\"sparkHistoryServerConfig\":{\"dataprocCluster\":\"projects/s8s-spark-ml-mlops/regions/us-central1/clusters/s8s-sphs-974925525028\"}}},\"stateHistory\":[{\"state\":\"CREATING\",\"stateStartTime\":\"2022-08-04T02:37:17.836903Z\"}]}", 323 | "serverless_spark_kernel_name": "remote-bc514a4a91cec988ad3c15a7-pyspark" 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 4 327 | } 328 | -------------------------------------------------------------------------------- /03-notebooks/pyspark/vertex_scoring_preprocessor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "463506b1-7a4f-49ac-a131-0b5d7561164e", 6 | "metadata": {}, 7 | "source": [ 8 | "# Preprocessor for scoring from Vertex AI Serving\n", 9 | "\n", 10 | "This script does preprocessing in advance of batch scoring.\n", 11 | "1. It parses, transforms data in GCS to be scored\n", 12 | "2. Persists the transfomed data to BigQuery" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "edbe3069-3d05-4e94-8954-1997d6511e3f", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "spark" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "7a4fc9b1-36b2-4fd8-9765-7ad15b3d02a0", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from pyspark.sql import SparkSession\n", 33 | "from pyspark.sql.functions import *\n", 34 | "from pyspark.ml import PipelineModel\n", 35 | "import sys\n", 36 | "from datetime import datetime\n", 37 | "import random" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "684b8dbc-9ea5-4391-870e-cb32f406e7ed", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# 1a. Arguments\n", 48 | "pipelineID = random.randint(1, 10000)\n", 49 | "projectNbr = \"YOUR_PROJECT_NBR\"\n", 50 | "projectID = \"YOUR_PROJECT_ID\"\n", 51 | "displayPrintStatements = True" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "c91b4bc7-4f08-4411-9b1f-540b34d7be39", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# 1b. 
Variables \n", 62 | "appBaseName = \"customer-churn-model\"\n", 63 | "appNameSuffix = \"vertex_serving_preprocessor\"\n", 64 | "appName = f\"{appBaseName}-{appNameSuffix}\"\n", 65 | "modelBaseNm = appBaseName\n", 66 | "bqDatasetNm = f\"{projectID}.customer_churn_ds\"\n", 67 | "scoreDatasetBucketFQN = f\"gs://s8s_data_bucket-{projectNbr}/customer_churn_score_data.csv\"\n", 68 | "bigQueryOutputTableFQN = f\"{bqDatasetNm}.batch_scoring_candidates\"\n", 69 | "bigQueryModelAssetTrackerTableFQN = f\"{bqDatasetNm}.model_asset_tracker\"\n", 70 | "scratchBucketUri = f\"s8s-spark-bucket-{projectNbr}/{appBaseName}/pipelineId-{pipelineID}/{appNameSuffix}/\"\n", 71 | "pipelineExecutionDt = datetime.now().strftime(\"%Y%m%d%H%M%S\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "67e57cbb-7585-4118-8a90-a6844182374b", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# 1c. Display input and output\n", 82 | "if displayPrintStatements:\n", 83 | " print(\"Preprocessing for Customer Churn Predictions from Vertex AI Serving\")\n", 84 | " print(\".....................................................\")\n", 85 | " print(f\"The datetime now is - {pipelineExecutionDt}\")\n", 86 | " print(\" \")\n", 87 | " print(\"INPUT-\")\n", 88 | " print(f\"....pipelineID={pipelineID}\")\n", 89 | " print(f\"....projectNbr={projectNbr}\")\n", 90 | " print(f\"....projectID={projectID}\")\n", 91 | " print(f\"....displayPrintStatements={displayPrintStatements}\")\n", 92 | " print(f\"OUTPUT-\")\n", 93 | " print(f\"....bqSink={bigQueryOutputTableFQN}\")\n", 94 | " print(\" \")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "8822c32c-984d-42e8-acaa-1c64712c1119", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# 2. Spark Session creation\n", 105 | "print('....Initializing spark & spark configs')\n", 106 | "spark = SparkSession.builder.appName(appName).getOrCreate()\n", 107 | "\n", 108 | "# Spark configuration setting for writes to BigQuery\n", 109 | "spark.conf.set(\"parentProject\", projectID)\n", 110 | "spark.conf.set(\"temporaryGcsBucket\", scratchBucketUri)\n", 111 | "\n", 112 | "# Add Python modules\n", 113 | "sc.addPyFile(f\"gs://s8s_code_bucket-{projectNbr}/pyspark/common_utils.py\")\n", 114 | "import common_utils" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "a7259f0f-a5e6-4e5b-876d-fe08de2ba659", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# 3. Read data to be scored from GCS\n", 125 | "print('....Read batch scoring input and profile')\n", 126 | "scoreRawDF = spark.read.options(inferSchema = True, header= True).csv(scoreDatasetBucketFQN)\n", 127 | "if displayPrintStatements:\n", 128 | " print(scoreRawDF.count())" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "925bb7db-642b-4412-b4c7-7c2ab3ff7999", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# 4. Display data, display summary stats\n", 139 | "if displayPrintStatements:\n", 140 | " scoreRawDF.show(2)\n", 141 | " scoreRawDF.describe().show()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "c5f0d470-140b-498b-bd56-054aa7d9198c", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "# 5. 
Replace spaces, space with null values in the TotalCharges and MonthlyCharges columns\n", 152 | "print('....Data pre-process: fnReplaceSpaceWithNone in TotalCharges and MonthlyCharges')\n", 153 | "spaceReplacedDF = common_utils.fnReplaceSpaceWithNone(scoreRawDF)\n", 154 | "if displayPrintStatements:\n", 155 | " print(spaceReplacedDF.count())" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "51eae237-9f27-48aa-b22f-e360fee251f1", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# 6. Replace non-numeric values in the TotalCharges and MonthlyCharges columns\n", 166 | "print('....Data pre-process: ReplaceNotANumberWithNone in TotalCharges and MonthlyCharges')\n", 167 | "nanReplacedDF = common_utils.fnReplaceNotANumberWithNone(spaceReplacedDF)\n", 168 | "if displayPrintStatements:\n", 169 | " print(nanReplacedDF.count())" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "6f016816-d830-44ca-9926-c41d2409e956", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# 7. Drop rows with null in columns\n", 180 | "print('....Data pre-process: Drop rows with none')\n", 181 | "nullDroppedDF = nanReplacedDF.na.drop()\n", 182 | "\n", 183 | "if displayPrintStatements:\n", 184 | " print(nullDroppedDF.count())" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "b2a5eff9-7ab0-46a3-bd80-8794582c50ee", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "# 8. Replace 'No internet service' across columns to 'No'\n", 195 | "print('....Data pre-process: Replace -No internet service- across columns with -No-')\n", 196 | "partiallyProcessedDF = common_utils.fnReplaceWithNoForInternetService(nullDroppedDF)\n", 197 | "if displayPrintStatements:\n", 198 | " print(partiallyProcessedDF.count())" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "67583d3b-fd56-45c2-b7da-16158d11a3af", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "# 9. Add a bin/bucket category for tenure range using Spark SQL and write transformed to dataframe\n", 209 | "print('....Data pre-process: Replace -No internet service- across columns with -No-') \n", 210 | "scoreTargetDF = common_utils.fnAddBinForTenure(partiallyProcessedDF, True, spark)\n", 211 | "if displayPrintStatements:\n", 212 | " print(scoreTargetDF.count())\n", 213 | " scoreTargetDF.show(2) " 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "4893d9f1-33dc-434f-b3b6-49c512d3ea84", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# 10. 
Format dataframe names for column name format consistency\n", 224 | "scorableDF = scoreTargetDF.select(\"customerID\", \"gender\", \"SeniorCitizen\", \"Partner\", \"Dependents\", \"tenure\", \"Tenure_Group\", \"PhoneService\", \"MultipleLines\", \"InternetService\", \"OnlineSecurity\", \"OnlineBackup\", \"DeviceProtection\", \"TechSupport\", \"StreamingTV\", \"StreamingMovies\", \"Contract\", \"PaperlessBilling\", \"PaymentMethod\", \"MonthlyCharges\", \"TotalCharges\") \\\n", 225 | " .toDF(\"customer_id\", \"gender\", \"senior_citizen\", \"partner\", \"dependents\", \"tenure\", \"tenure_group\", \"phone_service\", \"multiple_lines\", \"internet_service\", \"online_security\", \"online_backup\", \"device_protection\", \"tech_support\", \"streaming_tv\", \"streaming_movies\", \"contract\", \"paperless_billing\", \"payment_method\", \"monthly_charges\", \"total_charges\") \n", 226 | "\n", 227 | "if displayPrintStatements:\n", 228 | " print(scorableDF.count())\n", 229 | " scorableDF.show(2)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "8e36f456-7bea-4b77-b7c2-534e6be5379c", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# 11. Persist to BigQuery\n", 240 | "print('....Augmenting with pipeline ID') \n", 241 | "scorableWithTraceDF = scorableDF \\\n", 242 | " .withColumn(\"pipeline_id\", lit(pipelineID).cast(\"string\")) \\\n", 243 | " .withColumn(\"pipeline_execution_dt\", lit(pipelineExecutionDt)) \n", 244 | "\n", 245 | "if displayPrintStatements:\n", 246 | " scorableWithTraceDF.show(2)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "90cfcb15-5dab-4b2e-ac1b-ab0f5ec1d8df", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "scorableWithTraceDF.printSchema()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "id": "e475788b-62ca-4f4d-bf6f-b1c1330f3cd6", 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "# 12. 
Persist to BigQuery\n", 267 | "print('....Persisting: Preprocessed data for batch scoring results to BigQuery')\n", 268 | "scorableWithTraceDF.select(\"customer_id\", \"gender\", \"senior_citizen\", \"partner\", \"dependents\", \"tenure\", \"tenure_group\", \"phone_service\", \"multiple_lines\", \"internet_service\", \"online_security\", \"online_backup\", \"device_protection\", \"tech_support\", \"streaming_tv\", \"streaming_movies\", \"contract\", \"paperless_billing\", \"payment_method\", \"monthly_charges\", \"total_charges\",\"pipeline_id\",\"pipeline_execution_dt\") \\\n", 269 | ".write.format('bigquery') \\\n", 270 | ".mode(\"append\")\\\n", 271 | ".option('table', bigQueryOutputTableFQN) \\\n", 272 | ".save()" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "language_info": { 278 | "codemirror_mode": { 279 | "name": "ipython", 280 | "version": 3 281 | }, 282 | "file_extension": ".py", 283 | "mimetype": "text/x-python", 284 | "name": "python", 285 | "nbconvert_exporter": "python", 286 | "pygments_lexer": "ipython3", 287 | "version": "3.9.13" 288 | }, 289 | "serverless_spark": "{\"name\":\"projects/gcp-scalable-ml-workshop/locations/us-central1/sessions/s8s-spark-session-7678-mleap-included\",\"uuid\":\"230b37a6-1a63-4eda-b830-0e0e0fe2a4e7\",\"createTime\":\"2022-10-04T21:45:01.554676Z\",\"jupyterSession\":{},\"spark\":{},\"runtimeInfo\":{\"endpoints\":{\"Spark History Server\":\"https://torvjlsgyjb73jwsyuujcs3pei-dot-us-central1.dataproc.googleusercontent.com/sparkhistory/\"}},\"state\":\"ACTIVE\",\"stateTime\":\"2022-10-04T21:48:58.470916Z\",\"creator\":\"admin@akhanolkar.altostrat.com\",\"runtimeConfig\":{\"containerImage\":\"gcr.io/gcp-scalable-ml-workshop/customer_churn_image:1.0.0\",\"properties\":{\"spark:spark.jars\":\"gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar\",\"spark:spark.jars.packages\":\"ml.combust.mleap:mleap-spark_2.12:0.20.0\",\"spark:spark.executor.instances\":\"2\",\"spark:spark.driver.cores\":\"4\",\"spark:spark.executor.cores\":\"4\",\"spark:spark.dynamicAllocation.executorAllocationRatio\":\"0.3\",\"spark:spark.eventLog.dir\":\"gs://s8s-sphs-569379262211/230b37a6-1a63-4eda-b830-0e0e0fe2a4e7/spark-job-history\"}},\"environmentConfig\":{\"executionConfig\":{\"subnetworkUri\":\"spark-snet\",\"idleTtl\":\"14400s\"},\"peripheralsConfig\":{\"sparkHistoryServerConfig\":{\"dataprocCluster\":\"projects/gcp-scalable-ml-workshop/regions/us-central1/clusters/s8s-sphs-569379262211\"}}},\"stateHistory\":[{\"state\":\"CREATING\",\"stateStartTime\":\"2022-10-04T21:45:01.554676Z\"}]}", 290 | "serverless_spark_kernel_name": "remote-ba3ff7097a41c0f7da1855a3-pyspark" 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 5 294 | } 295 | -------------------------------------------------------------------------------- /04-templates/mnbs-exec-post-startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #........................................................................ 4 | # Purpose: Copy existing notebooks to Workbench server Jupyter home dir 5 | # (Managed notebook server) 6 | #........................................................................ 
7 | 8 | gsutil cp gs://s8s_notebook_bucket-PROJECT_NBR/pyspark/*.ipynb /home/jupyter/ 9 | #sudo chown jupyter:jupyter /home/jupyter/* -------------------------------------------------------------------------------- /04-templates/preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "inputWidgets": {}, 8 | "nuid": "e8aeb0e7-0c23-471d-ba1a-d3d733a118e0", 9 | "showTitle": false, 10 | "title": "" 11 | } 12 | }, 13 | "source": [ 14 | "### Preprocessing\n", 15 | "This module performs data transformation in preparation for the customer churn model training.\n", 16 | "\n", 17 | "1. It reads raw data in CSV from GCS\n", 18 | "2. Performs some basic transformations and\n", 19 | "3. Persists to BigQuery" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "spark" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "application/vnd.databricks.v1+cell": { 36 | "inputWidgets": {}, 37 | "nuid": "8023dc7c-ec1c-41a7-a049-c8e2a44a5beb", 38 | "showTitle": false, 39 | "title": "" 40 | } 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "import sys\n", 45 | "import pyspark\n", 46 | "from pyspark.sql import SparkSession\n", 47 | "from pyspark.sql.functions import *\n", 48 | "from datetime import datetime\n", 49 | "import random" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# 1a. Arguments\n", 59 | "pipelineID = random.randint(1, 10000)\n", 60 | "projectNbr = \"YOUR_PROJECT_NBR\"\n", 61 | "projectID = \"YOUR_PROJECT_ID\"\n", 62 | "displayPrintStatements = True" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# 1b. Variables \n", 72 | "bqDatasetNm = f\"{projectID}.customer_churn_ds\"\n", 73 | "appBaseName = \"customer-churn-model\"\n", 74 | "appNameSuffix = \"preprocessing\"\n", 75 | "appName = f\"{appBaseName}-{appNameSuffix}\"\n", 76 | "scratchBucketUri = f\"s8s-spark-bucket-{projectNbr}/{appBaseName}/pipelineId-{pipelineID}/{appNameSuffix}\"\n", 77 | "sourceBucketUri = f\"gs://s8s_data_bucket-{projectNbr}/customer_churn_train_data.csv\"\n", 78 | "bigQueryTargetTableFQN = f\"{bqDatasetNm}.training_data\"\n", 79 | "pipelineExecutionDt = datetime.now().strftime(\"%Y%m%d%H%M%S\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# 1c. 
Display input and output\n", 89 | "if displayPrintStatements:\n", 90 | " print(\"Starting preprocessing for the *Customer Churn* experiment\")\n", 91 | " print(\".....................................................\")\n", 92 | " print(f\"The datetime now is - {pipelineExecutionDt}\")\n", 93 | " print(\" \")\n", 94 | " print(\"INPUT PARAMETERS-\")\n", 95 | " print(f\"....pipelineID={pipelineID}\")\n", 96 | " print(f\"....projectID={projectID}\")\n", 97 | " print(f\"....projectNbr={projectNbr}\")\n", 98 | " print(f\"....displayPrintStatements={displayPrintStatements}\")\n", 99 | " print(\" \")\n", 100 | " print(\"EXPECTED SETUP-\") \n", 101 | " print(f\"....BQ Dataset={bqDatasetNm}\")\n", 102 | " print(f\"....Source Data={sourceBucketUri}\")\n", 103 | " print(f\"....Scratch Bucket for BQ connector=gs://s8s-spark-bucket-{projectNbr}\") \n", 104 | " print(\"OUTPUT-\")\n", 105 | " print(f\"....BigQuery Table={bigQueryTargetTableFQN}\")\n", 106 | " print(f\"....Sample query-\")\n", 107 | " print(f\"....SELECT * FROM {bigQueryTargetTableFQN} WHERE pipeline_id='{pipelineID}' LIMIT 10\" )\n", 108 | " " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# 2. Spark Session creation\n", 118 | "print('....Initializing spark & spark configs')\n", 119 | "spark = SparkSession.builder.appName(appName).getOrCreate()\n", 120 | "\n", 121 | "# Spark configuration setting for writes to BigQuery\n", 122 | "spark.conf.set(\"parentProject\", projectID)\n", 123 | "spark.conf.set(\"temporaryGcsBucket\", scratchBucketUri)\n", 124 | "\n", 125 | "# Add Python modules\n", 126 | "sc.addPyFile(f\"gs://s8s_code_bucket-{projectNbr}/pyspark/common_utils.py\")\n", 127 | "import common_utils" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# 3. Read raw data in GCS into a Spark Dataframe\n", 137 | "print('....Read source data')\n", 138 | "rawChurnDF = spark.read.options(inferSchema = True, header= True).csv(sourceBucketUri)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# 4. View the data\n", 148 | "if displayPrintStatements:\n", 149 | " print(rawChurnDF.count())\n", 150 | " rawChurnDF.show(2)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# 5. Profile the data\n", 160 | "if displayPrintStatements:\n", 161 | " rawChurnDF.describe().show()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# 6. Check for spaces, nulls in monthly & total charges\n", 171 | "print('....Exploratory Data Analysis')\n", 172 | "if displayPrintStatements:\n", 173 | " rawChurnDF.createOrReplaceTempView(\"base_customer_churn\")\n", 174 | " spark.sql(\"select count(*) from base_customer_churn where MonthlyCharges is null or MonthlyCharges=' '\").show(5)\n", 175 | " spark.sql(\"select count(*) from base_customer_churn where TotalCharges is null or TotalCharges=' '\").show(5)\n" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# 7. 
Replace spaces, space with null values in the TotalCharges and MonthlyCharges columns\n", 185 | "print('....Replace space, nulls with None')\n", 186 | "spaceReplacedDF = common_utils.fnReplaceSpaceWithNone(rawChurnDF)\n", 187 | "if displayPrintStatements:\n", 188 | " print(spaceReplacedDF.count())" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# 8. Replace non-numeric values values in the TotalCharges and MonthlyCharges columns\n", 198 | "print('....Replace non-numeric values in numeric columns with null')\n", 199 | "nanReplacedDF = common_utils.fnReplaceNotANumberWithNone(spaceReplacedDF)\n", 200 | "if displayPrintStatements:\n", 201 | " print(nanReplacedDF.count())" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# 9. Drop rows with null in columns\n", 211 | "print('....Drop nulls')\n", 212 | "nullDroppedDF = nanReplacedDF.na.drop()\n", 213 | "if displayPrintStatements:\n", 214 | " print(nullDroppedDF.count())" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# 10. Replace 'No internet service' across columns to 'No'\n", 224 | "print('....Replace -No internet service across columns- to -No-')\n", 225 | "partiallyProcessedDF = common_utils.fnReplaceWithNoForInternetService(nullDroppedDF)\n", 226 | "if displayPrintStatements:\n", 227 | " print(partiallyProcessedDF.count())" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# 11. Add a bin/bucket category for tenure range using Spark SQL and write transformed to dataframe\n", 237 | "print('....Add a bin for tenure')\n", 238 | "modelTrainingReadyDF = common_utils.fnAddBinForTenure(partiallyProcessedDF, False, spark)\n", 239 | "if displayPrintStatements:\n", 240 | " print(modelTrainingReadyDF.count())" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# 12. Run summary statistics\n", 250 | "if displayPrintStatements:\n", 251 | " modelTrainingReadyDF.describe().show()" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "# 13. Print schema\n", 261 | "modelTrainingReadyDF.printSchema()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "# 14. 
Format column names for consistency (title case to DB style & lowercase)\n", 271 | "print('....Format column names for consistency')\n", 272 | "persistDF = modelTrainingReadyDF.select(\"customerID\", \"gender\", \"SeniorCitizen\", \"Partner\", \"Dependents\", \"tenure\", \"Tenure_Group\", \"PhoneService\", \"MultipleLines\", \"InternetService\", \"OnlineSecurity\", \"OnlineBackup\", \"DeviceProtection\", \"TechSupport\", \"StreamingTV\", \"StreamingMovies\", \"Contract\", \"PaperlessBilling\", \"PaymentMethod\", \"MonthlyCharges\", \"TotalCharges\",\"Churn\") \\\n", 273 | " .toDF(\"customer_id\", \"gender\", \"senior_citizen\", \"partner\", \"dependents\", \"tenure\", \"tenure_group\", \"phone_service\", \"multiple_lines\", \"internet_service\", \"online_security\", \"online_backup\", \"device_protection\", \"tech_support\", \"streaming_tv\", \"streaming_movies\", \"contract\", \"paperless_billing\", \"payment_method\", \"monthly_charges\", \"total_charges\",\"churn\") \\\n", 274 | " .withColumn(\"pipeline_id\", lit(pipelineID)) \\\n", 275 | " .withColumn(\"pipeline_execution_dt\", lit(pipelineExecutionDt)) \n", 276 | "\n", 277 | "persistDF.printSchema()\n" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "# 15. Persist training dataset to a table in BQ with the pipeline ID and execution date for traceability\n", 287 | "print('....Persist to BQ') \n", 288 | "persistDF.write.format('bigquery') \\\n", 289 | ".mode(\"append\")\\\n", 290 | ".option('table', bigQueryTargetTableFQN) \\\n", 291 | ".save()" 292 | ] 293 | } 294 | ], 295 | "metadata": { 296 | "application/vnd.databricks.v1+notebook": { 297 | "dashboards": [], 298 | "language": "python", 299 | "notebookMetadata": { 300 | "pythonIndentUnit": 4 301 | }, 302 | "notebookName": "01-data-engineering", 303 | "notebookOrigID": 1914343434663113, 304 | "widgets": {} 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.9.13" 317 | }, 318 | "serverless_spark": "{\"name\":\"projects/s8s-spark-ml-mlops/locations/us-central1/sessions/agni-6\",\"uuid\":\"35fda7e3-be7b-4913-99c5-83e97b677386\",\"createTime\":\"2022-08-04T02:37:17.836903Z\",\"jupyterSession\":{},\"spark\":{},\"runtimeInfo\":{},\"state\":\"ACTIVE\",\"stateTime\":\"2022-08-04T02:38:37.084371Z\",\"creator\":\"s8s-lab-sa@s8s-spark-ml-mlops.iam.gserviceaccount.com\",\"runtimeConfig\":{\"containerImage\":\"gcr.io/s8s-spark-ml-mlops/dataproc_serverless_custom_runtime:1.0.3\",\"properties\":{\"spark:spark.executor.instances\":\"2\",\"spark:spark.driver.cores\":\"4\",\"spark:spark.executor.cores\":\"4\",\"spark:spark.eventLog.dir\":\"gs://s8s-sphs-974925525028/35fda7e3-be7b-4913-99c5-83e97b677386/spark-job-history\"}},\"environmentConfig\":{\"executionConfig\":{\"serviceAccount\":\"s8s-lab-sa@s8s-spark-ml-mlops.iam.gserviceaccount.com\",\"subnetworkUri\":\"https://www.googleapis.com/compute/v1/projects/s8s-spark-ml-mlops/regions/us-central1/subnetworks/spark-snet\"},\"peripheralsConfig\":{\"sparkHistoryServerConfig\":{\"dataprocCluster\":\"projects/s8s-spark-ml-mlops/regions/us-central1/clusters/s8s-sphs-974925525028\"}}},\"stateHistory\":[{\"state\":\"CREATING\",\"stateStartTime\":\"2022-08-04T02:37:17.836903Z\"}]}", 319 | 
"serverless_spark_kernel_name": "remote-bc514a4a91cec988ad3c15a7-pyspark" 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 4 323 | } 324 | -------------------------------------------------------------------------------- /04-templates/umnbs-exec-post-startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #........................................................................ 4 | # Purpose: Copy existing notebooks to Workbench server Jupyter home dir 5 | # (User-managed notebook server) 6 | #........................................................................ 7 | 8 | gsutil cp gs://s8s_notebook_bucket-PROJECT_NBR/vai-pipelines/*.ipynb /home/jupyter/ 9 | chown jupyter:jupyter /home/jupyter/* -------------------------------------------------------------------------------- /05-lab-guide/Module-01-Environment-Provisioning.md: -------------------------------------------------------------------------------- 1 | # About Module 1 2 | 3 | This module covers environment provisioning for the workshop. This module takes ~50-60 minutes to complete. 4 |

5 | ## Note: 6 | 1. **Ensure services in use in the workshop are available in the location of your preference** and modify the variables in step 2.4.1 to reflect the same. 7 | 2. Get any preview services **allow-listed** 8 | 3. Some of the organization policies may not apply for your company, modify appropriately 9 | 4. The lab is intended for each attendee to have a full environment to themselves with **shared nothing** 10 | 5. Terraform state is deliberately local for simplicity 11 | 6. Be sure to check out section 4 for glitches/nuances and workarounds. 12 | 13 | ## 1. Details about the environment that is set up by this module 14 | 15 | ![PICT1](../06-images/module-1-pictorial-01.png) 16 |

17 | 18 | ![PICT2](../06-images/module-1-pictorial-02.png) 19 |

20 | 21 | ![PICT3](../06-images/module-1-pictorial-03.png) 22 |

23 | 24 | ## Pictorial walkthrough of services provisioned & customization 25 | The author's environment is showcased [here](../05-lab-guide/Services-Created.md) 26 | 27 |
28 | 29 | ## 2. Create the environment 30 | 31 | ### 2.1. Create a directory in Cloud Shell for the workshop 32 | ``` 33 | cd ~ 34 | mkdir gcp-spark-mllib-workshop 35 | ``` 36 | 37 | ### 2.2. Clone the workshop git repo 38 | ``` 39 | cd ~/gcp-spark-mllib-workshop 40 | git clone https://github.com/anagha-google/s8s-spark-mlops-lab.git 41 | ``` 42 | 43 | ### 2.3. Navigate to the Terraform provisioning directory 44 | ``` 45 | cd ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup 46 | ``` 47 | 48 | ### 2.4. Provision the environment 49 | 50 | #### 2.4.1. Define variables for use 51 | Modify the below as appropriate for your deployment, e.g. region, zone, etc. Be sure to use the right case for GCP region & zone.
52 | The regions and zones listing can be found [here](https://cloud.google.com/compute/docs/regions-zones) (a zone is its region with an -a/-b/-c suffix).
53 | For the Cloud Scheduler time zone, check the Cloud Scheduler UI for the options available to you.
54 | ``` 55 | PROJECT_ID=`gcloud config list --format "value(core.project)" 2>/dev/null` 56 | PROJECT_NBR=`gcloud projects describe $PROJECT_ID | grep projectNumber | cut -d':' -f2 | tr -d "'" | xargs` 57 | PROJECT_NAME=`gcloud projects describe ${PROJECT_ID} | grep name | cut -d':' -f2 | xargs` 58 | GCP_ACCOUNT_NAME=`gcloud auth list --filter=status:ACTIVE --format="value(account)"` 59 | ORG_ID=`gcloud organizations list --format="value(name)"` 60 | VPC_NM=s8s-vpc-$PROJECT_NBR 61 | SPARK_SERVERLESS_SUBNET=spark-snet 62 | PERSISTENT_HISTORY_SERVER_NM=s8s-sphs-${PROJECT_NBR} 63 | UMSA_FQN=s8s-lab-sa@$PROJECT_ID.iam.gserviceaccount.com 64 | DATA_BUCKET=s8s_data_bucket-${PROJECT_NBR} 65 | CODE_BUCKET=s8s_code_bucket-${PROJECT_NBR} 66 | MODEL_BUCKET=s8s_model_bucket-${PROJECT_NBR} 67 | BQ_CONNECTOR_JAR_GCS_URI="gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar" 68 | CLOUD_COMPOSER_IMG_VERSION="composer-2.1.11-airflow-2.4.3" 69 | SPARK_CUSTOM_CONTAINER_IMAGE_TAG="1.0.0" 70 | YOUR_GCP_REGION="us-central1" 71 | YOUR_GCP_ZONE="us-central1-a" 72 | YOUR_GCP_MULTI_REGION="US" 73 | CLOUD_SCHEDULER_TIME_ZONE="America/Chicago" 74 | DATAPROC_RUNTIME_VERSION="1.1" 75 | 76 | echo "PROJECT_ID=$PROJECT_ID" 77 | echo "PROJECT_NBR=$PROJECT_NBR" 78 | echo "PROJECT_NAME=$PROJECT_NAME" 79 | echo "VPC_NM=$VPC_NM" 80 | echo "PERSISTENT_HISTORY_SERVER_NM=$PERSISTENT_HISTORY_SERVER_NM" 81 | echo "UMSA_FQN=$UMSA_FQN" 82 | echo "DATA_BUCKET=$DATA_BUCKET" 83 | echo "CODE_BUCKET=$CODE_BUCKET" 84 | echo "DATAPROC_RUNTIME_VERSION=$DATAPROC_RUNTIME_VERSION" 85 | ``` 86 | 87 | ### 2.4.2. Initialize Terraform 88 | Needs to run in cloud shell from ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup 89 | ``` 90 | cd ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup 91 | 92 | terraform init 93 | ``` 94 | 95 | #### 2.4.3. Review the Terraform deployment plan 96 | Needs to run in cloud shell from ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup 97 | ``` 98 | cd ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup 99 | 100 | terraform plan \ 101 | -var="project_id=${PROJECT_ID}" \ 102 | -var="project_name=${PROJECT_NAME}" \ 103 | -var="project_number=${PROJECT_NBR}" \ 104 | -var="gcp_account_name=${GCP_ACCOUNT_NAME}" \ 105 | -var="org_id=${ORG_ID}" \ 106 | -var="cloud_composer_image_version=${CLOUD_COMPOSER_IMG_VERSION}" \ 107 | -var="spark_container_image_tag=${SPARK_CUSTOM_CONTAINER_IMAGE_TAG}" \ 108 | -var="gcp_region=${YOUR_GCP_REGION}" \ 109 | -var="gcp_zone=${YOUR_GCP_ZONE}" \ 110 | -var="gcp_multi_region=${YOUR_GCP_MULTI_REGION}" \ 111 | -var="bq_connector_jar_gcs_uri=${BQ_CONNECTOR_JAR_GCS_URI}" \ 112 | -var="cloud_scheduler_time_zone=${CLOUD_SCHEDULER_TIME_ZONE}" \ 113 | -var="dataproc_runtime_version=${DATAPROC_RUNTIME_VERSION}" 114 | 115 | ``` 116 | 117 | #### 2.4.4. 
Provision the environment 118 | Needs to run in cloud shell from ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup 119 | ``` 120 | cd ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup 121 | 122 | terraform apply \ 123 | -var="project_id=${PROJECT_ID}" \ 124 | -var="project_name=${PROJECT_NAME}" \ 125 | -var="project_number=${PROJECT_NBR}" \ 126 | -var="gcp_account_name=${GCP_ACCOUNT_NAME}" \ 127 | -var="org_id=${ORG_ID}" \ 128 | -var="cloud_composer_image_version=${CLOUD_COMPOSER_IMG_VERSION}" \ 129 | -var="spark_container_image_tag=${SPARK_CUSTOM_CONTAINER_IMAGE_TAG}" \ 130 | -var="gcp_region=${YOUR_GCP_REGION}" \ 131 | -var="gcp_zone=${YOUR_GCP_ZONE}" \ 132 | -var="gcp_multi_region=${YOUR_GCP_MULTI_REGION}" \ 133 | -var="bq_connector_jar_gcs_uri=${BQ_CONNECTOR_JAR_GCS_URI}" \ 134 | -var="cloud_scheduler_time_zone=${CLOUD_SCHEDULER_TIME_ZONE}" \ 135 | -var="dataproc_runtime_version=${DATAPROC_RUNTIME_VERSION}" \ 136 | --auto-approve >> provisioning.output 137 | ``` 138 | 139 |
140 | 141 | ## 3. Validate your Terraform deployment against a pictorial overview of services provisioned & customization 142 | Available [here](../05-lab-guide/Services-Created.md) 143 | 144 | 145 |
146 | 147 | ## 4. Glitches/nuances to be mindful of 148 | **4.1. Cloud Composer 2**
149 | If you edit the Terraform and run apply, Cloud Composer2 attempts to update the network and fails the deployment.
150 | Workaround: Delete Cloud Composer manually and then rerun. 10+ minutes to delete, 20 minutes to recreate -> enough for a power nap. :) 151 | 152 | **4.2. Managed Notebook Instance on Vertex AI Workbench**
153 | Changing ownership of notebooks (uploaded to /home/jupyter via Terraform) from root to jupyter:jupyter currently does not work as part of the Terraform deployment.
154 | Workaround: Clone the notebook and save. 155 | 156 | **4.3. Persistent Spark History Server (PHS)**
157 | If you edit the Terraform and run apply, PHS gets destroyed and recreated.
158 | Workaround: Not applicable. It just takes 90 seconds or less to destroy and 90 seconds to recreate. 159 | 160 |
161 | 162 | ## 5. Terraform How-Tos [DO NOT RUN THIS, IT'S JUST FYI] 163 | 164 | ### 5.1. For selective replacement of specific services/units of deployment [DO NOT RUN THIS, IT'S JUST FYI] 165 | 166 | This is not needed, and is informational only.
167 | Needs to run in cloud shell from ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup
168 | If -target does not work, try -replace 169 | ``` 170 | #terraform apply -target=google_storage_bucket_object.upload_cc2_dag_to_airflow_dag_bucket \ 171 | -var="project_id=${PROJECT_ID}" \ 172 | -var="project_name=${PROJECT_NAME}" \ 173 | -var="project_number=${PROJECT_NBR}" \ 174 | -var="gcp_account_name=${GCP_ACCOUNT_NAME}" \ 175 | -var="org_id=${ORG_ID}" \ 176 | -var="cloud_composer_image_version=${CLOUD_COMPOSER_IMG_VERSION}" \ 177 | -var="spark_container_image_tag=${SPARK_CUSTOM_CONTAINER_IMAGE_TAG}" \ 178 | -var="gcp_region=${YOUR_GCP_REGION}" \ 179 | -var="gcp_zone=${YOUR_GCP_ZONE}" \ 180 | -var="gcp_multi_region=${YOUR_GCP_MULTI_REGION}" \ 181 | -var="bq_connector_jar_gcs_uri=${BQ_CONNECTOR_JAR_GCS_URI}" \ 182 | -var="cloud_scheduler_time_zone=${CLOUD_SCHEDULER_TIME_ZONE}" \ 183 | -var="dataproc_runtime_version=${DATAPROC_RUNTIME_VERSION}" \ 184 | --auto-approve 185 | ``` 186 | 187 | ### 5.2. To destroy the deployment [DO NOT RUN THIS, IT'S JUST FYI] 188 | 189 | You can (a) shut down the project altogether in GCP Cloud Console or (b) use Terraform to destroy. Use (b) at your own risk as it's a little glitchy, while (a) is guaranteed to stop the billing meter pronto. 190 |
191 | Needs to run in cloud shell from ~/gcp-spark-mllib-workshop/s8s-spark-mlops-lab/00-env-setup 192 | ``` 193 | #terraform destroy \ 194 | -var="project_id=${PROJECT_ID}" \ 195 | -var="project_name=${PROJECT_NAME}" \ 196 | -var="project_number=${PROJECT_NBR}" \ 197 | -var="gcp_account_name=${GCP_ACCOUNT_NAME}" \ 198 | -var="org_id=${ORG_ID}" \ 199 | -var="cloud_composer_image_version=${CLOUD_COMPOSER_IMG_VERSION}" \ 200 | -var="spark_container_image_tag=${SPARK_CUSTOM_CONTAINER_IMAGE_TAG}" \ 201 | -var="gcp_region=${YOUR_GCP_REGION}" \ 202 | -var="gcp_zone=${YOUR_GCP_ZONE}" \ 203 | -var="gcp_multi_region=${YOUR_GCP_MULTI_REGION}" \ 204 | -var="bq_connector_jar_gcs_uri=${BQ_CONNECTOR_JAR_GCS_URI}" \ 205 | -var="cloud_scheduler_time_zone=${CLOUD_SCHEDULER_TIME_ZONE}" \ 206 | -var="dataproc_runtime_version=${DATAPROC_RUNTIME_VERSION}" \ 207 | --auto-approve 208 | ``` 209 | 210 |
211 | 212 | ## 6. What's in the next module 213 | In the [next module](../05-lab-guide/Module-02-Spark-IDE-on-GCP.md), we will learn how to use Serverless Spark interactive notebooks for machine learning model development with Spark MLLib on Dataproc Serverless Spark. 214 | 215 |
216 | -------------------------------------------------------------------------------- /05-lab-guide/Module-02-Spark-IDE-on-GCP.md: 1 | # About Module 2 2 | 3 | This module covers how to use Vertex AI Workbench's "Managed Notebook Instance" for authoring Spark code interactively with Dataproc Serverless Spark interactive sessions. Understanding the creation of serverless Spark interactive sessions and notebook nuances is crucial for the next module, where you will run actual machine learning experiments. 4 | 5 |
6 | 7 | ## 1. About Dataproc Serverless Spark Interactive 8 | Dataproc Serverless Spark Interactive is serverless, Dataproc-managed, autoscaling, private infrastructure for interactive Spark code authoring via a Jupyter notebook hosted on a Vertex AI Managed Notebook instance. The following is an overview of what to expect; further in this lab guide there are detailed instructions with a pictorial overview. 9 | 10 | ### 1a. Getting started - what's involved 11 | 12 | ![ABOUT](../06-images/module-2-summary-01.png) 13 |

14 | 15 | 16 | ### 1b. Creating and using a Serverless Spark Interactive session in a notebook - what's involved 17 | 18 | ![ABOUT](../06-images/module-2-summary-02.png) 19 |

20 | 21 | 22 | ### 1c. Switching notebooks and reusing the Serverless Spark Interactive session 23 | 24 | ![ABOUT](../06-images/module-2-summary-03.png) 25 |

26 | 27 |
28 | 29 | ## 2. The exercise 30 | We will analyze Chicago Crimes in BigQuery from a Jupyter Notebook on Vertex AI Workbench - Managed Notebook Instance, using Dataproc Serverless Spark interactive sessions. 31 | 32 | ![EXERCISE](../06-images/module-2-summary-04.png) 33 |

34 | 35 | **Goals:** 36 | 1. Understand how to create and attach a Dataproc Serverless Spark interactive session to your Jupyter notebook 37 | 2. Learn how to switch the Dataproc Serverless Spark interactive session created, between Jupyter notebooks 38 | 3. Learn to navigate Dataproc UI for the Serverless Spark interactive session 39 | 4. Browse the Spark UI of the persistent Spark History Server, for the Serverless Spark interactive session 40 | 5. Learn how to analyze data in BigQuery using the BigQuery Spark connector. 41 | 42 | **Pre-requisite:** 43 | 1. Ensure that any preview features are allow-listed by product engineering, ahead of time 44 | 2. Provisioning from module 1 needs to be successfully completed 45 | 46 | **Note:** 47 |
If the notebook is not editable, make a copy and use the same. 48 | 49 |
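For a preview of goal 5, the essence of reading BigQuery data with the Spark BigQuery connector from a notebook cell is sketched below. This is an illustrative sketch, not the shipped notebook's code; it assumes the interactive session's `spark` object is live and references the public `bigquery-public-data.chicago_crime.crime` table.
```
# Illustrative sketch (not the shipped notebook): read the Chicago Crimes public
# dataset via the Spark BigQuery connector, from a cell attached to the
# Serverless Spark interactive session (the spark object already exists there).
crimesDF = spark.read.format("bigquery") \
    .option("table", "bigquery-public-data.chicago_crime.crime") \
    .load()

crimesDF.createOrReplaceTempView("crimes")

# Simple profile: crime counts by year
spark.sql("SELECT year, COUNT(*) AS crime_count FROM crimes GROUP BY year ORDER BY year").show()
```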
50 | 51 | ## 3. Variables you will need for this module 52 | 53 | Run the below in Cloud Shell scoped to your project. The values in these variables are needed to create the interactive Spark session - you will need to paste these into the User Interface. 54 | 55 | ``` 56 | PROJECT_ID=`gcloud config list --format "value(core.project)" 2>/dev/null` 57 | PROJECT_NBR=`gcloud projects describe $PROJECT_ID | grep projectNumber | cut -d':' -f2 | tr -d "'" | xargs` 58 | UMSA_FQN=s8s-lab-sa@$PROJECT_ID.iam.gserviceaccount.com 59 | SPARK_CUSTOM_CONTAINER_IMAGE_URI="gcr.io/$PROJECT_ID/customer_churn_image:1.0.0" 60 | DATAPROC_RUNTIME_VERSION="1.1" 61 | 62 | echo "PROJECT_ID=$PROJECT_ID" 63 | echo "PROJECT_NBR=$PROJECT_NBR" 64 | echo "UMSA_FQN=$UMSA_FQN" 65 | echo "SPARK_CUSTOM_CONTAINER_IMAGE_URI=$SPARK_CUSTOM_CONTAINER_IMAGE_URI" 66 | 67 | 68 | echo " " 69 | echo " " 70 | ``` 71 | 72 | Author's details: 73 | ``` 74 | PROJECT_ID=gcp-scalable-ml-workshop 75 | PROJECT_NBR=xxx 76 | UMSA_FQN=s8s-lab-sa@gcp-scalable-ml-workshop.iam.gserviceaccount.com 77 | SPARK_CUSTOM_CONTAINER_IMAGE_URI=gcr.io/gcp-scalable-ml-workshop/customer_churn_image:1.0.0 78 | ``` 79 | 80 |
81 | 82 | ## 4. Navigate on the Cloud Console to the Vertex AI Workbench, Managed Notebook Instance 83 | Open JupyterLab as shown below 84 | 85 | ![UMNBS](../06-images/module-1-vai-wb-01.png) 86 |

87 | 88 | **Be sure to select the right region in the dropdown.** 89 | 90 | ![UMNBS](../06-images/module-1-vai-wb-mnb-01.png) 91 |

92 | 93 |
94 | 95 | 96 | ## 5. Open the Chicago Crimes notebook 97 | 98 | ![UMNBS](../06-images/module-2-01.png) 99 |

100 | 101 |
102 | 103 | ## 6. Click on "Launcher" to create an interactive Spark session 104 | 105 | ![UMNBS](../06-images/module-2-02.png) 106 |

107 | 108 |
109 | 110 | ## 7. Key in, or select from the dropdowns, the details required 111 | Note that the variables run in Cloud Shell have all the values you need to create the session. Copy-paste where needed. 112 | 113 | ![UMNBS](../06-images/module-2-03.png) 114 |

115 | 116 | 117 | ![UMNBS](../06-images/module-2-04.png) 118 |

119 | 120 | 121 | ![UMNBS](../06-images/module-2-05.png) 122 |

123 | 124 | 125 | ![UMNBS](../06-images/module-2-06.png) 126 |

127 | 128 | 129 | 130 | ![UMNBS](../06-images/module-2-08.png) 131 |

132 | 133 | Click on "submit". In less than 2 minutes, you should see a session created. 134 | 135 | 136 | ![UMNBS](../06-images/module-2-09.png) 137 |

138 | 139 |
140 | 141 | ## 8. Ensure you have the session you created, selected in the kernel picker dropdown 142 | 143 | ### 8.1. The kernel picker - where to find it 144 | 145 | ![UMNBS](../06-images/module-2-15.png) 146 |

147 | 148 | ![UMNBS](../06-images/module-2-16.png) 149 |

150 | 151 | ![UMNBS](../06-images/module-2-17.png) 152 |

153 | 154 | ![UMNBS](../06-images/module-2-18.png) 155 |

156 | 157 | ![UMNBS](../06-images/module-2-19.png) 158 |

159 | 160 | 161 | ### 8.2. Choosing the interactive spark kernel 162 | 163 | ![UMNBS](../06-images/module-2-10.png) 164 |

165 | 166 | 167 | 168 |
169 | 170 | ## 9. Place your cursor in the first cell, then, following the instructions below, run all cells or run each cell sequentially 171 | 172 | 173 | ![UMNBS](../06-images/module-2-11.png) 174 |

175 | 176 | ![UMNBS](../06-images/module-2-12.png) 177 |

178 | 179 |
180 | 181 | ## 10. Close the notebook once the exercise is completed 182 | Save or discard changes as needed. Be sure, though, to "keep session" when prompted as you close the notebook. 183 | 184 | ![UMNBS](../06-images/module-2-13.png) 185 |

186 | 187 | ![UMNBS](../06-images/module-2-14.png) 188 |

189 | 190 | 191 |
192 | 193 | This concludes the module. In the [next module](../05-lab-guide/Module-03-Author-ML-Experiments-With-Spark-Notebooks.md), you will run a complete model training exercise with notebooks - pre-processing, model training, hyperparameter tuning, batch scoring. 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /05-lab-guide/Module-03-Author-ML-Experiments-With-Spark-Notebooks.md: 1 | # About Module 3 2 | The recommended GCP solution for interactive authoring of scalable Spark-based ML code is Serverless Spark notebooks on Vertex AI Workbench, Managed Notebooks. In this lab module, we will go through typical data science/ML engineering work - preprocess data, train & test a model, tune the model, and do some scoring. Since this lab is focused on demystifying the integration, the notebooks are pre-created for you, so you can quickly understand the integration. 3 | 4 |
5 | 6 | ## 1. Use case recap 7 | Telco Customer Churn Prediction with a [Kaggle dataset](https://www.kaggle.com/datasets/blastchar/telco-customer-churn) and a [Spark MLlib Random Forest Classifier](https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier)
8 | 9 |
10 | 11 | ## 2. The environment & exercises in the module 12 | The environment for the module is a Vertex AI Workbench Managed notebook instance, with a custom container image for serverless Spark - pre-created as part of the Terraform deployment. We will reuse the kernel created in the prior module. The following are the four exercises in the module. 13 | 14 | ![M3](../06-images/module-3-03.png) 15 |

16 | 17 |
18 | 19 | ## 3. Where we are in the model development lifecycle 20 | 21 | ![M3](../06-images/module-3-38.png) 22 |

23 | 24 | ## 4. The data used in the experiment 25 | Training and scoring data are available in GCS in the data bucket and the data is in CSV format. 26 | 27 | ![M3](../06-images/module-3-01.png) 28 |

29 | 30 | ![M3](../06-images/module-3-02.png) 31 |

32 | 33 |
34 | 35 | ## 5. Step 1: Preprocessing 36 | 37 | ### 5.1. The exercise 38 | We will read the raw customer churn source data for model training from GCS, cleanse/transform it, and persist it to BigQuery for use in the model training step. A condensed code sketch follows the diagram below. 39 | 40 | ![M3](../06-images/module-3-04.png) 41 |
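Condensed from the preprocessing notebook shipped in 03-notebooks/pyspark (column renames and print statements omitted for brevity; this assumes the notebook's variables and the common_utils helpers are in scope, and that the BigQuery scratch bucket configs are set as in the notebook), the step boils down to:
```
# Condensed from the lab's preprocessing notebook: read raw churn CSV from GCS,
# cleanse via the common_utils helpers, and persist to BigQuery with lineage columns.
from pyspark.sql.functions import lit

rawChurnDF = spark.read.options(inferSchema=True, header=True) \
    .csv(f"gs://s8s_data_bucket-{projectNbr}/customer_churn_train_data.csv")

# Same cleansing order as the notebook: spaces -> NaNs -> drop nulls ->
# 'No internet service' -> tenure bin
cleansedDF = common_utils.fnAddBinForTenure(
    common_utils.fnReplaceWithNoForInternetService(
        common_utils.fnReplaceNotANumberWithNone(
            common_utils.fnReplaceSpaceWithNone(rawChurnDF)
        ).na.drop()
    ),
    False, spark)

cleansedDF.withColumn("pipeline_id", lit(pipelineID).cast("string")) \
    .withColumn("pipeline_execution_dt", lit(pipelineExecutionDt)) \
    .write.format("bigquery").mode("append") \
    .option("table", f"{projectID}.customer_churn_ds.training_data") \
    .save()
```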

42 | 43 | ### 5.2. Switch the Spark interactive kernel to the pre-processing notebook 44 | Let's ensure we have the Serverless Spark kernel created in the prior module attached to the pre-processing notebook. Follow the screenshots below- 45 | 46 | ![M3](../06-images/module-3-05.png) 47 |

48 | 49 | ![M3](../06-images/module-3-06.png) 50 |

51 | 52 | ![M3](../06-images/module-3-07.png) 53 |

54 | 55 | ### 5.3. Review the code in the pre-processing notebook and run the notebook 56 | 57 | ![M3](../06-images/module-3-10.png) 58 |

59 | 60 | ### 5.4. Review the pre-processed data in BigQuery 61 | 62 | Navigate to BigQuery and run the following query- 63 | ``` 64 | SELECT * FROM `customer_churn_ds.training_data` LIMIT 1000 65 | ``` 66 | The following are the author's results- 67 | 68 | ![M3](../06-images/module-3-10.png) 69 |
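If you prefer to sanity-check row counts from a notebook cell rather than the BigQuery console, a minimal sketch with the BigQuery Python client follows (this assumes the client library is installed, and YOUR_PROJECT_ID is a placeholder for your project ID):
```
# Optional sanity check from Python instead of the BigQuery console.
from google.cloud import bigquery

client = bigquery.Client(project="YOUR_PROJECT_ID")  # placeholder - your project ID
sql = "SELECT COUNT(*) AS row_count FROM `customer_churn_ds.training_data`"
for row in client.query(sql).result():
    print(f"training_data rows: {row.row_count}")
```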

70 | 71 | 72 | ### 5.5. Visit the Dataproc UI for the session 73 | 74 | ![M3](../06-images/module-3-08.png) 75 |

76 | 77 | ### 5.6. Visit the Spark History Server UI for the session 78 | 79 | ![M3](../06-images/module-3-09.png) 80 |

81 | 82 | ### 5.7. Review the notebook equivalent PySpark script in GCS for this step 83 | For each notebook, there is complementary code in a PySpark script that will be used for operationalizing the model training Vertex AI pipeline. 84 | 85 | ![M3](../06-images/module-3-11.png) 86 |

87 | 88 | ![M3](../06-images/module-3-12.png) 89 |

90 |
91 | 92 | ## 6. Step 2: Model Training 93 | 94 | Now that we have preprocessed data, let's train a model. 95 | 96 | ### 6.1. The exercise 97 | 98 | ![M3](../06-images/module-3-15.png) 99 |
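For orientation before opening the notebook, the core of a Spark MLlib Random Forest training step looks like the sketch below. This is illustrative, not the lab's exact code: the feature subset, the numTrees value, and the trainDF/testDF dataframes are assumptions.
```
# Illustrative sketch of the training step's core: index the label, assemble
# features, and fit a Spark MLlib RandomForestClassifier.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

labelIndexer = StringIndexer(inputCol="churn", outputCol="label")
assembler = VectorAssembler(
    inputCols=["tenure", "monthly_charges", "total_charges"],  # illustrative subset
    outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=25)

pipeline = Pipeline(stages=[labelIndexer, assembler, rf])
model = pipeline.fit(trainDF)            # trainDF: assumed train split
predictionsDF = model.transform(testDF)  # testDF: assumed test split
```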

100 | 101 | ### 6.2. Run the model training notebook 102 | 1. Close the preprocessing notebook 103 | 2. Shut down its kernel, but leave the Spark interactive session active 104 | 3. Open the model training notebook 105 | 4. Review the code, then run all cells as shown in section 5.3 above 106 | 107 | ![M3](../06-images/module-3-13.png) 108 |

109 | 110 | ![M3](../06-images/module-3-14.png) 111 |

112 | 113 | ### 6.3. Review the model persisted in GCS 114 | 115 | ![M3](../06-images/module-3-22.png) 116 |

117 | 118 | ### 6.4. Review the model metrics persisted in GCS 119 | 120 | ![M3](../06-images/module-3-23.png) 121 |

122 | 123 | ![M3](../06-images/module-3-24.png) 124 |

125 | 126 | This JSON is persisted so that it can be visualized in the Vertex AI pipeline UI. For queryability, we also persist it to BigQuery. 127 | 128 | ![M3](../06-images/module-3-25.png) 129 |

130 | 131 | ### 6.5. The tables created in BigQuery for the experiment 132 | The following tables are created and written to in append mode. 133 | 134 | ![M3](../06-images/module-3-16.png) 135 |

136 | 137 | **Notice the columns for the tables. There is a pipeline_id column and a pipeline_execution_dt for traceability/lineage tracking.** 138 | 139 | ### 6.6. Review the model feature importance scores persisted in BigQuery 140 | Run the below query in BigQuery. Be sure to add pipeline_id to the where clause if you are running the experiments multiple times. 141 | ``` 142 | SELECT * FROM `customer_churn_ds.model_feature_importance_scores` 143 | WHERE operation='training' 144 | ``` 145 | The following is the author's output- 146 | 147 | ![M3](../06-images/module-3-21.png) 148 |

149 | 150 | ### 6.7. Review the model metrics persisted in BigQuery 151 | Run the below query in BigQuery. Be sure to add pipeline_id to the where clause if you are running the experiments multiple times. 152 | ``` 153 | SELECT * FROM `customer_churn_ds.model_metrics` 154 | WHERE operation='training' 155 | ``` 156 | The following is the author's output- 157 | 158 | ![M3](../06-images/module-3-20.png) 159 |

160 | 161 | ### 6.8. Review the model test results in BigQuery 162 | Run the below queries in BigQuery. Be sure to add pipeline_id to the where clause if you are running the experiments multiple times. 163 | 164 | Just the predictions- 165 | ``` 166 | SELECT churn, prediction, * 167 | FROM `customer_churn_ds.test_predictions` 168 | WHERE operation='training' 169 | ``` 170 | Confusion matrix- 171 | ``` 172 | SELECT churn, prediction, count(*) as count 173 | FROM `customer_churn_ds.test_predictions` 174 | WHERE operation='training' 175 | GROUP BY churn, prediction ORDER BY churn 176 | ``` 177 | 178 | The following is the author's output- 179 | 180 | ![M3](../06-images/module-3-17.png) 181 |

182 | 183 | The confusion matrix- 184 | 185 | ![M3](../06-images/module-3-18.png) 186 |

187 | 188 | ### 6.9. Review the notebook equivalent PySpark script in GCS for this step 189 | 190 | ![M3](../06-images/module-3-26.png) 191 |

192 | 193 | ![M3](../06-images/module-3-27.png) 194 |

195 | 196 |
197 | 198 | ## 7. Step 3: Hyperparameter Tuning 199 | 200 | ### 7.1. The exercise 201 | 202 | This sub-module demonstrates hyperparameter tuning with Spark MLlib in an effort to improve model performance. 203 | 204 | ![M3](../06-images/module-3-28.png) 205 |
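Conceptually, hyperparameter tuning with Spark MLlib wraps the training pipeline in a CrossValidator over a parameter grid, as sketched below. This is illustrative rather than the lab's exact code: it reuses the rf and pipeline names from the training sketch in section 6, and the grid values and GCS path are assumptions.
```
# Illustrative sketch: tune the Random Forest with CrossValidator + ParamGridBuilder.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [5, 10])   # illustrative grid values
             .addGrid(rf.numTrees, [25, 50])
             .build())

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(labelCol="label"),
                    numFolds=3)

cvModel = cv.fit(trainDF)
# Persisting the CrossValidatorModel writes the bestModel directory referenced
# in section 7.4 (the path below is a placeholder).
cvModel.write().overwrite().save(f"gs://s8s_model_bucket-{projectNbr}/customer-churn-model/tuned")
```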

206 | 207 | ### 7.2. Create a new Serverless Spark Interactive Session through the CLI 208 | We need this to be able to pass multiple packages (BigQuery connector + MLeap). 209 | This is just to demonstrate session creation via the CLI. 210 | 211 | ``` 212 | PROJECT_ID=`gcloud config list --format "value(core.project)" 2>/dev/null` 213 | PROJECT_NBR=`gcloud projects describe $PROJECT_ID | grep projectNumber | cut -d':' -f2 | tr -d "'" | xargs` 214 | SESSION_NAME="s8s-spark-session-$RANDOM-mleap-included" 215 | REGION="us-central1" # REPLACE WITH YOUR REGION 216 | HISTORY_SERVER_NAME="s8s-sphs-${PROJECT_NBR}" 217 | SUBNET="spark-snet" 218 | NOTEBOOK_BUCKET="gs://s8s_notebook_bucket-${PROJECT_NBR}" 219 | CONTAINER_IMAGE_URI="gcr.io/$PROJECT_ID/customer_churn_image:1.0.0" 220 | DATAPROC_RUNTIME_VERSION="1.1" 221 | 222 | gcloud beta dataproc sessions create spark $SESSION_NAME \ 223 | --project=${PROJECT_ID} \ 224 | --location=${REGION} \ 225 | --property=spark.jars.packages="ml.combust.mleap:mleap-spark_2.12:0.20.0" \ 226 | --history-server-cluster=projects/$PROJECT_ID/regions/$REGION/clusters/$HISTORY_SERVER_NAME \ 227 | --container-image=${CONTAINER_IMAGE_URI} \ 228 | --subnet=$SUBNET \ 229 | --version $DATAPROC_RUNTIME_VERSION 230 | ``` 231 | 232 | ### 7.3. Run the model hyperparameter tuning notebook 233 | Pick the serverless Spark interactive kernel created in the previous step, attach it to the hyperparameter tuning notebook, and run the entire notebook. It takes ~30 minutes to complete. 234 | 235 | ![M3](../06-images/module-3-29.png) 236 |

237 | 238 | ### 7.4. Review the model persisted in GCS 239 | Notice that Spark MLlib creates a bestModel directory and persists the tuned model there. We will use the model in the bestModel directory for batch scoring. 240 | 241 | ![M3](../06-images/module-3-31.png) 242 |

243 | 244 | ### 7.5. Review the model metrics persisted in GCS 245 | Again, this is for the Vertex AI pipeline, which we will cover in the module after the next. 246 | 247 | ![M3](../06-images/module-3-30.png) 248 |

249 | 250 | ### 7.6. Review the model metrics persisted in BigQuery 251 | 252 | Run the below query in BigQuery. Be sure to add pipeline_id to the where clause if you are running the experiments multiple times. 253 | ``` 254 | SELECT * FROM `customer_churn_ds.model_metrics` 255 | WHERE operation='hyperparameter-tuning' 256 | ``` 257 | The following is the author's output- 258 | 259 | ![M3](../06-images/module-3-32.png) 260 |

261 | 262 | 263 | ### 7.7. Review the model test results in BigQuery 264 | 265 | Run the below queries in BigQuery. Be sure to add pipeline_id to the where clause if you are running the experiments multiple times. 266 | ``` 267 | SELECT churn, prediction, * 268 | FROM `customer_churn_ds.test_predictions` 269 | WHERE operation='hyperparameter-tuning' 270 | ``` 271 | 272 | ``` 273 | SELECT churn, prediction, count(*) as count 274 | FROM `customer_churn_ds.test_predictions` 275 | WHERE operation='hyperparameter-tuning' 276 | GROUP BY churn, prediction ORDER BY churn 277 | ``` 278 | 279 | The following is the author's output- 280 | 281 | ![M3](../06-images/module-3-33.png) 282 |

283 | 284 | ### 7.8. Validate availability of the model MLeap bundle in Cloud Storage 285 | 286 | The ID generated in the variables section for the author is 29657. You can locate artifacts by identifying your PIPELINE_ID. 287 | ``` 288 | echo $PIPELINE_ID 289 | ``` 290 | 291 | ![M4](../06-images/module-4-100.png) 292 |
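A minimal Python alternative to browsing the console, assuming the Cloud Storage client library is available (the UPPER_CASE placeholders must be replaced with your project ID, project number, and pipeline ID):
```
# Optional: list model artifacts for your pipeline ID from Python.
from google.cloud import storage

client = storage.Client(project="YOUR_PROJECT_ID")  # placeholder
bucket_name = "s8s_model_bucket-YOUR_PROJECT_NBR"   # placeholder, per module 1 naming
for blob in client.list_blobs(bucket_name):
    if "YOUR_PIPELINE_ID" in blob.name:             # e.g. 29657 in the author's run
        print(blob.name)
```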

293 | 294 | 295 | 296 | ### 7.9. Review the model asset tracker table in BigQuery 297 | This table has the latest model artifact specifics across storage systems.
298 | The following is the table definition- 299 | ![M4](../06-images/module-4-101.png) 300 |

301 | 302 | 303 | Run the below query in BigQuery to view assets specific to your execution- 304 | ``` 305 | SELECT * 306 | FROM `customer_churn_ds.model_asset_tracker` 307 | ``` 308 | 309 | 310 | Author's sample- 311 | ![M4](../06-images/module-4-102.png) 312 |

313 | 314 | 315 |
316 | 317 | ## 8. Step 4: Batch Scoring 318 | 319 | ### 8.1. The exercise 320 | In this sub-module, we will use the best model from the hyperparameter tuning exercise and complete batch scoring. The source is in GCS. We will transform, run predictions and persist results to BigQuery. 321 | 322 | ![M3](../06-images/module-3-34.png) 323 |

324 | 325 | ### 8.2. Run the batch scoring notebook 326 | Switch the serverless Spark interactive kernel to this notebook and run the entire notebook. It takes <5 minutes to complete. 327 | 328 | #### Note 329 | You need to get the model version from the hyperparameter tuning step and replace the modelVersion assignment (modelVersion = YOUR_MODEL_VERSION_HERE - 3rd code cell, line 5). You can do so by running this query in BigQuery- 330 | ``` 331 | SELECT DISTINCT pipeline_id 332 | FROM `customer_churn_ds.model_metrics` 333 | WHERE operation='hyperparameter-tuning' 334 | AND pipeline_execution_dt=(SELECT max(pipeline_execution_dt) FROM `customer_churn_ds.model_metrics` 335 | WHERE operation='hyperparameter-tuning') 336 | ``` 337 | 338 | ![M3](../06-images/module-3-35.png) 339 |
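The same lookup can be scripted from a notebook cell with the BigQuery Python client, using the query above verbatim (a sketch; it assumes the client library is installed and YOUR_PROJECT_ID is a placeholder):
```
# Optional: fetch the latest hyperparameter-tuning pipeline ID programmatically.
from google.cloud import bigquery

client = bigquery.Client(project="YOUR_PROJECT_ID")  # placeholder
sql = """
SELECT DISTINCT pipeline_id
FROM `customer_churn_ds.model_metrics`
WHERE operation='hyperparameter-tuning'
AND pipeline_execution_dt=(SELECT MAX(pipeline_execution_dt)
                           FROM `customer_churn_ds.model_metrics`
                           WHERE operation='hyperparameter-tuning')
"""
modelVersion = list(client.query(sql).result())[0].pipeline_id
print(modelVersion)
```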

340 | 341 | ![M3](../06-images/module-3-37.png) 342 |

343 | 344 | 345 | ### 8.3. Review the batch scoring results in BigQuery 346 | 347 | 348 | Run the below query in BigQuery. Be sure to add pipeline_id to the where clause if you are running the experiments multiple times. 349 | ``` 350 | SELECT * 351 | FROM `customer_churn_ds.batch_predictions` 352 | ``` 353 | 354 | The following is the author's output- 355 | 356 | ![M3](../06-images/module-3-36.png) 357 |
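To eyeball the prediction distribution without leaving Python, a small sketch follows (it assumes pandas and the BigQuery client library are available, and YOUR_PROJECT_ID is a placeholder):
```
# Optional: prediction distribution for the batch scoring run.
from google.cloud import bigquery

client = bigquery.Client(project="YOUR_PROJECT_ID")  # placeholder
df = client.query("""
    SELECT prediction, COUNT(*) AS count
    FROM `customer_churn_ds.batch_predictions`
    GROUP BY prediction
""").to_dataframe()
print(df)
```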

358 | 359 |
360 | 361 | ## 9. Lineage/Traceability 362 | 363 | The author has created a pipeline ID and model version for tracking and the same attributes are added to all datasets, directories in GCS and wherever else applicable for traceability. 364 | 365 | 366 |
367 | 368 | This concludes the lab module where you learned to author ML experiments on interactive Spark notebooks. Proceed to the [next module](../05-lab-guide/Module-04-Author-ML-PySpark-Scripts.md) where you will learn to execute equivalent Spark ML PySpark scripts via the command line, powered by Dataproc Serverless Spark batches. 369 | -------------------------------------------------------------------------------- /05-lab-guide/Module-05-Author-Vertex-AI-Pipeline.md: 1 | 2 | # About Module 5 3 | 4 | In this module, we will author and test a Vertex AI pipeline to orchestrate the Spark ML model training and prepare for operationalizing the same. The module takes about two and a half hours to complete, but only about 40 minutes of that is hands-on; the rest is execution time, waiting for runs to complete. 5 | 6 | ## 1. Where we are in the model development lifecycle 7 | 8 | ![M5](../06-images/module-5-01.png) 9 |

10 | 11 |
12 | 13 | ## 2. The lab environment 14 | 15 | ![M5](../06-images/module-5-02.png) 16 |

17 | 18 |
19 | 20 | ## 3. The exercise 21 | 22 | ![M5](../06-images/module-5-03.png) 23 |

24 | 25 |
26 | 27 | ## 4. About Vertex AI pipelines & support for Spark ML Models in Vertex AI platform 28 | We will use the Vertex AI User-Managed Notebook environment for this exercise, and this is already created for you. When you open JupyterLab, you will also see a pre-created, customized notebook to get you quickly started with pipeline authoring. 29 | 30 | ### 4.1. Vertex AI pipelines - basics 31 | 32 | Vertex AI Pipelines helps you automate, monitor, and govern your ML systems by orchestrating your ML workflow in a serverless manner, and storing your workflow's artifacts using Vertex ML Metadata. By storing the artifacts of your ML workflow in Vertex ML Metadata, you can analyze the lineage of your workflow's artifacts - for example, an ML model's lineage may include the training data, hyperparameters, and code that were used to create the model. 33 | 34 | Vertex AI Pipelines lets you automate, monitor, and experiment with interdependent parts of an ML workflow. Vertex AI ML Pipelines are portable, scalable, and based on containers. Each individual part of your pipeline workflow (for example, creating a dataset or training a model) is defined by code. This code is referred to as a component. Each instance of a component is called a step. 35 | 36 | Watch this [short video on Vertex AI pipelines](https://youtu.be/Jrh-QLrVCvM) and [read the documentation](https://cloud.google.com/vertex-ai/docs/pipelines/introduction). 37 | 38 | ### 4.2. What is supported/recommended for Spark ML models in Vertex AI from an MLOps perspective? 39 | 40 | | # | Feature/Function | Supported? | Recommended Product/Service | Workaround | Nuances/Comments | 41 | | -- | :--- | :--- |:--- |:--- |:--- | 42 | | 1 | Development Environment for Model Training
and
corresponding Spark Infrastructure| Yes | Vertex AI Workbench Managed Notebook
with
Dataproc Serverless Spark Interactive Sessions | | Preview as of 8/22| 43 | | 2 | Orchestration solution for Model Training | Yes | Vertex AI Managed Pipelines | | Preview as of 8/22| 44 | | 3 | Development Environment for Model Training Orchestration | Yes | Vertex AI Workbench User-Managed Notebooks | | | 45 | | 4 | Spark Infrastructure for Model Training Orchestration | Yes | Dataproc Serverless Spark Batches | | | 46 | | 5 | Scheduling solution for Model Training Pipeline | Yes | Cloud Scheduler
calling
Cloud Function
calling
Vertex AI pipeline REST API | | | 47 | | 6 | Model Registry | No | | Persist to GCS | | 48 | | 7 | Training Dataset | Yes | Vertex AI managed datasets | | No Spark reader support, therefore omitted | 49 | | 8 | Feature Store | Yes | Vertex AI feature store | | | 50 | | 9 | Model Metadata | Yes | Vertex AI metadata | | | 51 | | 10 | Solution for Batch Scoring | Yes | Dataproc Serverless Spark Batches | | Option 1: Small-scale - Vertex AI predictions service can be used - requires MLEAP
Option 2: Large-scale - Use Dataproc Serverless Spark | 52 | | 11 | Solution for (near) Real Time/Stream Scoring | Yes | Dataproc Spark on GCE | | | 53 | | 12 | Solution for Online/On-demand Scoring | Yes | Vertex AI online serving | | Requires MLEAP | 54 | | 13 | Explainability | Yes | with
Vertex AI online/batch serving | | Requires MLEAP | 55 | | 14 | Model Monitoring | Yes | with
Vertex AI online/batch serving | | Requires MLEAP | 56 | 57 |
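To make the component/step vocabulary from section 4.1 concrete, here is a minimal, self-contained sketch using the Kubeflow Pipelines (KFP) v2 SDK, which is what Vertex AI Pipelines executes. It assumes the `kfp` package is installed; the component and pipeline names are illustrative only, not part of this lab's code.

```python
from kfp import compiler, dsl

# A component: one self-contained, containerized piece of the workflow.
@dsl.component
def add(a: int, b: int) -> int:
    return a + b

# A pipeline: components wired together; each component invocation is a step.
@dsl.pipeline(name="toy-pipeline")
def toy_pipeline(a: int = 1, b: int = 2):
    first_step = add(a=a, b=b)
    add(a=first_step.output, b=b)  # second step, depends on the first

# Compile to the JSON spec that Vertex AI Pipelines can run.
compiler.Compiler().compile(toy_pipeline, "toy_pipeline.json")
```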
58 | 59 | ## 5. Authoring and operationalizing Vertex AI pipelines 60 | 61 | ### 5.1. Authoring a pipeline in a notebook - what's involved 62 | 63 | ![M5](../06-images/module-5-05.png) 64 |

65 | 66 | ### 5.2. Taking a pipeline developed in a notebook to production - steps involved 67 | 68 | ![M5](../06-images/module-5-04.png) 69 |

70 | 71 |
72 | 73 | ## 6. Review and execute the pre-authored Vertex AI pipeline from the Jupyter notebook 74 | 75 | ### 6.1. Open the pipeline notebook 76 | 77 | ![M5](../06-images/module-5-06.png) 78 |

79 | 80 | ![M5](../06-images/module-5-07.png) 81 |

82 | 83 | ### 6.2. Install dependencies 84 | 85 | Uncomment the cell that installs dependencies and restarts the kernel, run it once, and then comment it back out. 86 | 87 | ![M5](../06-images/module-5-08.png) 88 |

89 | 90 | ### 6.3. Study the notebook 91 | Read through the notebook before you run it. 92 | 93 | - Note how it calls all the PySpark batch jobs we tested in module 3 94 | - Note how it uses your network (instead of the Vertex AI tenant network) 95 | - Note how it disables caching both at the task/component/step level and at the pipeline job level (a job-level sketch follows section 6.4 below) 96 | - Note how it uses a custom pipeline name 97 | 98 | ### 6.4. Run the pipeline notebook in its entirety 99 | 100 | Section 7 of the notebook outputs a URL; click on it. 101 | 102 | ![M5](../06-images/module-5-09.png) 103 |
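For reference, disabling caching at the pipeline job level and naming the run typically look like the following with the Vertex AI SDK. This is a hedged sketch, not the notebook's exact code; the project, region, bucket, and service account values are placeholders.

```python
from google.cloud import aiplatform

aiplatform.init(project="YOUR_PROJECT_ID", location="YOUR_REGION")

job = aiplatform.PipelineJob(
    display_name="customer-churn-training",                  # custom pipeline name
    template_path="customer_churn_training_pipeline.json",   # compiled pipeline spec
    pipeline_root="gs://YOUR_PIPELINE_BUCKET/pipeline_root",
    enable_caching=False,                                    # job-level cache off
)
# run() blocks until the pipeline completes; submit() returns immediately.
job.run(service_account="YOUR_UMSA@YOUR_PROJECT_ID.iam.gserviceaccount.com")
```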

104 | 105 | ### 6.5. Study and monitor the pipeline execution between the Vertex AI pipeline UI and the Dataproc Batches UI 106 | 107 | The pipeline takes about an hour to complete. Toggle between the Vertex AI pipeline UI and the Dataproc Batches UI to monitor the run through completion. 108 | 109 | ![M5](../06-images/module-5-10.png) 110 |

111 | 112 |
113 | 114 | ## 7. Preprocessing 115 | 116 | ![M5](../06-images/module-5-11.png) 117 |

118 | 119 | ![M5](../06-images/module-5-12.png) 120 |

121 | 122 |
123 | 124 | ## 8. Managed Dataset registration in Vertex AI 125 | 126 | ![M5](../06-images/module-5-13.png) 127 |

128 | 129 | ![M5](../06-images/module-5-14.png) 130 |

131 | 132 |
133 | 134 | ## 9. Model training 135 | 136 | ![M5](../06-images/module-5-15.png) 137 |

138 | 139 | ![M5](../06-images/module-5-16.png) 140 |

141 | 142 |
143 | 144 | ## 10. Model evaluation 145 | 146 | ![M5](../06-images/module-5-17.png) 147 |

148 | 149 | ### 10.1. Metrics 150 | 151 | Click on the metrics artifact icon on the canvas and review the metrics on the right side of the canvas. 152 | 153 | ![M5](../06-images/module-5-18.png) 154 |

155 | 156 | ### 10.2. Plots 157 | 158 | Click on the plots artifact icon on the canvas and review the Confusion Matrix on the right side of the canvas. 159 | 160 | ![M5](../06-images/module-5-19.png) 161 |

162 | 163 | Scroll to see the RoC curve below the Confusion Matrix 164 | 165 | ![M5](../06-images/module-5-20.png) 166 |

167 | 168 |
169 | 170 | ## 11. Conditional hyperparameter tuning 171 | 172 | Review the notebook code for the AUPR threshold that is set, and observe the conditional execution of the hyperparameter tuning step (a minimal sketch of the pattern follows the screenshot below). 173 | 174 | ![M5](../06-images/module-5-21.png) 175 |
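The conditional execution in the notebook is built on the KFP condition construct. The following is a hedged, minimal sketch of the idea with illustrative component names; it is not the notebook's exact code, and it assumes the `kfp` v2 SDK with a training component that returns the achieved AUPR as a float.

```python
from kfp import dsl

@dsl.component
def train() -> float:
    # Stand-in for model training; returns the achieved AUPR.
    return 0.82

@dsl.component
def tune():
    # Stand-in for the hyperparameter tuning step.
    print("Tuning hyperparameters...")

@dsl.pipeline(name="conditional-tuning")
def conditional_tuning(aupr_threshold: float = 0.85):
    train_task = train()
    # Run the tuning step only when the trained model misses the AUPR bar.
    with dsl.Condition(train_task.output < aupr_threshold):
        tune()
```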

176 | 177 | ![M5](../06-images/module-5-22.png) 178 |

179 | 180 | Observe that the best model is persisted in GCS. We will use this for batch scoring. 181 | 182 | ![M5](../06-images/module-5-23.png) 183 |

184 | 185 |
186 | 187 | ## 12. Pipeline completion 188 | 189 | ![M5](../06-images/module-5-26.png) 190 |

191 | 192 | ![M5](../06-images/module-5-27.png) 193 |

194 | 195 |
196 | 197 | ## 13. Study the pipeline JSON 198 | 199 | ![M5](../06-images/module-5-24.png) 200 |

201 | 202 | ![M5](../06-images/module-5-25.png) 203 |

204 | 205 | Open the JSON and review all the nodes and elements in it. The runtime parameters at the very end of the JSON are the most critical. 206 | 207 |
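A quick way to review those runtime parameters without scrolling the whole file is to load the JSON locally. This is a minimal sketch; the exact key layout varies by SDK/compiler version, so treat `runtimeConfig` as an assumption and explore whatever keys your file actually contains.

```python
import json

with open("customer_churn_vai_pipeline_template.json") as f:
    spec = json.load(f)

# Top-level structure of the compiled pipeline spec.
print(list(spec.keys()))

# The runtime parameters typically sit in a runtime-config section at the end.
print(json.dumps(spec.get("runtimeConfig", {}), indent=2))
```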
208 | 209 | ## 14. Test the JSON via the Vertex AI pipeline UI 210 | 211 | - All three PySpark jobs expect a pipeline ID parameter 212 | - The pipeline ID is intended to be the same across jobs for lineage/traceability/versioning 213 | - Notice that the pipeline JSON has the pipeline ID hardcoded from the execution via the notebook 214 | - Download the JSON to your machine 215 | - Let's upload it to the UI 216 | - We will have to modify every parameter that carries the existing pipeline ID; let's replace it with 123456 as the pipeline ID and run it (a scripted alternative follows the first screenshot below) 217 | - Submit the pipeline via the UI only after the notebook submission completes - you may not have enough compute cores for parallel runs, although the runs themselves would not clash 218 | 219 | 220 | ![M5](../06-images/module-5-26.png) 221 |
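If you would rather script this test than click through the upload form, a run with an overridden pipeline ID can be submitted with the Vertex AI SDK as sketched below. The parameter name `pipeline_id` is an assumption for illustration; use whichever parameter names your downloaded JSON actually declares.

```python
from google.cloud import aiplatform

aiplatform.init(project="YOUR_PROJECT_ID", location="YOUR_REGION")

job = aiplatform.PipelineJob(
    display_name="customer-churn-training-ui-test",
    template_path="customer_churn_vai_pipeline_template.json",  # the downloaded JSON
    parameter_values={"pipeline_id": "123456"},  # same override as in the UI walkthrough
    enable_caching=False,
)
job.submit()  # returns immediately; monitor the run in the Vertex AI pipelines UI
```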

222 | 223 | ![M5](../06-images/module-5-27.png) 224 |

225 | 226 | ![M5](../06-images/module-5-28.png) 227 |

228 | 229 | ![M5](../06-images/module-5-29.png) 230 |

231 | 232 | ![M5](../06-images/module-5-30.png) 233 |

234 | 235 | ![M5](../06-images/module-5-31.png) 236 |

237 | 238 | ![M5](../06-images/module-5-32a.png) 239 |

240 | 241 | ![M5](../06-images/module-5-32b.png) 242 |

243 | 244 | ![M5](../06-images/module-5-33.png) 245 |

246 | 247 | ![M5](../06-images/module-5-34.png) 248 |

249 | 250 |
251 | 252 | ## 15. Edit the JSON for on-demand REST calls & persist in GCS 253 | 254 | - This step has been completed for you. 255 | - The author has essentially replaced the hard-coded pipeline ID values from the authoring & execution via the notebook with a keyword that the Cloud Function replaces with a runtime-generated pipeline ID 256 | - The updated JSON for use with the Cloud Function is in your pipeline bucket inside the templates directory 257 | - Note that your version of the pipeline JSON template is completely customized with your project details - ID, number, and your service account 258 | 259 | ![M5](../06-images/module-5-35.png) 260 |

261 | 262 | ![M5](../06-images/module-5-36.png) 263 |

264 | 265 | 266 |
267 | 268 | This concludes the module. In the [next module](../05-lab-guide/Module-06-Author-CloudFunction-For-Vertex-AI-Pipeline.md) you will create a Cloud Function to execute the Vertex AI Spark ML pipeline. 269 | 270 |
271 | -------------------------------------------------------------------------------- /05-lab-guide/Module-06-Author-CloudFunction-For-Vertex-AI-Pipeline.md: -------------------------------------------------------------------------------- 1 | 2 | # About Module 6 3 | 4 | In this module, we will create a Cloud Function that executes a Vertex AI pipeline on demand, based on a pipeline JSON in GCS. This module takes 15 minutes to review, and almost an hour to run. 5 | 6 | ## 1. Where we are in the model training lifecycle 7 | 8 | ![M6](../06-images/module-6-01.png) 9 | 10 | 11 |
12 | 13 | ## 2. The lab environment 14 | 15 | ![M6](../06-images/module-6-02.png) 16 | 17 | 18 |
19 | 20 | ## 3. The exercise 21 | 22 | ![M6](../06-images/module-6-03.png) 23 | 24 |
25 | 26 | ## 4. Dependencies 27 | 28 | 1. Successful testing of pipeline template JSON 29 | 2. Customized Vertex AI Spark ML model training template JSON in GCS 30 | 31 | We completed #1 in the prior module. #2 is already available for you in GCS. 32 | 33 | ![M6](../06-images/module-6-04.png) 34 | 35 |
36 | 37 | ## 5. Documentation for scheduling Vertex AI pipelines 38 | 39 | Read the documentation for scheduling Vertex AI pipelines ahead of working on the next step to better understand on-demand execution through a simpler example than the one in the lab.
40 | https://cloud.google.com/vertex-ai/docs/pipelines/schedule-cloud-scheduler 41 | 42 |
43 | 44 | ## 6. Cloud Function deployment pictorial overview 45 | 46 | The Cloud Function is already deployed in your environment. The following is the author's deployment from the Terraform script. Yours should be identical. 47 | 48 | ![M6](../06-images/module-6-05.png) 49 |

50 | 51 | ![M6](../06-images/module-6-06.png) 52 |

53 | 54 | ![M6](../06-images/module-6-07.png) 55 |

56 | 57 | ![M6](../06-images/module-6-08.png) 58 |

59 | 60 | ![M6](../06-images/module-6-09.png) 61 |

62 | 63 | ![M6](../06-images/module-6-10.png) 64 |

65 | 66 | ![M6](../06-images/module-6-11.png) 67 |

68 | 69 | 70 | ## 7. Review of the Cloud Function code for executing the Vertex AI Spark ML Model Training Pipeline 71 | 72 | ### 7.1. What is happening inside the function? 73 | 74 | ![M6](../06-images/module-6-13.png) 75 |

76 | 77 | ### 7.2. Runtime variables 78 | 79 | ![M6](../06-images/module-6-12.png) 80 |

81 | 82 | ![M6](../06-images/module-6-14.png) 83 |

84 | 85 | ### 7.3. Source code - requirements.txt 86 | 87 | The latest requirements.txt is available here-
88 | https://github.com/anagha-google/s8s-spark-mlops-lab/blob/main/02-scripts/cloud-functions/requirements.txt 89 | 90 | ### 7.4. Source code - main.py 91 | The latest source code is available here-
92 | https://github.com/anagha-google/s8s-spark-mlops-lab/blob/main/02-scripts/cloud-functions/main.py 93 |
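Before running it, it may help to see the overall pattern in one place. The following is a condensed, hedged sketch of what a function like this typically does (generate a pipeline ID, resolve the GCS template, submit the pipeline). The authoritative code is main.py at the link above; the bucket, placeholder keyword, project, and region names below are illustrative, not the lab's actual values.

```python
import uuid

import functions_framework
from google.cloud import aiplatform, storage

@functions_framework.http
def execute_pipeline(request):
    # 1) Generate a runtime pipeline ID for lineage/traceability.
    pipeline_id = uuid.uuid4().hex[:10]

    # 2) Pull the template from GCS and swap the placeholder keyword
    #    (illustrative name) for the generated pipeline ID.
    blob = (
        storage.Client()
        .bucket("YOUR_PIPELINE_BUCKET")
        .blob("templates/customer_churn_vai_pipeline_template.json")
    )
    spec = blob.download_as_text().replace("PIPELINE_ID_PLACEHOLDER", pipeline_id)
    run_path = f"/tmp/pipeline_{pipeline_id}.json"
    with open(run_path, "w") as f:
        f.write(spec)

    # 3) Submit the resolved spec as a Vertex AI pipeline run.
    aiplatform.init(project="YOUR_PROJECT_ID", location="YOUR_REGION")
    aiplatform.PipelineJob(
        display_name=f"customer-churn-training-{pipeline_id}",
        template_path=run_path,
        enable_caching=False,
    ).submit()
    return f"Submitted pipeline run {pipeline_id}", 200
```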
94 | 95 | ## 8. Execute the Cloud Function and monitor the pipeline execution through completion 96 | The Cloud Function is generation 2, which does not yet have a "click to test" button in the UI. We need to grab the command-line invocation from the UI and run it in Cloud Shell. 97 | 98 | ### 8.1. Grab the command for executing the Cloud Function 99 | ![M6](../06-images/module-6-15.png) 100 |

101 | 102 | ### 8.2. Run the command in Cloud Shell 103 | ![M6](../06-images/module-6-16.png) 104 |
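If you would rather invoke the gen-2 function from Python than paste the gcloud command, the sketch below fetches an OIDC ID token for the function's URL and POSTs to it. The URL is illustrative; copy yours from the Cloud Functions UI. Note that `fetch_id_token` expects service-account credentials to be available in the environment.

```python
import requests
import google.auth.transport.requests
import google.oauth2.id_token

# Illustrative URL; use your function's actual trigger URL.
url = "https://YOUR_REGION-YOUR_PROJECT_ID.cloudfunctions.net/YOUR_FUNCTION_NAME"

# Mint an ID token whose audience is the function URL.
auth_request = google.auth.transport.requests.Request()
token = google.oauth2.id_token.fetch_id_token(auth_request, url)

response = requests.post(url, headers={"Authorization": f"Bearer {token}"})
print(response.status_code, response.text)
```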

105 | 106 | ### 8.3. Monitor the Vertex AI pipeline for any errors 107 | ![M6](../06-images/module-6-17.png) 108 |

109 | 110 | ### 8.4. Monitor the Dataproc Batches UI for new jobs 111 | Follow at least a couple of steps through completion. 112 | 113 | ![M6](../06-images/module-6-18.png) 114 |

115 | 116 | ![M6](../06-images/module-6-19.png) 117 |

118 | 119 | ### 8.5. Monitor the overall completion of the pipeline execution 120 | ![M6](../06-images/module-6-20.png) 121 |

122 | 123 |
124 | 125 | This concludes the module. In the [next module](../05-lab-guide/Module-07-Schedule-VertexAI-Pipeline.md), we will create a Cloud Scheduler job for time based execution of the model training pipeline. 126 | 127 |
128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /05-lab-guide/Module-07-Schedule-VertexAI-Pipeline.md: -------------------------------------------------------------------------------- 1 | 2 | # About Module 7 3 | 4 | This module covers creating a Cloud Scheduler job to trigger the Vertex AI Spark ML model training pipeline via the Cloud Function we created in the prior module. Reviewing the module content takes about 15 minutes, but the pipeline execution could take an hour. 5 | 6 |
7 | 8 | ## 1. Where we are in the Spark ML model lifecycle 9 | 10 | ![M8](../06-images/module-7-01.png) 11 |

12 | 13 |
14 | 15 | ## 2. The lab environment 16 | 17 | ![M8](../06-images/module-7-02.png) 18 |

19 | 20 |
21 | 22 | ## 3. The exercise 23 | 24 | ![M8](../06-images/module-7-03.png) 25 |

26 | 27 |
28 | 29 | ## 4. Review the Cloud Scheduler job configuration 30 | 31 | A Cloud Scheduler job has been pre-created for you; it calls the Cloud Function, which in turn calls the Vertex AI Spark ML pipeline we created in module 5. Let's walk through the setup in the author's environment. 32 | 33 | ![CS](../06-images/module-1-cloud-scheduler-01.png) 34 |

35 | 36 | ![CS](../06-images/module-1-cloud-scheduler-02.png) 37 |

38 | 39 | ![CS](../06-images/module-1-cloud-scheduler-03.png) 40 |

41 | 42 | ![CS](../06-images/module-1-cloud-scheduler-04.png) 43 |

44 | 45 | ![CS](../06-images/module-1-cloud-scheduler-05.png) 46 |
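For completeness, this is roughly how a job like the author's could be defined with the Cloud Scheduler client library. It is a hedged sketch: the lab's actual job is provisioned by Terraform, and the cron expression, function URL, and service account below are illustrative.

```python
from google.cloud import scheduler_v1

client = scheduler_v1.CloudSchedulerClient()
parent = client.common_location_path("YOUR_PROJECT_ID", "YOUR_REGION")

job = scheduler_v1.Job(
    name=f"{parent}/jobs/customer-churn-training-weekly",
    schedule="0 2 * * 0",  # 02:00 every Sunday (illustrative cron)
    time_zone="Etc/UTC",
    http_target=scheduler_v1.HttpTarget(
        uri="https://YOUR_REGION-YOUR_PROJECT_ID.cloudfunctions.net/YOUR_FUNCTION_NAME",
        http_method=scheduler_v1.HttpMethod.POST,
        # OIDC token so the call is authenticated against the Cloud Function.
        oidc_token=scheduler_v1.OidcToken(
            service_account_email="YOUR_UMSA@YOUR_PROJECT_ID.iam.gserviceaccount.com",
        ),
    ),
)
client.create_job(parent=parent, job=job)
```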

47 | 48 |
49 | 50 | ## 5. Run the Cloud Scheduler job manually to test it 51 | 52 | ![M8](../06-images/module-7-04.png) 53 |

54 | 55 |
56 | 57 | ## 6. Monitor the pipeline execution through completion 58 | ~ 1 hour 59 | 60 | ![M8](../06-images/module-7-05.png) 61 |

62 | 63 | ![M8](../06-images/module-7-06.png) 64 |

65 | 66 | ![M8](../06-images/module-7-07.png) 67 |

68 | 69 |
70 | 71 | This concludes the lab module. Proceed to the [next module](../05-lab-guide/Module-08-Orchestrate-Batch-Scoring.md) where we will operationalize batch scoring on Cloud Composer. 72 | 73 |
74 | 75 | -------------------------------------------------------------------------------- /05-lab-guide/Module-08-Orchestrate-Batch-Scoring.md: -------------------------------------------------------------------------------- 1 | 2 | # About Module 8 3 | 4 | This module covers orchestrating Spark ML batch scoring with Apache Airflow on Cloud Composer. Vertex AI Pipelines has deliberately not been used: it lacks support for model monitoring and explainability, and it is not suited for the upstream job orchestration that is typical with batch scoring and that may not be ML-related. 5 | 6 | ## 1. Where we are in the Spark ML model lifecycle 7 | 8 | ![M8](../06-images/module-8-01.png) 9 |

10 | 11 | ## 2. The lab environment 12 | 13 | ![M8](../06-images/module-8-02.png) 14 |

15 | 16 | ## 3. The exercise 17 | 18 | ![M8](../06-images/module-8-03.png) 19 |

20 | 21 | ## 4. Review of the Cloud Composer Environment setup 22 | 23 | Module 1 created and configured your Cloud Composer environment, including importing the Airflow variables and uploading the pre-created Apache Airflow DAG into the Cloud Composer DAG bucket. In this section, we will walk through the author's environment. 24 | 25 | ### 4a. Cloud Composer environment 26 | 27 | ![CC2](../06-images/module-1-composer-01.png) 28 |

29 | 30 | ![CC2](../06-images/module-1-composer-02.png) 31 |

32 | 33 | ### 4b. Cloud Composer - Airflow variables 34 | 35 | ![CC2](../06-images/module-1-composer-03.png) 36 |

37 | 38 | ![CC2](../06-images/module-1-composer-04.png) 39 |

40 | 41 | ### 4c. Cloud Composer - Airflow DAG 42 | 43 | ![CC2](../06-images/module-1-composer-07.png) 44 |

45 | 46 | ![CC2](../06-images/module-1-composer-08.png) 47 |

48 | 49 | ### 4d. Cloud Composer - Airflow GUI 50 | 51 | ![CC2](../06-images/module-1-composer-05.png) 52 |

53 | 54 | ![CC2](../06-images/module-1-composer-06.png) 55 |

56 | 57 | ## 5. Documentation on orchestrating Dataproc Serverless Spark batches with Apache Airflow 58 | 59 | Read the documentation at the link below for a clear understanding of a simple example before you delve into the next section.
60 | https://cloud.google.com/composer/docs/composer-2/run-dataproc-workloads 61 | 62 | 63 | ## 6. Review of the Apache Airflow DAG for batch scoring 64 | 65 | The latest version of the source code is here-
66 | https://github.com/anagha-google/s8s-spark-mlops-lab/blob/main/02-scripts/airflow/pipeline.py
67 | 68 | Review the code and understand the flow. 69 | 70 | ``` 71 | # ====================================================================================== 72 | # ABOUT 73 | # This script orchestrates batch scoring 74 | # ====================================================================================== 75 | 76 | import os 77 | from airflow.models import Variable 78 | from datetime import datetime 79 | from airflow import models 80 | from airflow.providers.google.cloud.operators.dataproc import (DataprocCreateBatchOperator,DataprocGetBatchOperator) 81 | from datetime import datetime 82 | from airflow.utils.dates import days_ago 83 | import string 84 | import random 85 | 86 | # ....................................................... 87 | # Variables 88 | # ....................................................... 89 | 90 | # {{ 91 | # a) General 92 | randomizerCharLength = 10 93 | randomVal = ''.join(random.choices(string.digits, k = randomizerCharLength)) 94 | airflowDAGName= "customer-churn-prediction" 95 | batchIDPrefix = f"{airflowDAGName}-edo-{randomVal}" 96 | # + 97 | # b) Capture from Airflow variables 98 | region = models.Variable.get("region") 99 | subnet=models.Variable.get("subnet") 100 | phsServer=Variable.get("phs_server") 101 | containerImageUri=Variable.get("container_image_uri") 102 | bqDataset=Variable.get("bq_dataset") 103 | umsaFQN=Variable.get("umsa_fqn") 104 | bqConnectorJarUri=Variable.get("bq_connector_jar_uri") 105 | # + 106 | # c) For the Spark application 107 | pipelineID = randomVal 108 | projectID = models.Variable.get("project_id") 109 | projectNbr = models.Variable.get("project_nbr") 110 | modelVersion=Variable.get("model_version") 111 | displayPrintStatements=Variable.get("display_print_statements") 112 | # + 113 | # d) Arguments array 114 | batchScoringArguments = [f"--pipelineID={pipelineID}", \ 115 | f"--projectID={projectID}", \ 116 | f"--projectNbr={projectNbr}", \ 117 | f"--modelVersion={modelVersion}", \ 118 | f"--displayPrintStatements={displayPrintStatements}" ] 119 | # + 120 | # e) PySpark script to execute 121 | scoringScript= "gs://s8s_code_bucket-"+projectNbr+"/pyspark/batch_scoring.py" 122 | commonUtilsScript= "gs://s8s_code_bucket-"+projectNbr+"/pyspark/common_utils.py" 123 | # }} 124 | 125 | # ....................................................... 126 | # s8s Spark batch config 127 | # ....................................................... 128 | 129 | s8sSparkBatchConfig = { 130 | "pyspark_batch": { 131 | "main_python_file_uri": scoringScript, 132 | "python_file_uris": [ commonUtilsScript ], 133 | "args": batchScoringArguments, 134 | "jar_file_uris": [ bqConnectorJarUri ] 135 | }, 136 | "runtime_config": { 137 | "container_image": containerImageUri 138 | }, 139 | "environment_config":{ 140 | "execution_config":{ 141 | "service_account": umsaFQN, 142 | "subnetwork_uri": subnet 143 | }, 144 | "peripherals_config": { 145 | "spark_history_server_config": { 146 | "dataproc_cluster": f"projects/{projectID}/regions/{region}/clusters/{phsServer}" 147 | } 148 | } 149 | } 150 | } 151 | 152 | 153 | # ....................................................... 154 | # DAG 155 | # ....................................................... 
156 | 157 | with models.DAG( 158 |     airflowDAGName, 159 |     schedule_interval=None, 160 |     start_date = days_ago(2), 161 |     catchup=False, 162 | ) as scoringDAG: 163 |     customerChurnPredictionStep = DataprocCreateBatchOperator( 164 |         task_id="Predict-Customer-Churn", 165 |         project_id=projectID, 166 |         region=region, 167 |         batch=s8sSparkBatchConfig, 168 |         batch_id=batchIDPrefix 169 |     ) 170 |     customerChurnPredictionStep 171 | ``` 172 | 173 | **Note that the DAG expects the runtime Airflow variable `model_version` ("Model Version") to be set.** 174 | 175 |
176 | 177 | ## 7. Test the Apache Airflow DAG on Cloud Composer for batch scoring 178 | 179 | ### 7.1. Identify the model version you want to use 180 | 181 | ![M8](../06-images/module-8-04.png) 182 |

183 | 184 | ### 7.2. Edit the Airflow variable for Model Version 185 | 186 | The update takes about 10 minutes to take effect. 187 | 188 | ![M8](../06-images/module-8-05.png) 189 |

190 | 191 | ![M8](../06-images/module-1-composer-04.png) 192 |

193 | 194 | ![M8](../06-images/module-8-06.png) 195 |

196 | 197 | ![M8](../06-images/module-8-07.png) 198 |

199 | 200 | ![M8](../06-images/module-8-08.png) 201 |

202 | 203 | ### 7.3. Open the Customer Churn DAG in the Airflow UI & trigger its execution manually 204 | 205 | ![M8](../06-images/module-8-09.png) 206 |

207 | 208 | ### 7.4. Monitor the DAG execution 209 | 210 | ![M8](../06-images/module-8-10.png) 211 |

212 | 213 | ![M8](../06-images/module-8-11.png) 214 |

215 | 216 | ![M8](../06-images/module-8-12.png) 217 |

218 | 219 | ![M8](../06-images/module-8-13.png) 220 |

221 | 222 | ![M8](../06-images/module-8-14.png) 223 |

224 | 225 | ![M8](../06-images/module-8-15.png) 226 |

227 | 228 | ![M8](../06-images/module-8-16.png) 229 |

230 | 231 | ![M8](../06-images/module-8-17.png) 232 |

233 | 234 | 235 | 236 | ### 7.5. Review the results in BigQuery 237 | Find your pipeline_id in the Dataproc Batches UI and edit the query below to reflect it. 238 | ``` 239 | SELECT * FROM `customer_churn_ds.batch_predictions` 240 | WHERE pipeline_id='YOUR_PIPELINE_ID' 241 | ``` 242 | 243 | ![M8](../06-images/module-8-18.png) 244 |

245 | 246 | The following are the author's results- 247 | 248 | ![M8](../06-images/module-8-19.png) 249 |

250 | 251 |
252 | 253 | This concludes the lab. Be sure to shut down the project to avoid incurring billing charges. Return to [lab home](../README.md). 254 | 255 |
256 | 257 | -------------------------------------------------------------------------------- /05-lab-guide/Services-Created.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | The following are the products/services that get provisioned, and the code and configuration that gets uploaded into your environment. The screenshots present a view of the author's environment; yours should be identical. 4 | 5 | [1. IAM](Services-Created.md#1-iam)
6 | [2. Networking](Services-Created.md#2-networking)
7 | [3. Cloud Storage](Services-Created.md#3-cloud-storage)
8 | [4. BigQuery](Services-Created.md#4-bigquery)
9 | [5. Persistent Spark History Server](Services-Created.md#5-persistent-spark-history-server)
10 | [6. Vertex AI Workbench - Managed Notebook Server ](Services-Created.md#6a-vertex-ai-workbench---managed-notebook-server) | [Jupyter Notebook](Services-Created.md#6b-vertex-ai-workbench---managed-notebook-server---jupyter-notebook)
11 | [7. Vertex AI Workbench - User Managed Notebook Server ](Services-Created.md#7a-vertex-ai-workbench---user-managed-notebook-server) | [Jupyter Notebooks](Services-Created.md#7b-vertex-ai-workbench---user-managed-notebook-server---jupyter-notebooks)
12 | [8. Google Container Registry](Services-Created.md#8a-google-container-registry) | [Container image](Services-Created.md#8b-google-container-registry---container-image)
13 | [9. Cloud Composer](Services-Created.md#9a-cloud-composer) | [Airflow variables](Services-Created.md#9b-cloud-composer---airflow-variables) | [Airflow DAG](Services-Created.md#9c-cloud-composer---airflow-dag) | [Airflow UI](Services-Created.md#9d-cloud-composer---airflow-gui)
14 | [10. Google Cloud Function](Services-Created.md#10-google-cloud-function)
15 | [11. Cloud Scheduler](Services-Created.md#11-cloud-scheduler)
16 | [12. Customized Vertex AI pipeline JSON in GCS](Services-Created.md#12-customized-vertex-ai-pipeline-json-in-gcs)
17 | 18 | ## 1. IAM 19 | A User Managed Service Account (UMSA) is created and granted the requisite permissions, and the lab attendee is granted permission to impersonate the UMSA. A few other permissions are granted to the default Google-managed service accounts of some services as required. The Terraform main.tf is a good read to understand the permissions. 20 | 21 | ![IAM](../06-images/module-1-iam-01.png) 22 |

23 | 24 | ![IAM](../06-images/module-1-iam-02.png) 25 |

26 | 27 | ![IAM](../06-images/module-1-iam-03.png) 28 |

29 | 30 | ![IAM](../06-images/module-1-iam-04.png) 31 |

32 | 33 | 34 | ## 2. Networking 35 | The following networking components are created as part of the Terraform deployment- 36 | 37 | ### 2.1. VPC 38 | 39 | ![VPC](../06-images/module-1-networking-01.png) 40 |

41 | 42 | ![VPC](../06-images/module-1-networking-02.png) 43 |

44 | 45 | ### 2.2. Subnet with Private Google Access 46 | 47 | ![VPC](../06-images/module-1-networking-03.png) 48 |

49 | 50 | ### 2.3. Firewall rule for Dataproc Serverless Spark 51 | 52 | ![VPC](../06-images/module-1-networking-05.png) 53 |

54 | 55 | 56 | ### 2.4. Reserved IP range for VPC peering with the Vertex AI tenant network (for the Vertex AI Workbench managed notebook instance with a BYO network) 57 | 58 | ![VPC](../06-images/module-1-networking-04.png) 59 |

60 | 61 | 62 | ### 2.5. VPC peering with the Vertex AI tenant network (for the Vertex AI Workbench managed notebook instance with a BYO network) 63 | 64 | ![VPC](../06-images/module-1-networking-06.png) 65 |

66 | 67 | ![VPC](../06-images/module-1-networking-07.png) 68 |

69 | 70 | ![VPC](../06-images/module-1-networking-08.png) 71 |

72 | 73 | ## 3. Cloud Storage 74 | 75 | ### 3.1. Buckets created 76 | A number of buckets are created by the Terraform deployment, and some buckets are created by the GCP products themselves. The following is a listing of the buckets created as part of the deployment with Terraform. 77 | 78 | ![GCS](../06-images/module-1-storage-01.png) 79 |

80 | 81 | ![GCS](../06-images/module-1-storage-02.png) 82 |

83 | 84 | ### 3.2. The Data Bucket 85 | The following is the author's data bucket content- 86 | ``` 87 | customer_churn_score_data.csv 88 | customer_churn_train_data.csv 89 | ``` 90 | 91 | ### 3.3. The Code Bucket 92 | 93 | The following is the author's code bucket content- 94 | ``` 95 | # Cloud Composer - Airflow DAG 96 | airflow/pipeline.py 97 | 98 | # Shell script for building the custom container image for Serverless Spark 99 | bash/build-container-image.sh 100 | 101 | # Post-startup shell scripts to upload Jupyter notebooks in GCS to the Vertex AI Workbench notebook server instances 102 | bash/mnbs-exec-post-startup.sh 103 | bash/umnbs-exec-post-startup.sh 104 | 105 | # PySpark scripts for Spark Machine Learning 106 | pyspark/batch_scoring.py 107 | pyspark/common_utils.py 108 | pyspark/hyperparameter_tuning.py 109 | pyspark/model_training.py 110 | pyspark/preprocessing.py 111 | 112 | # Cloud Functions source code 113 | cloud-functions/function-source.zip 114 | cloud-functions/main.py 115 | cloud-functions/requirements.txt 116 | ``` 117 | 118 | ### 3.4. The Notebook Bucket 119 | 120 | ``` 121 | # PySpark development notebooks 122 | pyspark/batch_scoring.ipynb 123 | pyspark/hyperparameter_tuning.ipynb 124 | pyspark/model_training.ipynb 125 | pyspark/preprocessing.ipynb 126 | 127 | # Vertex AI pipeline development notebook 128 | vai-pipelines/customer_churn_training_pipeline.ipynb 129 | ``` 130 | 131 | ### 3.5. The Pipeline Bucket 132 | 133 | The customized (for your environment) JSON for scheduling a Vertex AI pipeline. 134 | ``` 135 | templates/customer_churn_vai_pipeline_template.json 136 | ``` 137 | 138 | ### 3.6. The Functions Bucket 139 | 140 | Cloud Functions source code 141 | ``` 142 | function-source.zip 143 | ``` 144 | 145 | ### 3.7. The rest of the buckets 146 | These are empty; they are used for persisting logs and/or MLOps artifacts. 147 | 148 | 149 | 150 | ## 4. BigQuery 151 | 152 | ![BQ](../06-images/module-1-bq-01.png) 153 |

154 | 155 | ![BQ](../06-images/module-1-bq-02.png) 156 |

157 | 158 | ## 5. Persistent Spark History Server 159 | 160 | ![PHS](../06-images/module-1-phs-01.png) 161 |

162 | 163 | ![PHS](../06-images/module-1-phs-02.png) 164 |

165 | 166 | ![PHS](../06-images/module-1-phs-03.png) 167 |

168 | 169 | ![PHS](../06-images/module-1-phs-04.png) 170 |

171 | 172 | ![PHS](../06-images/module-1-phs-05.png) 173 |

174 | 175 | ## 6a. Vertex AI Workbench - Managed Notebook Server 176 | 177 | 178 | 179 | 180 | ![UMNBS](../06-images/module-1-vai-wb-01.png) 181 |

182 | 183 | 184 | ## 6b. Vertex AI Workbench - Managed Notebook Server - Jupyter Notebook 185 | 186 | **Be sure to select the right region in the dropdown.** 187 | 188 | ![UMNBS](../06-images/module-1-vai-wb-mnb-01.png) 189 |

190 | 191 | ![UMNBS](../06-images/module-1-vai-wb-mnbs-02.png) 192 |

193 | 194 | ## 7a. Vertex AI Workbench - User Managed Notebook Server 195 | 196 | 197 | ![UMNBS](../06-images/module-1-vai-wb-umnb-01.png) 198 |

199 | 200 | ## 7b. Vertex AI Workbench - User Managed Notebook Server - Jupyter Notebooks 201 | 202 | 203 | ![UMNBS](../06-images/module-1-vai-wb-umnb-02.png) 204 |

205 | 206 | ![UMNBS](../06-images/module-1-vai-wb-umnb-03.png) 207 |

208 | 209 | ## 8a. Google Container Registry 210 | 211 | ![GCR](../06-images/module-1-gcr-01.png) 212 |

213 | 214 | 215 | ## 8b. Google Container Registry - Container Image 216 | 217 | ![GCR](../06-images/module-1-gcr-02.png) 218 |

219 | 220 | ![GCR](../06-images/module-1-gcr-03.png) 221 |

222 | 223 | ## 9a. Cloud Composer 224 | 225 | ![CC2](../06-images/module-1-composer-01.png) 226 |

227 | 228 | ![CC2](../06-images/module-1-composer-02.png) 229 |

230 | 231 | ## 9b. Cloud Composer - Airflow variables 232 | 233 | ![CC2](../06-images/module-1-composer-03.png) 234 |

235 | 236 | ![CC2](../06-images/module-1-composer-04.png) 237 |

238 | 239 | ## 9c. Cloud Composer - Airflow DAG 240 | 241 | ![CC2](../06-images/module-1-composer-07.png) 242 |

243 | 244 | ![CC2](../06-images/module-1-composer-08.png) 245 |

246 | 247 | ## 9d. Cloud Composer - Airflow GUI 248 | 249 | ![CC2](../06-images/module-1-composer-05.png) 250 |

251 | 252 | ![CC2](../06-images/module-1-composer-06.png) 253 |

254 | 255 | ## 10. Google Cloud Function 256 | 257 | ![GCF](../06-images/module-1-cloud-function-01.png) 258 |

259 | 260 | ![GCF](../06-images/module-1-cloud-function-02.png) 261 |

262 | 263 | ![GCF](../06-images/module-1-cloud-function-03.png) 264 |

265 | 266 | ![GCF](../06-images/module-1-cloud-function-04.png) 267 |

268 | 269 | ![GCF](../06-images/module-1-cloud-function-05.png) 270 |

271 | 272 | ![GCF](../06-images/module-1-cloud-function-06.png) 273 |

274 | 275 | ## 11. Cloud Scheduler 276 | 277 | ![CS](../06-images/module-1-cloud-scheduler-01.png) 278 |

279 | 280 | ![CS](../06-images/module-1-cloud-scheduler-02.png) 281 |

282 | 283 | ![CS](../06-images/module-1-cloud-scheduler-03.png) 284 |

285 | 286 | ![CS](../06-images/module-1-cloud-scheduler-04.png) 287 |

288 | 289 | ![CS](../06-images/module-1-cloud-scheduler-05.png) 290 |

291 | 292 | ## 12. Customized Vertex AI pipeline JSON in GCS 293 | 294 | ![VAI](../06-images/module-1-pipeline-json-01.png) 295 |

296 | 297 | ![VAI](../06-images/module-1-pipeline-json-02.png) 298 |

299 | 300 | ![VAI](../06-images/module-1-pipeline-json-03.png) 301 |

302 | 303 | ![VAI](../06-images/module-1-pipeline-json-04.png) 304 |

305 | 306 |
307 | 308 | This is a summary of services that get provisioned for the lab. [Return to Module 1](../05-lab-guide/Module-01-Environment-Provisioning.md) 309 | 310 |
311 | 312 | -------------------------------------------------------------------------------- /06-images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/.DS_Store -------------------------------------------------------------------------------- /06-images/landing-page-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/landing-page-01.png -------------------------------------------------------------------------------- /06-images/landing-page-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/landing-page-02.png -------------------------------------------------------------------------------- /06-images/landing-page-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/landing-page-03.png -------------------------------------------------------------------------------- /06-images/landing-page-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/landing-page-04.png -------------------------------------------------------------------------------- /06-images/module-1-bq-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-bq-01.png -------------------------------------------------------------------------------- /06-images/module-1-bq-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-bq-02.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-function-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-function-01.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-function-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-function-02.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-function-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-function-03.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-function-04.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-function-04.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-function-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-function-05.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-function-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-function-06.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-scheduler-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-scheduler-01.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-scheduler-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-scheduler-02.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-scheduler-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-scheduler-03.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-scheduler-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-scheduler-04.png -------------------------------------------------------------------------------- /06-images/module-1-cloud-scheduler-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-cloud-scheduler-05.png -------------------------------------------------------------------------------- /06-images/module-1-composer-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-composer-01.png -------------------------------------------------------------------------------- /06-images/module-1-composer-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-composer-02.png -------------------------------------------------------------------------------- /06-images/module-1-composer-03.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-composer-03.png -------------------------------------------------------------------------------- /06-images/module-1-composer-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-composer-04.png -------------------------------------------------------------------------------- /06-images/module-1-composer-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-composer-05.png -------------------------------------------------------------------------------- /06-images/module-1-composer-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-composer-06.png -------------------------------------------------------------------------------- /06-images/module-1-composer-07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-composer-07.png -------------------------------------------------------------------------------- /06-images/module-1-composer-08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-composer-08.png -------------------------------------------------------------------------------- /06-images/module-1-gcr-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-gcr-01.png -------------------------------------------------------------------------------- /06-images/module-1-gcr-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-gcr-02.png -------------------------------------------------------------------------------- /06-images/module-1-gcr-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-gcr-03.png -------------------------------------------------------------------------------- /06-images/module-1-iam-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-iam-01.png -------------------------------------------------------------------------------- /06-images/module-1-iam-02.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-iam-02.png -------------------------------------------------------------------------------- /06-images/module-1-iam-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-iam-03.png -------------------------------------------------------------------------------- /06-images/module-1-iam-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-iam-04.png -------------------------------------------------------------------------------- /06-images/module-1-networking-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-networking-01.png -------------------------------------------------------------------------------- /06-images/module-1-networking-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-networking-02.png -------------------------------------------------------------------------------- /06-images/module-1-networking-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-networking-03.png -------------------------------------------------------------------------------- /06-images/module-1-networking-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-networking-04.png -------------------------------------------------------------------------------- /06-images/module-1-networking-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-networking-05.png -------------------------------------------------------------------------------- /06-images/module-1-networking-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-networking-06.png -------------------------------------------------------------------------------- /06-images/module-1-networking-07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-networking-07.png -------------------------------------------------------------------------------- /06-images/module-1-networking-08.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-networking-08.png -------------------------------------------------------------------------------- /06-images/module-1-phs-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-phs-01.png -------------------------------------------------------------------------------- /06-images/module-1-phs-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-phs-02.png -------------------------------------------------------------------------------- /06-images/module-1-phs-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-phs-03.png -------------------------------------------------------------------------------- /06-images/module-1-phs-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-phs-04.png -------------------------------------------------------------------------------- /06-images/module-1-phs-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-phs-05.png -------------------------------------------------------------------------------- /06-images/module-1-pictorial-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-pictorial-01.png -------------------------------------------------------------------------------- /06-images/module-1-pictorial-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-pictorial-02.png -------------------------------------------------------------------------------- /06-images/module-1-pictorial-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-pictorial-03.png -------------------------------------------------------------------------------- /06-images/module-1-pipeline-json-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-pipeline-json-01.png -------------------------------------------------------------------------------- /06-images/module-1-pipeline-json-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-pipeline-json-02.png 
-------------------------------------------------------------------------------- /06-images/module-1-pipeline-json-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-pipeline-json-03.png -------------------------------------------------------------------------------- /06-images/module-1-pipeline-json-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-pipeline-json-04.png -------------------------------------------------------------------------------- /06-images/module-1-storage-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-storage-01.png -------------------------------------------------------------------------------- /06-images/module-1-storage-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-storage-02.png -------------------------------------------------------------------------------- /06-images/module-1-vai-wb-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-vai-wb-01.png -------------------------------------------------------------------------------- /06-images/module-1-vai-wb-mnb-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-vai-wb-mnb-01.png -------------------------------------------------------------------------------- /06-images/module-1-vai-wb-mnbs-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-vai-wb-mnbs-02.png -------------------------------------------------------------------------------- /06-images/module-1-vai-wb-umnb-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-vai-wb-umnb-01.png -------------------------------------------------------------------------------- /06-images/module-1-vai-wb-umnb-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-vai-wb-umnb-02.png -------------------------------------------------------------------------------- /06-images/module-1-vai-wb-umnb-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/module-1-vai-wb-umnb-03.png -------------------------------------------------------------------------------- 
/06-images/*.png: --------------------------------------------------------------------------------
(Binary screenshot assets, module-2-01.png through module-8-19.png, as listed in the repository tree. Each file resolves to the raw GitHub URL pattern https://raw.githubusercontent.com/anagha-google/s8s-spark-mlops-lab/13439bc696388328841d78bedc4da41fe860a05c/06-images/<filename>.png) --------------------------------------------------------------------------------
/README.md: --------------------------------------------------------------------------------
# Practical Machine Learning at scale with Serverless Spark on GCP and Vertex AI

## 1. About

This repo is a hands-on lab for [Spark MLlib](https://spark.apache.org/docs/latest/ml-guide.html)-based scalable machine learning on Google Cloud, powered by Dataproc Serverless Spark, and showcases integration with the Vertex AI ML platform. The focus is on demystifying the products and their integration (not on building a perfect model), and the lab features a minimum viable, end-to-end machine learning use case.
## 2. Format & Duration

The lab is fully scripted (no research needed), with fully automated environment setup, plus data, code, commands, notebooks, orchestration, and configuration. Clone the repo and follow the step-by-step instructions for an end-to-end MLOps experience.

Expect to spend about 8 hours to fully understand and execute the lab if you are new to GCP and the services, and at least 6 hours otherwise.
## 3. Level

L300 - framework (Spark), services/products, integration
## 4. Audience

The intended audience is anyone with access to Google Cloud and an interest in the use case, products, and features showcased.
## 5. Prerequisites

Knowledge of Apache Spark, machine learning, and GCP products is beneficial but not strictly required, given the fully scripted format of the lab. Access to Google Cloud is a must unless you only want to read the content.
## 6. Goal

Simplify your learning and adoption journey of our product stack for scalable data science with:

1. Just enough product knowledge of Dataproc Serverless Spark and Vertex AI integration for machine learning at scale on Google Cloud
2. Quick-start code for ML at scale with Spark that can be repurposed for your data and ML experiments
3. Terraform for provisioning a variety of Google Cloud data services in the Spark ML context, which can be repurposed for your use case
## 7. Use case covered

Telco customer churn prediction with a [Kaggle dataset](https://www.kaggle.com/datasets/blastchar/telco-customer-churn) and a [Spark MLlib Random Forest classifier](https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier).
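To make the use case concrete, here is a minimal sketch of the kind of Spark MLlib training code the lab's notebooks and scripts build up to. This is illustrative only: the column names, GCS path, and hyperparameters below are assumptions, not the lab's actual identifiers (the real scripts live under `02-scripts/pyspark/`).

```python
# Minimal sketch (illustrative only): training a Spark MLlib Random Forest
# classifier for churn prediction. Columns and the GCS path are assumptions.
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName("churn-training-sketch").getOrCreate()

# Hypothetical input: a preprocessed churn dataset with numeric features
df = spark.read.parquet("gs://your-bucket/churn/preprocessed/")  # assumed path

assembler = VectorAssembler(
    inputCols=["tenure", "monthly_charges", "total_charges"],  # assumed columns
    outputCol="features",
)
rf = RandomForestClassifier(labelCol="churn", featuresCol="features", numTrees=50)

train, test = df.randomSplit([0.8, 0.2], seed=42)
model = Pipeline(stages=[assembler, rf]).fit(train)

# Evaluate with area under ROC on the held-out split
auc = BinaryClassificationEvaluator(labelCol="churn").evaluate(model.transform(test))
print(f"Test AUC-ROC: {auc:.3f}")
```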
## 8. Solution Architecture

### 8.1. Experimenting with Spark model training, tuning and batch scoring

![README](06-images/landing-page-02.png)

About Dataproc Serverless Spark Interactive: fully managed, autoscalable, secure Spark infrastructure as a service, for use with Jupyter notebooks on Vertex AI Workbench managed notebooks. Use it as an interactive Spark IDE to accelerate development and speed to production.

### 8.2. Operationalizing Spark Model Training

![README](06-images/landing-page-03.png)

About Dataproc Serverless Spark Batches: fully managed, autoscalable, secure Spark jobs as a service that eliminate administration overhead and resource contention, simplify development, and accelerate speed to production. Learn more about the service [here](https://cloud.google.com/dataproc-serverless/docs).
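As a quick illustration of "Spark jobs as a service", here is a hedged sketch of submitting a PySpark script as a Serverless Spark batch with the `google-cloud-dataproc` Python client (the repo also ships a Python SDK notebook sample). The project, region, and file URIs below are placeholders, not the lab's values:

```python
# Sketch: submit a PySpark script as a Dataproc Serverless batch via the
# Python client library. Project/region/URIs are placeholder assumptions.
from google.cloud import dataproc_v1

project_id, region = "your-project-id", "us-central1"  # assumed values

client = dataproc_v1.BatchControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

batch = dataproc_v1.Batch(
    pyspark_batch=dataproc_v1.PySparkBatch(
        main_python_file_uri="gs://your-bucket/scripts/model_training.py",  # assumed
    )
)

operation = client.create_batch(
    parent=f"projects/{project_id}/locations/{region}",
    batch=batch,
    batch_id="churn-model-training-sketch",
)
print(operation.result().state)  # blocks until the batch completes
```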
- Find templates that accelerate speed to production [here](https://github.com/GoogleCloudPlatform/dataproc-templates)
- Want Google Cloud to train you on Serverless Spark for free? Reach out to us [here](https://forms.gle/8ekUAFYd5xXvi2Hy9)
- Try out our other Serverless Spark-centric hands-on labs [here](https://github.com/GoogleCloudPlatform/serverless-spark-workshop)

### 8.3. Operationalizing Spark Batch Scoring

There are multiple options.

#### 8.3.1. Directly from within Spark

![README](06-images/landing-page-04.png)
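In this option, a scheduled Spark job (in the lab, a Cloud Composer-orchestrated Serverless Spark batch) loads the persisted MLlib model, scores new data, and writes the predictions out. A minimal sketch follows; the paths, column names, and BigQuery table are illustrative assumptions:

```python
# Sketch of batch scoring directly inside Spark (paths/tables are assumptions):
# load the persisted pipeline model, score new data, persist the predictions.
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

spark = SparkSession.builder.appName("churn-batch-scoring-sketch").getOrCreate()

model = PipelineModel.load("gs://your-bucket/churn/model/")           # assumed path
to_score = spark.read.parquet("gs://your-bucket/churn/score-input/")  # assumed path

predictions = model.transform(to_score).select("customer_id", "prediction")  # assumed columns

# Write scores to BigQuery via the Spark BigQuery connector (assumed dataset/table)
(predictions.write.format("bigquery")
    .option("table", "your_dataset.churn_predictions")
    .option("temporaryGcsBucket", "your-temp-bucket")
    .mode("overwrite")
    .save())
```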

#### 8.3.2. Through Vertex AI serving

Vertex AI supports operationalizing batch serving of Spark ML models in conjunction with MLeap.
ARCHITECTURE DIAGRAM TO BE ADDED
CODE MODULE - Work in progress
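Until that module lands, here is a heavily hedged sketch of the general idea: a fitted Spark `PipelineModel` can be serialized to an MLeap bundle (as the lab's hyperparameter tuning assets already do, per the release history) so it can be served outside a Spark runtime. The `model` and `train_df` names and the bundle path are assumptions:

```python
# Sketch (assumes the `mleap` PyPI package, a fitted PipelineModel `model`,
# and a sample DataFrame `train_df`): serialize the model to an MLeap bundle.
import mleap.pyspark  # noqa: F401  (registers serializeToBundle on Spark models)
from mleap.pyspark.spark_support import SimpleSparkSerializer  # noqa: F401

model.serializeToBundle(
    "jar:file:/tmp/churn_model.zip",  # local bundle path (placeholder)
    model.transform(train_df),        # a sample of transformed data
)
```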
## 9. Flow of the lab

![README](06-images/landing-page-01.png)

For your convenience, all the code is pre-authored, so you can focus on understanding product features and integration.
## 10. The lab modules

Complete the lab modules sequentially. For a better lab experience, read *all* the modules first and then start working through them.

| # | Module | Duration |
| -- | :--- | :--- |
| 01 | [Terraform for environment provisioning](05-lab-guide/Module-01-Environment-Provisioning.md) | 1 hour |
| 02 | [Tutorial on Dataproc Serverless Spark Interactive Sessions for authoring Spark code](05-lab-guide/Module-02-Spark-IDE-on-GCP.md) | 15 minutes |
| 03 | [Author PySpark ML experiments with Serverless Spark Interactive notebooks](05-lab-guide/Module-03-Author-ML-Experiments-With-Spark-Notebooks.md) | 1 hour |
| 04 | [Author PySpark ML scripts in preparation for authoring a model training pipeline](05-lab-guide/Module-04-Author-ML-PySpark-Scripts.md) | 1 hour |
| 05 | [Author a Vertex AI model training pipeline](05-lab-guide/Module-05-Author-Vertex-AI-Pipeline.md) | 1 hour |
| 06 | [Author a Cloud Function that calls your Vertex AI model training pipeline](05-lab-guide/Module-06-Author-CloudFunction-For-Vertex-AI-Pipeline.md) | 15 minutes |
| 07 | [Create a Cloud Scheduler job that invokes the Cloud Function you created](05-lab-guide/Module-07-Schedule-VertexAI-Pipeline.md) | 15 minutes |
| 08 | [Author a Cloud Composer Airflow DAG for batch scoring and schedule it](05-lab-guide/Module-08-Orchestrate-Batch-Scoring.md) | 15 minutes |

The lab includes custom [container image creation](05-lab-guide/Module-04-Author-ML-PySpark-Scripts.md#11-creating-a-custom-container-image) and usage.

## 11. Don't forget to

Shut down/delete resources when done to avoid unnecessary billing.
## 12. Credits

| # | Google Cloud Collaborators | Contribution |
| -- | :--- | :--- |
| 1. | Anagha Khanolkar | Creator |
| 2. | Dr. Thomas Abraham<br>Brian Kang | ML consultation, testing, best practices and feedback |
| 3. | Rob Vogelbacher<br>Proshanta Saha | ML consultation |
| 4. | Ivan Nardini<br>Win Woo | ML consultation, inspiration through [samples](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_dataproc_tabular.ipynb) and [blogs](https://medium.com/google-cloud/sparkling-vertex-ai-pipeline-cfe6e19334f7) |

The source code was evolved by the creator from a base developed by a partner for Google Cloud.
121 | 122 | ## 13. Contributions welcome 123 | Community contribution to improve the lab is very much appreciated.
124 | 125 |
## 14. Getting help

If you have any questions or find any problems with this repository, please report them through GitHub issues.
## 15. Release History

| Date | Details |
| -- | :--- |
| 20220930 | Added serializing the model to an MLeap bundle<br>Affects:<br>1. Terraform main.tf<br>2. Hyperparameter tuning notebook<br>3. Hyperparameter tuning PySpark script<br>4. VAI pipeline notebook<br>5. VAI JSON template |
| 20221202 | [Added a Python SDK (notebook) sample for preprocessing](03-notebooks/pyspark/Dataproc-Spark-Servereless-Batch-PythonSDK-Sample.ipynb) |

--------------------------------------------------------------------------------