├── .gitignore
├── .project-metadata.yaml
├── 0_bootstrap.py
├── 1_data_ingest.py
├── 2_data_exploration.ipynb
├── 3_model_building.ipynb
├── 4_train_models.py
├── 5_model_serve_explainer.py
├── 6_application.py
├── 7a_ml_ops_simulation.py
├── 7b_ml_ops_visual.py
├── 8_check_model.py
├── 9_build_project.py
├── README.md
├── cdsw-build.sh
├── churnexplainer.py
├── flask
├── ajax-loader.gif
├── churn_vis.css
├── churn_vis.js
├── env_vars.png
├── single_view.html
└── table_view.html
├── images
├── data.png
├── disable_auth.png
├── model_accuracy.png
├── model_id.png
├── single_view_1.png
├── single_view_2.png
└── table_view.png
├── lineage.yml
├── models
└── telco_linear
│ └── telco_linear.pkl
├── raw
├── WA_Fn-UseC_-Telco-Customer-Churn-.csv
└── telco-data
│ ├── _SUCCESS
│ └── part-00000-bfdb203d-eea4-4b80-bda3-d369976e785a-c000.csv
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | R
2 | node_modules
3 | *.pyc
4 | __pycache__
5 | .*
6 | !.gitignore
--------------------------------------------------------------------------------
/.project-metadata.yaml:
--------------------------------------------------------------------------------
1 | name: ML Churn Demo
2 | description: Prototype to demonstrate building a churn model on CML
3 | author: Cloudera Engineer
4 | specification_version: 1.0
5 | prototype_version: 1.0
6 | date: "2020-09-29"
7 | api_version: 1
8 |
9 | environment_variables:
10 | AWS_ACCESS_KEY:
11 | default: "AWS ACCESS KEY"
12 | description: "Access Key ID for accessing S3 bucket"
13 | prompt_user: true
14 | AWS_SECRET_KEY:
15 | default: "AWS SECRET KEY"
16 | description: "Secret Access Key for accessing S3 bucket"
17 | prompt_user: true
18 | HADOOP_DATA_SOURCE:
19 | default: "S3 URL FOR DATASET"
20 | description: "S3 URL to large data set"
21 | prompt_user: true
22 | MODEL_REPLICAS:
23 | default: "3"
24 | description: "Number of model replicas, 3 is standard for redundancy"
25 | prompt_user: false
26 | TASK_TYPE:
27 | default: NOT_OVERRIDEN
28 | prompt_user: false
29 |
30 | tasks:
31 | - type: create_job
32 | name: Install dependencies
33 | entity_label: install_deps
34 | script: 0_bootstrap.py
35 | arguments: None
36 | short_summary: Job to install dependencies and download training data.
37 | environment:
38 | TASK_TYPE: CREATE/RUN_JOB
39 | kernel: python3
40 |
41 | - type: run_job
42 | entity_label: install_deps
43 | short_summary: Running install dependencies training job.
44 | long_summary: >-
45 | Running the job to install dependencies.
46 |
47 | - type: create_job
48 | name: Train Churn Model
49 | entity_label: train_model
50 | script: 4_train_models.py
51 | arguments: None
52 | short_summary: Job to train models.
53 | environment:
54 | TASK_TYPE: CREATE/RUN_JOB
55 | kernel: python3
56 |
57 | - type: run_job
58 | entity_label: train_model
59 | short_summary: Run model training job.
60 | long_summary: >-
61 | Running the job to train models.
62 |
63 | - type: create_model
64 | name: Create Churn Model API Endpoint
65 | entity_label: telco_churn_model
66 | description: This model api endpoint predicts churn
67 | short_summary: Create the churn model prediction api endpoint
68 | access_key_environment_variable: SHTM_ACCESS_KEY
69 | # default_resources:
70 | # cpu: 1
71 | # memory: 2
72 | default_replication_policy:
73 | type: fixed
74 | num_replicas: 1
75 |
76 | # auth_enabled: false
77 | - type: build_model
78 | name: Build Telco Churn Model Endpoint
79 | entity_label: telco_churn_model
80 | comment: Build churn model
81 | examples:
82 | - request: '{"StreamingTV":"No","MonthlyCharges":70.35,"PhoneService":"No","PaperlessBilling":"No","Partner":"No","OnlineBackup":"No","gender":"Female","Contract":"Month-to-month","TotalCharges":1397.475,"StreamingMovies":"No","DeviceProtection":"No","PaymentMethod":"Bank transfer (automatic)","tenure":29,"Dependents":"No","OnlineSecurity":"No","MultipleLines":"No","InternetService":"DSL","SeniorCitizen":"No","TechSupport":"No"}'
83 | response: ""
84 | target_file_path: 5_model_serve_explainer.py
85 | target_function_name: explain
86 | kernel: python3
87 | environment_variables:
88 | TASK_TYPE: CREATE/BUILD/DEPLOY_MODEL
89 |
90 | - type: deploy_model
91 | name: telco_churn_model
92 | entity_label: telco_churn_model
93 | cpu: 1
94 | gpu: 0
95 | environment_variables:
96 | TASK_TYPE: CREATE/BUILD/DEPLOY_MODEL
97 |
98 | - type: start_application
99 | name: Application to serve Churn front app UI
100 | subdomain: churn
101 | script: 6_application.py
102 | environment_variables:
103 | TASK_TYPE: START_APPLICATION
104 | kernel: python3
105 |
--------------------------------------------------------------------------------
/0_bootstrap.py:
--------------------------------------------------------------------------------
1 | # # Part 0: Bootstrap File
2 | # You need to run this at the start of the project. It will install the requirements, create the
3 | # STORAGE environment variable and copy the data from
4 | # raw/WA_Fn-UseC_-Telco-Customer-Churn-.csv into /datalake/data/churn of the STORAGE
5 | # location.
6 |
7 | # The STORAGE environment variable is the Cloud Storage location used by the DataLake
8 | # to store hive data. On AWS it will be s3a://[something], on Azure it will be
9 | # abfs://[something] and on a CDSW cluster, it will be hdfs://[something]
10 |
11 | # Install the requirements
12 | !pip3 install -r requirements.txt --progress-bar off
13 |
14 | # Create the directories and upload data
15 |
16 | from cmlbootstrap import CMLBootstrap
17 | from IPython.display import Javascript, HTML
18 | import os
19 | import time
20 | import json
21 | import requests
22 | import xml.etree.ElementTree as ET
23 | import datetime
24 |
25 | try:
26 | os.environ["SPARK_HOME"]
27 | print("Spark is enabled")
28 | except:
29 | print('Spark is not enabled, please enable spark before running this script')
30 | raise KeyError('Spark is not enabled, please enable spark before running this script')
31 |
32 | run_time_suffix = datetime.datetime.now()
33 | run_time_suffix = run_time_suffix.strftime("%d%m%Y%H%M%S")
34 |
35 | # Instantiate API Wrapper
36 | cml = CMLBootstrap()
37 |
38 | # Set the STORAGE environment variable
39 | try :
40 | storage=os.environ["STORAGE"]
41 | except:
42 | storage = cml.get_cloud_storage()
43 | storage_environment_params = {"STORAGE":storage}
44 | storage_environment = cml.create_environment_variable(storage_environment_params)
45 | os.environ["STORAGE"] = storage
46 |
47 | # Upload the data to the cloud storage
48 | !hadoop fs -mkdir -p $STORAGE/datalake
49 | !hadoop fs -mkdir -p $STORAGE/datalake/data
50 | !hadoop fs -mkdir -p $STORAGE/datalake/data/churn
51 | !hadoop fs -copyFromLocal /home/cdsw/raw/WA_Fn-UseC_-Telco-Customer-Churn-.csv $STORAGE/datalake/data/churn/WA_Fn-UseC_-Telco-Customer-Churn-.csv
52 |
53 |
--------------------------------------------------------------------------------
/1_data_ingest.py:
--------------------------------------------------------------------------------
1 | # Part 1: Data Ingest
2 | # A data scientist should never be blocked in getting data into their environment,
3 | # so CML is able to ingest data from many sources.
4 | # Whether you have data in .csv files, modern formats like parquet or feather,
5 | # in cloud storage or a SQL database, CML will let you work with it in a data
6 | # scientist-friendly environment.
7 |
8 | # Access local data on your computer
9 | #
10 | # Accessing data stored on your computer is a matter of [uploading a file to the CML filesystem and
11 | # referencing from there](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-accessing-local-data-from-your-computer.html).
12 | #
13 | # > Go to the project's **Overview** page. Under the **Files** section, click **Upload**, select the relevant data files to be uploaded and a destination folder.
14 | #
15 | # If, for example, you upload a file called, `mydata.csv` to a folder called `data`, the
16 | # following example code would work.
17 |
18 | # ```
19 | # import pandas as pd
20 | #
21 | # df = pd.read_csv('data/mydata.csv')
22 | #
23 | # # Or:
24 | # df = pd.read_csv('/home/cdsw/data/mydata.csv')
25 | # ```
26 |
27 | # Access data in S3
28 | #
29 | # Accessing [data in Amazon S3](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-accessing-data-in-amazon-s3-buckets.html)
30 | # follows a familiar procedure of fetching and storing in the CML filesystem.
31 | # > Add your Amazon Web Services access keys to your project's
32 | # > [environment variables](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-environment-variables.html)
33 | # > as `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
34 | #
35 | # To get the access keys that are used for your CDP DataLake, you can follow
36 | # [this Cloudera Community Tutorial](https://community.cloudera.com/t5/Community-Articles/How-to-get-AWS-access-keys-via-IDBroker-in-CDP/ta-p/295485)
37 |
38 | #
39 | # The following sample code would fetch a file called `myfile.csv` from the S3 bucket, `data_bucket`, and store it in the CML home folder.
40 | # ```
41 | # # Create the Boto S3 connection object.
42 | # from boto.s3.connection import S3Connection
43 | # aws_connection = S3Connection()
44 | #
45 | # # Download the dataset to file 'myfile.csv'.
46 | # bucket = aws_connection.get_bucket('data_bucket')
47 | # key = bucket.get_key('myfile.csv')
48 | # key.get_contents_to_filename('/home/cdsw/myfile.csv')
49 | # ```
50 |
51 |
52 | # Access data from Cloud Storage or the Hive metastore
53 | #
54 | # Accessing data from [the Hive metastore](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-accessing-data-from-apache-hive.html)
55 | # that comes with CML only takes a few more steps.
56 | # But first we need to fetch the data from Cloud Storage and save it as a Hive table.
57 | #
58 | # > Specify `STORAGE` as an
59 | # > [environment variable](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-environment-variables.html)
60 | # > in your project settings containing the Cloud Storage location used by the DataLake to store
61 | # > Hive data. On AWS it will be `s3a://[something]`, on Azure it will be `abfs://[something]` and on an
62 | # > on-prem CDSW cluster, it will be `hdfs://[something]`
63 | #
64 | # This was done for you when you ran `0_bootstrap.py`, so the following code is set up to run as is.
65 | # It begins with imports and creating a `SparkSession`.
66 |
67 | import os
68 | import sys
69 | from pyspark.sql import SparkSession
70 | from pyspark.sql.types import *
71 |
72 |
73 |
74 | spark = SparkSession\
75 | .builder\
76 | .appName("PythonSQL")\
77 | .master("local[*]")\
78 | .getOrCreate()
79 |
80 | # **Note:**
81 | # Our file isn't big, so running it in Spark local mode is fine but you can add the following config
82 | # if you want to run Spark on the kubernetes cluster
83 | #
84 | # > .config("spark.yarn.access.hadoopFileSystems",os.environ['STORAGE'])\
85 | #
86 | # and remove `.master("local[*]")\`
87 | #
88 |
89 | # Since we know the data already, we can add the schema upfront. This is good practice as Spark will
90 | # read *all* the data if you try to infer the schema.
91 |
92 | schema = StructType(
93 | [
94 | StructField("customerID", StringType(), True),
95 | StructField("gender", StringType(), True),
96 | StructField("SeniorCitizen", StringType(), True),
97 | StructField("Partner", StringType(), True),
98 | StructField("Dependents", StringType(), True),
99 | StructField("tenure", DoubleType(), True),
100 | StructField("PhoneService", StringType(), True),
101 | StructField("MultipleLines", StringType(), True),
102 | StructField("InternetService", StringType(), True),
103 | StructField("OnlineSecurity", StringType(), True),
104 | StructField("OnlineBackup", StringType(), True),
105 | StructField("DeviceProtection", StringType(), True),
106 | StructField("TechSupport", StringType(), True),
107 | StructField("StreamingTV", StringType(), True),
108 | StructField("StreamingMovies", StringType(), True),
109 | StructField("Contract", StringType(), True),
110 | StructField("PaperlessBilling", StringType(), True),
111 | StructField("PaymentMethod", StringType(), True),
112 | StructField("MonthlyCharges", DoubleType(), True),
113 | StructField("TotalCharges", DoubleType(), True),
114 | StructField("Churn", StringType(), True)
115 | ]
116 | )
117 |
118 | # Now we can read in the data from Cloud Storage into Spark...
119 |
120 | storage = os.environ['STORAGE']
121 |
122 | telco_data = spark.read.csv(
123 | "{}/datalake/data/churn/WA_Fn-UseC_-Telco-Customer-Churn-.csv".format(
124 | storage),
125 | header=True,
126 | schema=schema,
127 | sep=',',
128 | nullValue='NA'
129 | )
130 |
131 | # ...and inspect the data.
132 |
133 | telco_data.show()
134 |
135 | telco_data.printSchema()
136 |
137 | # Now we can store the Spark DataFrame as a file in the local CML file system
138 | # *and* as a table in Hive used by the other parts of the project.
139 |
140 | telco_data.coalesce(1).write.csv(
141 | "file:/home/cdsw/raw/telco-data/",
142 | mode='overwrite',
143 | header=True
144 | )
145 |
146 | spark.sql("show databases").show()
147 |
148 | spark.sql("show tables in default").show()
149 |
150 | # Create the Hive table
151 | # This is here to create the table in Hive used by the other parts of the project, if it
152 | # does not already exist.
153 |
154 | if ('telco_churn' not in list(spark.sql("show tables in default").toPandas()['tableName'])):
155 | print("creating the telco_churn database")
156 | telco_data\
157 | .write.format("parquet")\
158 | .mode("overwrite")\
159 | .saveAsTable(
160 | 'default.telco_churn'
161 | )
162 |
163 | # Show the data in the hive table
164 | spark.sql("select * from default.telco_churn").show()
165 |
166 | # To get more detailed information about the hive table you can run this:
167 | spark.sql("describe formatted default.telco_churn").toPandas()
168 |
169 | # Other ways to access data
170 |
171 | # To access data from other locations, refer to the
172 | # [CML documentation](https://docs.cloudera.com/machine-learning/cloud/import-data/index.html).
173 |
174 | # Scheduled Jobs
175 | #
176 | # One of the features of CML is the ability to schedule code to run at regular intervals,
177 | # similar to cron jobs. This is useful for **data pipelines**, **ETL**, and **regular reporting**
178 | # among other use cases. If new data files are created regularly, e.g. hourly log files, you could
179 | # schedule a Job to run a data loading script with code like the above.
180 |
181 | # > Any script [can be scheduled as a Job](https://docs.cloudera.com/machine-learning/cloud/jobs-pipelines/topics/ml-creating-a-job.html).
182 | # > You can create a Job with specified command line arguments or environment variables.
183 | # > Jobs can be triggered by the completion of other jobs, forming a
184 | # > [Pipeline](https://docs.cloudera.com/machine-learning/cloud/jobs-pipelines/topics/ml-creating-a-pipeline.html)
185 | # > You can configure the job to email individuals with an attachment, e.g. a csv report which your
186 | # > script saves at: `/home/cdsw/job1/output.csv`.
187 |
188 | # Try running this script `1_data_ingest.py` for use in such a Job.
189 |
190 |
--------------------------------------------------------------------------------
/3_model_building.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Part 3: Model Building"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook explores building the **customer churn model** and an **interpretability model** to explain each prediction.\n",
15 | "In addition to making a prediction of whether a customer will churn, we will also be able to answer the question, \"why are they expected to churn?\"\n",
16 | "\n",
17 | "The following work will look fairly standard to anyone having trained machine learning models using python Jupyter notebooks.\n",
18 | "The CML platform provides a **fully capable Jupyter notebook environment** that data scientists know and love.\n",
19 | "\n",
20 | "If you haven't yet, run through the initialization steps in the README file and Part 1. \n",
21 | "In Part 1, the data is imported into the `default.telco_churn` table in Hive. All data accesses fetch from Hive.\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Load data\n",
29 | "\n",
30 | "We again start by creating a `SparkSession` to fetch the data using Spark SQL, only this time we convert to a pandas `DataFrame` since we saw earlier that there are only 7k records in the dataset.\n"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 6,
36 | "metadata": {
37 | "scrolled": false
38 | },
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "root\n",
45 | " |-- customerID: string (nullable = true)\n",
46 | " |-- gender: string (nullable = true)\n",
47 | " |-- SeniorCitizen: string (nullable = true)\n",
48 | " |-- Partner: string (nullable = true)\n",
49 | " |-- Dependents: string (nullable = true)\n",
50 | " |-- tenure: double (nullable = true)\n",
51 | " |-- PhoneService: string (nullable = true)\n",
52 | " |-- MultipleLines: string (nullable = true)\n",
53 | " |-- InternetService: string (nullable = true)\n",
54 | " |-- OnlineSecurity: string (nullable = true)\n",
55 | " |-- OnlineBackup: string (nullable = true)\n",
56 | " |-- DeviceProtection: string (nullable = true)\n",
57 | " |-- TechSupport: string (nullable = true)\n",
58 | " |-- StreamingTV: string (nullable = true)\n",
59 | " |-- StreamingMovies: string (nullable = true)\n",
60 | " |-- Contract: string (nullable = true)\n",
61 | " |-- PaperlessBilling: string (nullable = true)\n",
62 | " |-- PaymentMethod: string (nullable = true)\n",
63 | " |-- MonthlyCharges: double (nullable = true)\n",
64 | " |-- TotalCharges: double (nullable = true)\n",
65 | " |-- Churn: string (nullable = true)\n",
66 | "\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "from pyspark.sql import SparkSession\n",
72 | "\n",
73 | "spark = SparkSession\\\n",
74 | " .builder\\\n",
75 | " .appName(\"PythonSQL\")\\\n",
76 | " .master(\"local[*]\")\\\n",
77 | " .getOrCreate()\n",
78 | "\n",
79 | "spark_df = spark.sql(\"SELECT * FROM default.telco_churn\")\n",
80 | "spark_df.printSchema()\n",
81 | "df = spark_df.toPandas()"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "**Note:** If you don't have the Hive table, you can read the csv from the CML filesystem using pandas directly:\n",
89 | "\n",
90 | "```python\n",
91 | "import pandas as pd\n",
92 | "import os\n",
93 | "\n",
94 | "data_dir = '/home/cdsw'\n",
95 | "df = pd.read_csv(os.path.join(data_dir, 'raw', 'WA_Fn-UseC_-Telco-Customer-Churn-.csv'))\n",
96 | "```\n"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "## Basic feature engineering\n",
104 | "\n",
105 | "\n",
106 | "Next we munge the data into appropriate types for later steps. \n",
107 | "In particular, we want to convert all the binary and string columns into pandas `Categorical` types.\n",
108 | "\n"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 7,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "import subprocess, glob, sys\n",
118 | "import dill # a better pickle\n",
119 | "import pandas as pd\n",
120 | "import numpy as np\n",
121 | "import matplotlib.pyplot as plt\n",
122 | "\n",
123 | "data_dir = '/home/cdsw'"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 8,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "idcol = 'customerID' # ID column\n",
133 | "labelcol = 'Churn' # label column\n",
134 | "cols = (('gender', True), # (feature column, Categorical?)\n",
135 | " ('SeniorCitizen', True),\n",
136 | " ('Partner', True),\n",
137 | " ('Dependents', True),\n",
138 | " ('tenure', False),\n",
139 | " ('PhoneService', True),\n",
140 | " ('MultipleLines', True),\n",
141 | " ('InternetService', True),\n",
142 | " ('OnlineSecurity', True),\n",
143 | " ('OnlineBackup', True),\n",
144 | " ('DeviceProtection', True),\n",
145 | " ('TechSupport', True),\n",
146 | " ('StreamingTV', True),\n",
147 | " ('StreamingMovies', True),\n",
148 | " ('Contract', True),\n",
149 | " ('PaperlessBilling', True),\n",
150 | " ('PaymentMethod', True),\n",
151 | " ('MonthlyCharges', False),\n",
152 | " ('TotalCharges', False))"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 9,
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "data": {
162 | "text/html": [
163 | "
\n",
164 | "\n",
177 | "
\n",
178 | " \n",
179 | "
\n",
180 | "
\n",
181 | "
gender
\n",
182 | "
SeniorCitizen
\n",
183 | "
Partner
\n",
184 | "
Dependents
\n",
185 | "
tenure
\n",
186 | "
PhoneService
\n",
187 | "
MultipleLines
\n",
188 | "
InternetService
\n",
189 | "
OnlineSecurity
\n",
190 | "
OnlineBackup
\n",
191 | "
DeviceProtection
\n",
192 | "
TechSupport
\n",
193 | "
StreamingTV
\n",
194 | "
StreamingMovies
\n",
195 | "
Contract
\n",
196 | "
PaperlessBilling
\n",
197 | "
PaymentMethod
\n",
198 | "
MonthlyCharges
\n",
199 | "
TotalCharges
\n",
200 | "
\n",
201 | "
\n",
202 | "
id
\n",
203 | "
\n",
204 | "
\n",
205 | "
\n",
206 | "
\n",
207 | "
\n",
208 | "
\n",
209 | "
\n",
210 | "
\n",
211 | "
\n",
212 | "
\n",
213 | "
\n",
214 | "
\n",
215 | "
\n",
216 | "
\n",
217 | "
\n",
218 | "
\n",
219 | "
\n",
220 | "
\n",
221 | "
\n",
222 | "
\n",
223 | " \n",
224 | " \n",
225 | "
\n",
226 | "
0
\n",
227 | "
Female
\n",
228 | "
0
\n",
229 | "
Yes
\n",
230 | "
No
\n",
231 | "
1.0
\n",
232 | "
No
\n",
233 | "
No phone service
\n",
234 | "
DSL
\n",
235 | "
No
\n",
236 | "
Yes
\n",
237 | "
No
\n",
238 | "
No
\n",
239 | "
No
\n",
240 | "
No
\n",
241 | "
Month-to-month
\n",
242 | "
Yes
\n",
243 | "
Electronic check
\n",
244 | "
29.85
\n",
245 | "
29.85
\n",
246 | "
\n",
247 | "
\n",
248 | "
1
\n",
249 | "
Male
\n",
250 | "
0
\n",
251 | "
No
\n",
252 | "
No
\n",
253 | "
34.0
\n",
254 | "
Yes
\n",
255 | "
No
\n",
256 | "
DSL
\n",
257 | "
Yes
\n",
258 | "
No
\n",
259 | "
Yes
\n",
260 | "
No
\n",
261 | "
No
\n",
262 | "
No
\n",
263 | "
One year
\n",
264 | "
No
\n",
265 | "
Mailed check
\n",
266 | "
56.95
\n",
267 | "
1889.50
\n",
268 | "
\n",
269 | "
\n",
270 | "
2
\n",
271 | "
Male
\n",
272 | "
0
\n",
273 | "
No
\n",
274 | "
No
\n",
275 | "
2.0
\n",
276 | "
Yes
\n",
277 | "
No
\n",
278 | "
DSL
\n",
279 | "
Yes
\n",
280 | "
Yes
\n",
281 | "
No
\n",
282 | "
No
\n",
283 | "
No
\n",
284 | "
No
\n",
285 | "
Month-to-month
\n",
286 | "
Yes
\n",
287 | "
Mailed check
\n",
288 | "
53.85
\n",
289 | "
108.15
\n",
290 | "
\n",
291 | "
\n",
292 | "
3
\n",
293 | "
Male
\n",
294 | "
0
\n",
295 | "
No
\n",
296 | "
No
\n",
297 | "
45.0
\n",
298 | "
No
\n",
299 | "
No phone service
\n",
300 | "
DSL
\n",
301 | "
Yes
\n",
302 | "
No
\n",
303 | "
Yes
\n",
304 | "
Yes
\n",
305 | "
No
\n",
306 | "
No
\n",
307 | "
One year
\n",
308 | "
No
\n",
309 | "
Bank transfer (automatic)
\n",
310 | "
42.30
\n",
311 | "
1840.75
\n",
312 | "
\n",
313 | "
\n",
314 | "
4
\n",
315 | "
Female
\n",
316 | "
0
\n",
317 | "
No
\n",
318 | "
No
\n",
319 | "
2.0
\n",
320 | "
Yes
\n",
321 | "
No
\n",
322 | "
Fiber optic
\n",
323 | "
No
\n",
324 | "
No
\n",
325 | "
No
\n",
326 | "
No
\n",
327 | "
No
\n",
328 | "
No
\n",
329 | "
Month-to-month
\n",
330 | "
Yes
\n",
331 | "
Electronic check
\n",
332 | "
70.70
\n",
333 | "
151.65
\n",
334 | "
\n",
335 | " \n",
336 | "
\n",
337 | "
"
338 | ],
339 | "text/plain": [
340 | " gender SeniorCitizen Partner Dependents tenure PhoneService \\\n",
341 | "id \n",
342 | "0 Female 0 Yes No 1.0 No \n",
343 | "1 Male 0 No No 34.0 Yes \n",
344 | "2 Male 0 No No 2.0 Yes \n",
345 | "3 Male 0 No No 45.0 No \n",
346 | "4 Female 0 No No 2.0 Yes \n",
347 | "\n",
348 | " MultipleLines InternetService OnlineSecurity OnlineBackup \\\n",
349 | "id \n",
350 | "0 No phone service DSL No Yes \n",
351 | "1 No DSL Yes No \n",
352 | "2 No DSL Yes Yes \n",
353 | "3 No phone service DSL Yes No \n",
354 | "4 No Fiber optic No No \n",
355 | "\n",
356 | " DeviceProtection TechSupport StreamingTV StreamingMovies Contract \\\n",
357 | "id \n",
358 | "0 No No No No Month-to-month \n",
359 | "1 Yes No No No One year \n",
360 | "2 No No No No Month-to-month \n",
361 | "3 Yes Yes No No One year \n",
362 | "4 No No No No Month-to-month \n",
363 | "\n",
364 | " PaperlessBilling PaymentMethod MonthlyCharges TotalCharges \n",
365 | "id \n",
366 | "0 Yes Electronic check 29.85 29.85 \n",
367 | "1 No Mailed check 56.95 1889.50 \n",
368 | "2 Yes Mailed check 53.85 108.15 \n",
369 | "3 No Bank transfer (automatic) 42.30 1840.75 \n",
370 | "4 Yes Electronic check 70.70 151.65 "
371 | ]
372 | },
373 | "execution_count": 9,
374 | "metadata": {},
375 | "output_type": "execute_result"
376 | }
377 | ],
378 | "source": [
379 | "df = df.replace(r'^\\s$', np.nan, regex=True).dropna().reset_index() # drop blank rows\n",
380 | "df.index.name = 'id' # name the index\n",
381 | "data, labels = df.drop(labelcol, axis=1), df[labelcol] # separate out the labels\n",
382 | "data = data[[c for c, _ in cols]] # only use the columns named in `cols`\n",
383 | "data = data.replace({'SeniorCitizen': {1: 'Yes', 0: 'No'}}) # Change 1/0 to Yes/No to match the other binary features\n",
384 | "\n",
385 | "# convert the categorical columns to pd.Categorical form\n",
386 | "for col, iscat in cols:\n",
387 | " if iscat:\n",
388 | " data[col] = pd.Categorical(data[col])\n",
389 | "labels = (labels == 'Yes') # convert labels from str to bool\n",
390 | "\n",
391 | "data.head()"
392 | ]
393 | },
394 | {
395 | "cell_type": "markdown",
396 | "metadata": {},
397 | "source": [
398 | "## Machine learning model\n",
399 | "\n",
400 | "This step follows a fairly standard ML workflow, which is to create a pipeline to:\n",
401 | "\n",
402 | "* Encode the categorical features as numeric\n",
403 | "* Normalize the numeric features\n",
404 | "* Train a classification model using these processed features\n",
405 | "\n",
406 | "We use *one-hot encoding*, *standardization*, and *logistic regression with cross-validation* for the three steps.\n",
407 | "Then we can evaluate the model's performance.\n",
408 | "\n",
409 | "Note: `CategoricalEncoder` and, later, `ExplainedModel` are helper classes pulled and edited from the original CFFL [interpretability report code](https://ff06-2020.fastforwardlabs.com/).\n",
410 | "You can inspect `churnexplainer.py` to see what they do under the hood.\n",
411 | "CML lets you continue to write modular code to keep things segregated and clean."
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 10,
417 | "metadata": {},
418 | "outputs": [],
419 | "source": [
420 | "from sklearn.model_selection import train_test_split\n",
421 | "from sklearn.metrics import classification_report\n",
422 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
423 | "from sklearn.pipeline import Pipeline\n",
424 | "from sklearn.linear_model import LogisticRegressionCV, LogisticRegression\n",
425 | "from sklearn.compose import ColumnTransformer\n",
426 | "\n",
427 | "from churnexplainer import CategoricalEncoder # convert Categorical columns into numeric"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 11,
433 | "metadata": {},
434 | "outputs": [],
435 | "source": [
436 | "ce = CategoricalEncoder()\n",
437 | "X = ce.fit_transform(data) # Categorical columns now have values 0 to num_categories-1\n",
438 | "y = labels.values\n",
439 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
440 | "\n",
441 | "cat_cols = list(ce.cat_columns_ix_.values()) # indices of the categorical columns (now numeric)\n",
442 | "ct = ColumnTransformer(\n",
443 | " [('ohe', OneHotEncoder(), cat_cols)],\n",
444 | " remainder='passthrough'\n",
445 | ")\n",
446 | "clf = LogisticRegressionCV(cv=5,solver='lbfgs', max_iter=100)"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 23,
452 | "metadata": {},
453 | "outputs": [
454 | {
455 | "name": "stderr",
456 | "output_type": "stream",
457 | "text": [
458 | "/home/cdsw/.local/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
459 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
460 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
461 | " warnings.warn(msg, FutureWarning)\n"
462 | ]
463 | },
464 | {
465 | "name": "stdout",
466 | "output_type": "stream",
467 | "text": [
468 | "train 0.8077360637087599\n",
469 | "test 0.7912400455062572\n",
470 | " precision recall f1-score support\n",
471 | "\n",
472 | " False 0.84 0.89 0.86 1300\n",
473 | " True 0.62 0.52 0.56 458\n",
474 | "\n",
475 | " accuracy 0.79 1758\n",
476 | " macro avg 0.73 0.70 0.71 1758\n",
477 | "weighted avg 0.78 0.79 0.79 1758\n",
478 | "\n"
479 | ]
480 | }
481 | ],
482 | "source": [
483 | "pipe = Pipeline([('ct', ct), # 1. Encode the categorical features as numeric\n",
484 | " ('scaler', StandardScaler()), # 2. Normalize the numeric features\n",
485 | " ('clf', clf)]) # 3. Train a classification model using these processed features\n",
486 | "pipe.fit(X_train, y_train)\n",
487 | "train_score = pipe.score(X_train, y_train)\n",
488 | "test_score = pipe.score(X_test, y_test)\n",
489 | "print(\"train\",train_score)\n",
490 | "print(\"test\", test_score) \n",
491 | "print(classification_report(y_test, pipe.predict(X_test)))"
492 | ]
493 | },
494 | {
495 | "cell_type": "markdown",
496 | "metadata": {},
497 | "source": [
498 | "### Compare with Random Forest\n",
499 | "Just for a comparison, let's compare this model to a Random Forest model.\n",
500 | "This is simpler since Random Forests do not need the categorical features encoded with a `OneHotEncoder`."
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": 14,
506 | "metadata": {},
507 | "outputs": [
508 | {
509 | "name": "stdout",
510 | "output_type": "stream",
511 | "text": [
512 | "train 0.9981039059537353\n",
513 | "test 0.7895335608646189\n",
514 | " precision recall f1-score support\n",
515 | "\n",
516 | " False 0.83 0.90 0.86 1300\n",
517 | " True 0.63 0.47 0.54 458\n",
518 | "\n",
519 | " accuracy 0.79 1758\n",
520 | " macro avg 0.73 0.69 0.70 1758\n",
521 | "weighted avg 0.78 0.79 0.78 1758\n",
522 | "\n"
523 | ]
524 | }
525 | ],
526 | "source": [
527 | "from sklearn.ensemble import RandomForestClassifier\n",
528 | "clf_rf = RandomForestClassifier(n_estimators=100)\n",
529 | "pipe_rf = Pipeline([('scaler', StandardScaler()),\n",
530 | " ('clf', clf_rf)])\n",
531 | "pipe_rf.fit(X_train, y_train)\n",
532 | "train_score = pipe_rf.score(X_train, y_train)\n",
533 | "test_score = pipe_rf.score(X_test, y_test)\n",
534 | "print(\"train\",train_score)\n",
535 | "print(\"test\", test_score)\n",
536 | "print(classification_report(y_test, pipe_rf.predict(X_test)))"
537 | ]
538 | },
539 | {
540 | "cell_type": "markdown",
541 | "metadata": {},
542 | "source": [
543 | "### Plot ROC Curve\n",
544 | "\n",
545 | "We can also generate an ROC Curve to visualize the model's performance and calculate the AUROC"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 24,
551 | "metadata": {},
552 | "outputs": [
553 | {
554 | "name": "stdout",
555 | "output_type": "stream",
556 | "text": [
557 | "Logistic: AUROC=0.834\n"
558 | ]
559 | },
560 | {
561 | "data": {
562 | "image/png": "\n",
563 | "text/plain": [
564 | ""
565 | ]
566 | },
567 | "metadata": {
568 | "needs_background": "light"
569 | },
570 | "output_type": "display_data"
571 | }
572 | ],
573 | "source": [
574 | "from sklearn.metrics import roc_curve\n",
575 | "from sklearn.metrics import roc_auc_score\n",
576 | "from matplotlib import pyplot\n",
577 | "\n",
578 | "logistic_regression_probabilities = pipe.predict_proba(X_test)\n",
579 | "logistic_regression_probabilities = logistic_regression_probabilities[:, 1]\n",
580 | "logistic_regression_auc = roc_auc_score(y_test, logistic_regression_probabilities)\n",
581 | "print('Logistic: AUROC=%.3f' % (logistic_regression_auc))\n",
582 | "logistic_regression_fpr, logistic_regression_tpr, _ = roc_curve(y_test, logistic_regression_probabilities)\n",
583 | "pyplot.plot(logistic_regression_fpr, logistic_regression_tpr, label='Logistic')\n",
584 | "pyplot.show()"
585 | ]
586 | },
587 | {
588 | "cell_type": "markdown",
589 | "metadata": {},
590 | "source": [
591 | "We find an AUC of 0.83. Not bad for a quick exercise without fine tuning.\n"
592 | ]
593 | },
594 | {
595 | "cell_type": "markdown",
596 | "metadata": {},
597 | "source": [
598 | "## Interpretability model\n",
599 | "We use [lime](https://github.com/marcotcr/lime) (Local Interpretable Model-Agnostic Explanations) to explain the predictions.\n",
600 | "It is a method of determining which feature has the greatest effect on the predicted value,\n",
601 | "and is explained in depth in the [FFL report](https://ff06-2020.fastforwardlabs.com/).\n",
602 | "For more information, refer to the [lime documentation](https://lime-ml.readthedocs.io/en/latest/lime.html)."
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": 16,
608 | "metadata": {},
609 | "outputs": [],
610 | "source": [
611 | "from lime.lime_tabular import LimeTabularExplainer\n",
612 | "\n",
613 | "data[labels.name + ' probability'] = pipe.predict_proba(X)[:, 1]\n",
614 | "\n",
615 | "# List of length number of features, containing names of features in order\n",
616 | "# in which they appear in X\n",
617 | "feature_names = list(ce.columns_)\n",
618 | "\n",
619 | "# List of indices of columns of X containing categorical features\n",
620 | "categorical_features = list(ce.cat_columns_ix_.values())\n",
621 | "\n",
622 | "# List of (index, [cat1, cat2...]) index-strings tuples, where each index\n",
623 | "# is that of a categorical column in X, and the list of strings are the\n",
624 | "# possible values it can take\n",
625 | "categorical_names = {i: ce.classes_[c]\n",
626 | " for c, i in ce.cat_columns_ix_.items()}\n",
627 | "class_names = ['No ' + labels.name, labels.name]\n",
628 | "explainer = LimeTabularExplainer(ce.transform(data),\n",
629 | " feature_names=feature_names,\n",
630 | " class_names=class_names,\n",
631 | " categorical_features=categorical_features,\n",
632 | " categorical_names=categorical_names) \n",
633 | "\n"
634 | ]
635 | },
636 | {
637 | "cell_type": "markdown",
638 | "metadata": {},
639 | "source": [
640 | "## Explaining a Single Prediction\n",
641 | "\n",
642 | "Let's look at how one specific prediction would be interpreted.\n",
643 | "Lime explains the prediction by giving every feature a weight from -1 to 1.\n",
644 | "Features with weights closer to -1 have a stronger impact in coming up with a 0 prediction result (will not churn) and vice versa."
645 | ]
646 | },
647 | {
648 | "cell_type": "code",
649 | "execution_count": 17,
650 | "metadata": {},
651 | "outputs": [
652 | {
653 | "data": {
654 | "text/html": [
655 | "
\n",
656 | "\n",
669 | "
\n",
670 | " \n",
671 | "
\n",
672 | "
id
\n",
673 | "
4809
\n",
674 | "
\n",
675 | " \n",
676 | " \n",
677 | "
\n",
678 | "
gender
\n",
679 | "
Female
\n",
680 | "
\n",
681 | "
\n",
682 | "
SeniorCitizen
\n",
683 | "
0
\n",
684 | "
\n",
685 | "
\n",
686 | "
Partner
\n",
687 | "
No
\n",
688 | "
\n",
689 | "
\n",
690 | "
Dependents
\n",
691 | "
No
\n",
692 | "
\n",
693 | "
\n",
694 | "
tenure
\n",
695 | "
1
\n",
696 | "
\n",
697 | "
\n",
698 | "
PhoneService
\n",
699 | "
Yes
\n",
700 | "
\n",
701 | "
\n",
702 | "
MultipleLines
\n",
703 | "
No
\n",
704 | "
\n",
705 | "
\n",
706 | "
InternetService
\n",
707 | "
No
\n",
708 | "
\n",
709 | "
\n",
710 | "
OnlineSecurity
\n",
711 | "
No internet service
\n",
712 | "
\n",
713 | "
\n",
714 | "
OnlineBackup
\n",
715 | "
No internet service
\n",
716 | "
\n",
717 | "
\n",
718 | "
DeviceProtection
\n",
719 | "
No internet service
\n",
720 | "
\n",
721 | "
\n",
722 | "
TechSupport
\n",
723 | "
No internet service
\n",
724 | "
\n",
725 | "
\n",
726 | "
StreamingTV
\n",
727 | "
No internet service
\n",
728 | "
\n",
729 | "
\n",
730 | "
StreamingMovies
\n",
731 | "
No internet service
\n",
732 | "
\n",
733 | "
\n",
734 | "
Contract
\n",
735 | "
Month-to-month
\n",
736 | "
\n",
737 | "
\n",
738 | "
PaperlessBilling
\n",
739 | "
No
\n",
740 | "
\n",
741 | "
\n",
742 | "
PaymentMethod
\n",
743 | "
Mailed check
\n",
744 | "
\n",
745 | "
\n",
746 | "
MonthlyCharges
\n",
747 | "
19.9
\n",
748 | "
\n",
749 | "
\n",
750 | "
TotalCharges
\n",
751 | "
19.9
\n",
752 | "
\n",
753 | "
\n",
754 | "
Churn probability
\n",
755 | "
0.220148
\n",
756 | "
\n",
757 | " \n",
758 | "
\n",
759 | "
"
760 | ],
761 | "text/plain": [
762 | "id 4809\n",
763 | "gender Female\n",
764 | "SeniorCitizen 0\n",
765 | "Partner No\n",
766 | "Dependents No\n",
767 | "tenure 1\n",
768 | "PhoneService Yes\n",
769 | "MultipleLines No\n",
770 | "InternetService No\n",
771 | "OnlineSecurity No internet service\n",
772 | "OnlineBackup No internet service\n",
773 | "DeviceProtection No internet service\n",
774 | "TechSupport No internet service\n",
775 | "StreamingTV No internet service\n",
776 | "StreamingMovies No internet service\n",
777 | "Contract Month-to-month\n",
778 | "PaperlessBilling No\n",
779 | "PaymentMethod Mailed check\n",
780 | "MonthlyCharges 19.9\n",
781 | "TotalCharges 19.9\n",
782 | "Churn probability 0.220148"
783 | ]
784 | },
785 | "execution_count": 17,
786 | "metadata": {},
787 | "output_type": "execute_result"
788 | }
789 | ],
790 | "source": [
791 | "data.sample().T # reminder of the features"
792 | ]
793 | },
794 | {
795 | "cell_type": "code",
796 | "execution_count": 18,
797 | "metadata": {},
798 | "outputs": [
799 | {
800 | "name": "stdout",
801 | "output_type": "stream",
802 | "text": [
803 | "('tenure > 55.00', -0.2764138466515261)\n",
804 | "('MonthlyCharges > 89.86', -0.24321978003513584)\n",
805 | "('InternetService=Fiber optic', 0.2096249701592442)\n",
806 | "('TotalCharges > 3794.74', 0.2031826086609449)\n",
807 | "('StreamingMovies=Yes', 0.08274884799449057)\n",
808 | "('StreamingTV=Yes', 0.07781839117828696)\n",
809 | "('PhoneService=Yes', 0.04962121848245511)\n",
810 | "('MultipleLines=Yes', 0.04446637536101756)\n",
811 | "('Contract=One year', -0.04392535067270691)\n",
812 | "('TechSupport=No', 0.04173749428961184)\n"
813 | ]
814 | }
815 | ],
816 | "source": [
817 | "exp = explainer.explain_instance(ce.transform(data.sample())[0],pipe.predict_proba)\n",
818 | "for cols in exp.as_list():\n",
819 | " print(cols)"
820 | ]
821 | },
822 | {
823 | "cell_type": "code",
824 | "execution_count": 19,
825 | "metadata": {},
826 | "outputs": [
827 | {
828 | "data": {
829 | "image/png": "\n",
830 | "text/plain": [
831 | ""
832 | ]
833 | },
834 | "metadata": {
835 | "needs_background": "light"
836 | },
837 | "output_type": "display_data"
838 | }
839 | ],
840 | "source": [
841 | "exp.as_pyplot_figure()\n",
842 | "plt.tight_layout()"
843 | ]
844 | },
845 | {
846 | "cell_type": "markdown",
847 | "metadata": {},
848 | "source": [
849 | "We see that one of the features that contributes most strongly to the positive prediction is the short tenure of the customer."
850 | ]
851 | },
852 | {
853 | "cell_type": "markdown",
854 | "metadata": {},
855 | "source": [
856 | "## Saving the model\n",
857 | "Now that we've done all this work to build the models, we want to be able to use them later.\n",
858 | "The `ExplainedModel` class is a handy wrapper for using the `CategoricalEncoder`, the `Pipeline` object which *is* the churn model, and the Lime Explainer.\n",
859 | "Here, we use it to save these trained models for use in later parts of the Project."
860 | ]
861 | },
862 | {
863 | "cell_type": "code",
864 | "execution_count": 20,
865 | "metadata": {},
866 | "outputs": [],
867 | "source": [
868 | "from churnexplainer import ExplainedModel\n",
869 | "explainedmodel = ExplainedModel(data=data, labels=labels, model_name='telco_linear',\n",
870 | " categoricalencoder=ce, pipeline=pipe,\n",
871 | " explainer=explainer,data_dir=data_dir)\n",
872 | "explainedmodel.save()"
873 | ]
874 | },
875 | {
876 | "cell_type": "code",
877 | "execution_count": 21,
878 | "metadata": {},
879 | "outputs": [],
880 | "source": [
881 | "spark.stop()"
882 | ]
883 | },
884 | {
885 | "cell_type": "markdown",
886 | "metadata": {},
887 | "source": [
888 | "## Wrap up\n",
889 | "We've now covered all the steps to **building a machine learning model** including interpretability\n",
890 | "and saved our work for use in later sections.\n",
891 | "\n",
892 | "In the next part of the series we will explore how to use the **Experiments** feature of CML\n",
893 | "for when we want to test lots of combinations of hyperparameters to fine tune our models.\n"
894 | ]
895 | }
896 | ],
897 | "metadata": {
898 | "kernelspec": {
899 | "display_name": "Python 3",
900 | "language": "python",
901 | "name": "python3"
902 | },
903 | "language_info": {
904 | "codemirror_mode": {
905 | "name": "ipython",
906 | "version": 3
907 | },
908 | "file_extension": ".py",
909 | "mimetype": "text/x-python",
910 | "name": "python",
911 | "nbconvert_exporter": "python",
912 | "pygments_lexer": "ipython3",
913 | "version": "3.6.9"
914 | }
915 | },
916 | "nbformat": 4,
917 | "nbformat_minor": 4
918 | }
919 |
--------------------------------------------------------------------------------
/4_train_models.py:
--------------------------------------------------------------------------------
1 | # Part 4: Model Training
2 |
3 | # This script trains an Explained model and also shows how to use
4 | # Jobs to run model training and the Experiments feature of CML to facilitate model
5 | # tuning.
6 |
7 | # If you haven't yet, run through the initialization steps in the README file and Part 1.
8 | # In Part 1, the data is imported into the `default.telco_churn` table in Hive.
9 | # All data accesses fetch from Hive.
10 | #
11 | # To simply train the model once, run this file in a workbench session.
12 | #
13 | # There are 2 other ways of running the model training process
14 | #
15 | # ***Scheduled Jobs***
16 | #
17 | # The **[Jobs](https://docs.cloudera.com/machine-learning/cloud/jobs-pipelines/topics/ml-creating-a-job.html)**
18 | # feature allows for ad hoc, recurring and dependent jobs to run specific scripts. To run this model
19 | # training process as a job, create a new job by going to the Project window and clicking _Jobs >
20 | # New Job_ and entering the following settings:
21 | # * **Name** : Train Model
22 | # * **Script** : 4_train_models.py
23 | # * **Arguments** : _Leave blank_
24 | # * **Kernel** : Python 3
25 | # * **Schedule** : Manual
26 | # * **Engine Profile** : 1 vCPU / 2 GiB
27 | # The rest can be left as is. Once the job has been created, click **Run** to start a manual
28 | # run for that job.
29 |
30 | # ***Experiments***
31 | #
32 | # Training a model for use in production requires testing many combinations of model parameters
33 | # and picking the best one based on one or more metrics.
34 | # In order to do this in a *principled*, *reproducible* way, an Experiment executes model training code with **versioning** of the **project code**, **input parameters**, and **output artifacts**.
35 | # This is a very useful feature for testing a large number of hyperparameters in parallel on elastic cloud resources.
36 |
37 | # **[Experiments](https://docs.cloudera.com/machine-learning/cloud/experiments/topics/ml-running-an-experiment.html)**
38 | # run immediately and are used for testing different parameters in a model training process.
39 | # In this instance it would be used for hyperparameter optimisation. To run an experiment, from the
40 | # Project window click Experiments > Run Experiment with the following settings.
41 | # * **Script** : 4_train_models.py
42 | # * **Arguments** : 5 lbfgs 100 _(these are the cv, solver and max_iter parameters to be passed to the
43 | # LogisticRegressionCV() function)_
44 | # * **Kernel** : Python 3
45 | # * **Engine Profile** : 1 vCPU / 2 GiB
46 |
47 | # Click **Start Run** and the experiment will be scheduled to build and run. Once the Run is
48 | # completed you can view the outputs that are tracked with the experiment using the
49 | # `cdsw.track_metric` function. It's worth reading through the code to get a sense of what
50 | # all is going on.
51 |
52 | # More Details on Running Experiments
53 | # Requirements
54 | # Experiments have a few requirements:
55 | # - model training code in a `.py` script, not a notebook
56 | # - `requirements.txt` file listing package dependencies
57 | # - a `cdsw-build.sh` script containing code to install all dependencies
58 | #
59 | # These three components are provided for the churn model as `4_train_models.py`, `requirements.txt`,
60 | # and `cdsw-build.sh`, respectively.
61 | # You can see that `cdsw-build.sh` simply installs packages from `requirements.txt`.
62 | # The code in `4_train_models.py` is largely identical to the code in the last notebook,
63 | # with a few differences.
64 | #
65 | # The first difference from the last notebook is at the "Experiments options" section.
66 | # When you set up a new Experiment, you can enter
67 | # [**command line arguments**](https://docs.python.org/3/library/sys.html#sys.argv)
68 | # in standard Python fashion.
69 | # This will be where you enter the combination of model hyperparameters that you wish to test.
70 | #
71 | # The other difference is at the end of the script.
72 | # Here, the `cdsw` package (available by default) provides
73 | # [two methods](https://docs.cloudera.com/machine-learning/cloud/experiments/topics/ml-tracking-metrics.html)
74 | # to let the user evaluate results.
75 | #
76 | # **`cdsw.track_metric`** stores a single value which can be viewed in the Experiments UI.
77 | # Here we store two metrics and the filepath to the saved model.
78 | #
79 | # **`cdsw.track_file`** stores a file for later inspection.
80 | # Here we store the saved model, but we could also have saved a report csv, plot, or any other
81 | # output file.
82 | #
83 |
84 |
85 | from pyspark.sql.types import *
86 | from pyspark.sql import SparkSession
87 | import sys
88 | import os
89 | import os
90 | import datetime
91 | import subprocess
92 | import glob
93 | import dill
94 | import pandas as pd
95 | import numpy as np
96 | import cdsw
97 |
98 | from sklearn.model_selection import train_test_split
99 | from sklearn.metrics import classification_report
100 | from sklearn.preprocessing import OneHotEncoder, StandardScaler
101 | from sklearn.pipeline import Pipeline
102 | from sklearn.linear_model import LogisticRegressionCV
103 | from sklearn.pipeline import TransformerMixin
104 | from sklearn.preprocessing import LabelEncoder
105 | from sklearn.compose import ColumnTransformer
106 |
107 | from lime.lime_tabular import LimeTabularExplainer
108 |
109 | from churnexplainer import ExplainedModel, CategoricalEncoder
110 |
# Directory handed to ExplainedModel as `data_dir` when the model is saved.
data_dir = '/home/cdsw'

# Names of the customer-identifier column and of the label (target) column.
idcol = 'customerID'
labelcol = 'Churn'

# Feature columns in model order, each paired with a flag that is True for a
# categorical column and False for a numeric one. Only tenure and the two
# charge columns are numeric; every other feature is categorical.
cols = tuple(
    (name, name not in ('tenure', 'MonthlyCharges', 'TotalCharges'))
    for name in (
        'gender',
        'SeniorCitizen',
        'Partner',
        'Dependents',
        'tenure',
        'PhoneService',
        'MultipleLines',
        'InternetService',
        'OnlineSecurity',
        'OnlineBackup',
        'DeviceProtection',
        'TechSupport',
        'StreamingTV',
        'StreamingMovies',
        'Contract',
        'PaperlessBilling',
        'PaymentMethod',
        'MonthlyCharges',
        'TotalCharges',
    )
)
134 |
135 |
# Load the churn data. The Hive table created in Part 1 is preferred; the CSV
# bundled with the project is the fail-safe when that table is missing,
# unreachable, or empty.
df = None
try:
    spark = SparkSession\
        .builder\
        .appName("PythonSQL")\
        .master("local[*]")\
        .getOrCreate()

    # Only pull the table into pandas when it actually holds rows.
    if spark.sql("SELECT count(*) FROM default.telco_churn").collect()[0][0] > 0:
        df = spark.sql("SELECT * FROM default.telco_churn").toPandas()
except Exception:
    # Deliberately broad: any Spark/Hive failure should fall through to the CSV.
    # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently swallowed.
    print("Hive table has not been created")

if df is None:
    # Reached both when Hive raised and when the table exists but is empty;
    # without this guard the empty-table case left `df` undefined.
    df = pd.read_csv(os.path.join(
        'raw', 'WA_Fn-UseC_-Telco-Customer-Churn-.csv'))
150 |
# Clean and shape the data for the logistic regression model and for LIME.
# Cells holding a single whitespace character are treated as missing, rows
# with any missing value are dropped, and the old index becomes a column.
df = df.replace(r'^\s$', np.nan, regex=True)
df = df.dropna()
df = df.reset_index()
df.index.name = 'id'

# Separate the label column from the feature frame.
labels = df[labelcol]
data = df.drop(labelcol, axis=1)

# Spell the 0/1 senior-citizen flag the same way as the other Yes/No columns.
data = data.replace({'SeniorCitizen': {1: 'Yes', 0: 'No'}})

# Keep only the feature columns, in the order declared in `cols`.
data = data[[name for name, _ in cols]]

# Store every categorical feature with the pandas Categorical dtype.
catcols = (name for name, is_categorical in cols if is_categorical)
for col in catcols:
    data[col] = pd.Categorical(data[col])

# Boolean target: True where the label is the string 'Yes'.
labels = (labels == 'Yes')

# Prepare the pipeline inputs and split the data for model training.
# NOTE(review): CategoricalEncoder comes from churnexplainer.py; presumably it
# maps the categorical columns to integer codes - confirm against that module.
ce = CategoricalEncoder()
X = ce.fit_transform(data)
y = labels.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# One-hot encode the categorical column positions reported by the encoder and
# pass every remaining column through unchanged.
categorical_positions = list(ce.cat_columns_ix_.values())
ct = ColumnTransformer(
    [('ohe', OneHotEncoder(), categorical_positions)],
    remainder='passthrough'
)
172 |
173 | # Experiments options
174 | # If you are running this as an experiment, pass the cv, solver and max_iter values
175 | # as arguments in that order. e.g. `5 lbfgs 100`.
176 |
177 | if len(sys.argv) == 4:
178 | try:
179 | cv = int(sys.argv[1])
180 | solver = str(sys.argv[2])
181 | max_iter = int(sys.argv[3])
182 | except:
183 | sys.exit("Invalid Arguments passed to Experiment")
184 | else:
185 | cv = 5
186 | solver = 'lbfgs' # one of newton-cg, lbfgs, liblinear, sag, saga
187 | max_iter = 100
188 |
# Model: one-hot encoding -> feature scaling -> cross-validated logistic
# regression, configured with the cv / solver / max_iter chosen above.
clf = LogisticRegressionCV(cv=cv, solver=solver, max_iter=max_iter)
pipeline_steps = [
    ('ct', ct),
    ('scaler', StandardScaler()),
    ('clf', clf),
]
pipe = Pipeline(pipeline_steps)

# Fit on the training split, then report the score on both splits.
pipe.fit(X_train, y_train)
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
print("train", train_score)
print("test", test_score)

# Per-class precision / recall / f1 on the held-out split.
test_predictions = pipe.predict(X_test)
print(classification_report(y_test, test_predictions))

# Attach the predicted probability of the positive class (second column of
# predict_proba) for every row, in a column named '<label name> probability'.
positive_class_probability = pipe.predict_proba(X)[:, 1]
data[labels.name + ' probability'] = positive_class_probability
202 |
203 |
# Create LIME Explainer
# Feature names in the order reported by the fitted encoder.
feature_names = list(ce.columns_)

# Positions of the categorical columns in the encoded matrix.
categorical_features = list(ce.cat_columns_ix_.values())

# Map each categorical column position to the values that column can take.
categorical_names = {}
for column_name, column_position in ce.cat_columns_ix_.items():
    categorical_names[column_position] = ce.classes_[column_name]

# Negative class first, positive class second.
class_names = ['No ' + labels.name, labels.name]

encoded_data = ce.transform(data)
explainer = LimeTabularExplainer(encoded_data,
                                 feature_names=feature_names,
                                 class_names=class_names,
                                 categorical_features=categorical_features,
                                 categorical_names=categorical_names)


# Create and save the combined Logistic Regression and LIME Explained Model.
explainedmodel = ExplainedModel(
    data=data,
    labels=labels,
    model_name='telco_linear',
    categoricalencoder=ce,
    pipeline=pipe,
    explainer=explainer,
    data_dir=data_dir,
)
explainedmodel.save()


# If running as an experiment, this will track the metrics and add the model trained in this
# training run to the experiment history.
for metric_name, metric_value in (
    ("train_score", round(train_score, 2)),
    ("test_score", round(test_score, 2)),
    ("model_path", explainedmodel.model_path),
):
    cdsw.track_metric(metric_name, metric_value)
cdsw.track_file(explainedmodel.model_path)
230 |
231 | # Wrap up
232 |
233 | # We've now covered all the steps to **running Experiments**.
234 | #
235 | # Notice also that any script that will run as an Experiment can also be run as a Job or in a Session.
236 | # Our provided script can be run with the same settings as for Experiments.
237 | # A common use case is to **automate periodic model updates**.
238 | # Jobs can be scheduled to run the same model training script once a week using the latest data.
239 | # Another Job dependent on the first one can update the model parameters being used in production
240 | # if model metrics are favorable.
241 |
--------------------------------------------------------------------------------
/5_model_serve_explainer.py:
--------------------------------------------------------------------------------
1 | ## Part 5: Model Serving
2 | #
3 | # This notebook explains how to create and deploy Models in CML which function as a
4 | # REST API to serve predictions. This feature makes it very easy for a data scientist
5 | # to make trained models available and usable to other developers and data scientists
6 | # in your organization.
7 | #
8 | # In the last part of the series, you learned:
9 | # - the requirements for running an Experiment
10 | # - how to set up a new Experiment
11 | # - how to monitor the results of an Experiment
12 | # - limitations of the feature
13 | #
14 | # In this part, you will learn:
15 | # - the requirements for creating and deploying a Model
16 | # - how to deploy a Model
17 | # - how to test and use a Model
18 | # - limitations of the feature
19 | #
20 | # If you haven't yet, run through the initialization steps in the README file and Part 1.
21 | # In Part 1, the data is imported into the `default.telco_churn` table in Hive.
22 | # All data accesses fetch from Hive.
23 | #
24 | ### Requirements
25 | # Models have the same requirements as Experiments:
26 | # - model code in a `.py` script, not a notebook
27 | # - a `requirements.txt` file listing package dependencies
28 | # - a `cdsw-build.sh` script containing code to install all dependencies
29 | #
30 | # > In addition, Models *must* be designed with one main function that takes a dictionary as its sole argument
31 | # > and returns a single dictionary.
32 | # > CML handles the JSON serialization and deserialization.
33 |
34 | # In this file, there is minimal code since calculating predictions is much simpler
35 | # than training a machine learning model.
36 | # Once again, we use the `ExplainedModel` helper class in `churnexplainer.py`.
37 | # When a Model API is called, CML will translate the input and returned JSON blobs to and from python dictionaries.
38 | # Thus, the script simply loads the model we saved at the end of the last notebook,
39 | # passes the input dictionary into the model, and returns the results as a dictionary with the following format:
40 | #
41 | # {
42 | # 'data': dict(data),
43 | # 'probability': probability,
44 | # 'explanation': explanation
45 | # }
46 | #
47 | # The Model API will return this dictionary serialized as JSON.
48 | #
49 | ### Model Operations
50 | #
51 | # This model is deployed using the model operations feature of CML which consists of
52 | # [Model Metrics](https://docs.cloudera.com/machine-learning/cloud/model-metrics/topics/ml-enabling-model-metrics.html)
53 | # and [Model Governance](https://docs.cloudera.com/machine-learning/cloud/model-governance/topics/ml-enabling-model-governance.html)
54 | #
55 | # The first requirement is to make the model use the model metrics feature by adding the
56 | # `@cdsw.model_metrics` [Python Decorator](https://wiki.python.org/moin/PythonDecorators)
57 | # before the function.
58 | #
59 | # Then you can use the *`cdsw.track_metric`* function to add additional
60 | # data to the underlying database for each call made to the model.
61 | # **Note:** `cdsw.track_metric` has different functionality depending on whether it's being
62 | # used in an *Experiment* or a *Model*.
63 | #
64 | # More detail is available
65 | # using the `help(cdsw.track_metric)` function
66 | #```
67 | # help(cdsw.track_metric)
68 | # Help on function track_metric in module cdsw:
69 | #
70 | # track_metric(key, value)
71 | # Description
72 | # -----------
73 | #
74 | # Tracks a metric for an experiment or model deployment
75 | # Example:
76 | # model deployment usage:
77 | # >>>@cdsw.model_metrics
78 | # >>>predict_func(args):
79 | # >>> cdsw.track_metric("input_args", args)
80 | # >>> return {"result": "prediction"}
81 | #
82 | # experiment usage:
83 | # >>>cdsw.track_metric("input_args", args)
84 | #
85 | # Parameters
86 | # ----------
87 | # key: string
88 | # The metric key to track
89 | # value: string, boolean, numeric
90 | # The metric value to track
91 | #```
92 | #
93 | #
94 | ### Creating and deploying a Model
95 | # To create a Model using our `5_model_serve_explainer.py` script, use the following settings:
96 | # * **Name**: Explainer
97 | # * **Description**: Explain customer churn prediction
98 | # * **File**: `5_model_serve_explainer.py`
99 | # * **Function**: explain
100 | # * **Input**:
101 | # ```
102 | # {
103 | # "StreamingTV": "No",
104 | # "MonthlyCharges": 70.35,
105 | # "PhoneService": "No",
106 | # "PaperlessBilling": "No",
107 | # "Partner": "No",
108 | # "OnlineBackup": "No",
109 | # "gender": "Female",
110 | # "Contract": "Month-to-month",
111 | # "TotalCharges": 1397.475,
112 | # "StreamingMovies": "No",
113 | # "DeviceProtection": "No",
114 | # "PaymentMethod": "Bank transfer (automatic)",
115 | # "tenure": 29,
116 | # "Dependents": "No",
117 | # "OnlineSecurity": "No",
118 | # "MultipleLines": "No",
119 | # "InternetService": "DSL",
120 | # "SeniorCitizen": "No",
121 | # "TechSupport": "No"
122 | # }
123 | # ```
124 | #* **Kernel**: Python 3
125 | #* **Engine Profile**: 1 vCPU / 2 GiB Memory
126 | #
127 | # The rest can be left as is.
128 | #
129 | # After accepting the dialog, CML will *build* a new Docker image using `cdsw-build.sh`,
130 | # then *assign an endpoint* for sending requests to the new Model.
131 |
132 | ## Testing the Model
133 | # > To verify it's returning the right results in the format you expect, you can
134 | # > test any Model from its *Overview* page.
135 | #
136 | # If you entered an *Example Input* before, it will be the default input here,
137 | # though you can enter your own.
138 |
139 | ## Using the Model
140 | #
141 | # > The *Overview* page also provides sample `curl` or Python commands for calling your Model API.
142 | # > You can adapt these samples for other code that will call this API.
143 | #
144 | # This is also where you can find the full endpoint to share with other developers
145 | # and data scientists.
146 | #
147 | # **Note:** for security, you can specify
148 | # [Model API Keys](https://docs.cloudera.com/machine-learning/cloud/models/topics/ml-model-api-key-for-models.html)
149 | # to add authentication.
150 |
151 | ## Limitations
152 | #
153 | # Models do have a few limitations that are important to know:
154 | # - re-deploying or re-building Models results in Model downtime (usually brief)
155 | # - re-starting CML does not automatically restart active Models
156 | # - Model logs and statistics are only preserved so long as the individual replica is active
157 | #
158 | # A current list of known limitations is
159 | # [documented here](https://docs.cloudera.com/machine-learning/cloud/models/topics/ml-models-known-issues-and-limitations.html).
160 |
161 |
162 | from collections import ChainMap
163 | import cdsw, numpy
164 | from churnexplainer import ExplainedModel
165 |
166 | # Load the model saved earlier.
167 | em = ExplainedModel(model_name='telco_linear',data_dir='/home/cdsw')
168 |
169 | # *Note:* If you want to test this in a session, comment out the line
170 | #`@cdsw.model_metrics` below. Don't forget to uncomment when you
171 | # deploy, or it won't write the metrics to the database
172 |
@cdsw.model_metrics
def explain(args):
    """Serve one churn prediction together with its LIME explanation.

    This is the main function used for serving the model. It takes the
    JSON-formatted arguments, calculates the probability of churn, creates a
    LIME explanation for that instance and returns all of it as a dictionary.

    Args:
        args: Dictionary of feature values supplied by the caller. Any
            feature missing from it is looked up in ``em.default_data``.

    Returns:
        A dictionary with the keys ``'data'`` (the record that was scored),
        ``'probability'`` and ``'explanation'``.
    """
    # Caller-supplied values take precedence; the model's defaults fill gaps.
    record = dict(ChainMap(args, em.default_data))
    # NOTE(review): cast_dct presumably coerces values to the types the model
    # expects - confirm against churnexplainer.py.
    record = em.cast_dct(record)
    probability, explanation = em.explain_dct(record)

    # Track the inputs, our prediction and the explanation, in that order.
    for metric_name, metric_value in (
        ('input_data', record),
        ('probability', probability),
        ('explanation', explanation),
    ):
        cdsw.track_metric(metric_name, metric_value)

    return {
        'data': dict(record),
        'probability': probability,
        'explanation': explanation,
    }
195 |
196 | # To test this in a session, comment out the `@cdsw.model_metrics` line,
197 | # uncomment and run the two rows below.
198 | #x={"StreamingTV":"No","MonthlyCharges":70.35,"PhoneService":"No","PaperlessBilling":"No","Partner":"No","OnlineBackup":"No","gender":"Female","Contract":"Month-to-month","TotalCharges":1397.475,"StreamingMovies":"No","DeviceProtection":"No","PaymentMethod":"Bank transfer (automatic)","tenure":29,"Dependents":"No","OnlineSecurity":"No","MultipleLines":"No","InternetService":"DSL","SeniorCitizen":"No","TechSupport":"No"}
199 | #explain(x)
200 |
201 | ## Wrap up
202 | #
203 | # We've now covered all the steps to **deploying and serving Models**, including the
204 | # requirements, limitations, and how to set up, test, and use them.
205 | # This is a powerful way to get data scientists' work in use by other people quickly.
206 | #
207 | # In the next part of the project we will explore how to launch a **web application**
208 | # served through CML.
209 | # Your team is busy building models to solve problems.
210 | # CML-hosted Applications are a simple way to get these solutions in front of
211 | # stakeholders quickly.
--------------------------------------------------------------------------------
/6_application.py:
--------------------------------------------------------------------------------
1 | # Part 6: Application
2 |
3 | # This script explains how to create and deploy Applications in CML.
4 | # This feature allows data scientists to **get ML solutions in front of stakeholders quickly**,
5 | # including business users who need results fast.
6 | # This may be good for sharing a **highly customized dashboard**, a **monitoring tool**, or a **product mockup**.
7 |
8 | # CML is agnostic regarding frameworks.
9 | # [Flask](https://flask.palletsprojects.com/en/1.1.x/),
10 | # [Dash](https://plotly.com/dash/),
11 | # or even [Tornado](https://www.tornadoweb.org/en/stable/) apps will all work.
12 | # R users will find it easy to deploy Shiny apps.
13 |
14 | # If you haven't yet, run through the initialization steps in the README file. Do that
15 | # now
16 |
17 | # This file provides a sample Flask app script, ready for deployment,
18 | # which displays churn predictions and explanations using the Model API deployed in
19 | # Part 5
20 |
21 | # Deploying the Application
22 | #
23 | # > Once you have written an app that is working as desired, including in a test Session,
24 | # > it can be deployed using the *New Application* dialog in the *Applications* tab in CML.
25 |
26 | # After accepting the dialog, CML will deploy the application then *assign a URL* to
27 | # the Application using the subdomain you chose.
28 | #
29 | # *Note:* This does not require the `cdsw-build.sh` file as it now follows a
30 | # separate build process to deploy an application.
31 | #
32 |
33 | # To create an Application using our sample Flask app, perform the following.
34 | # This is a special step for this particular app:
35 | #
36 | # In the deployed Model from step 5, go to *Model* > *Settings* in CML and make a note (i.e. copy) the
37 | # "**Access Key**". eg - `mqc8ypo...pmj056y`
38 | #
39 | # While you're there, **disable** the additional Model authentication feature by unticking **Enable Authentication**.
40 | #
41 | # **Note**: Disabling authentication is only necessary for this Application to work.
42 | # Ordinarily, you may want to keep Authentication in place.
43 | #
44 | # Next, from the Project level, click on *Open Workbench* (note you don't actually have to Launch a
45 | # Session) in order to edit a file. Select the `flask/single_view.html` file and paste the Access
46 | # Key in at line 19.
47 | #
48 | # ` const accessKey = "mp3ebluylxh4yn5h9xurh1r0430y76ca";`
49 | #
50 | # Save the file (if it has not auto saved already) and go back to the Project.
51 | #
52 | # Finally, go to the *Applications* section of the Project and select *New Application* with the following:
53 | # * **Name**: Churn Analysis App
54 | # * **Subdomain**: churn-app _(note: this needs to be unique, so if you've done this before,
55 | # pick a more random subdomain name)_
56 | # * **Script**: 6_application.py
57 | # * **Kernel**: Python 3
58 | # * **Engine Profile**: 1 vCPU / 2 GiB Memory
59 | #
60 | # Accept the inputs, and in a few minutes the Application will be ready to use.
61 |
62 | # Using the Application
63 |
64 | # > A few minutes after deploying, the *Applications* page will show the app as Running.
65 | # You can then click on its name to access it.
66 | # CML Applications are accessible by any user with read-only (or higher) access to the project.
67 | #
68 |
69 | # This deploys a basic flask application for serving the HTML and some specific data
70 | # used by the project Application.
71 |
72 | # At this point, you will be able to open the Churn Analysis App.
73 | # The initial view is a table of randomly selected customers from the dataset.
74 | # This provides a snapshot of the customer base as a whole.
75 | # The colors in the *Probability* column correspond to the prediction, with red customers being deemed more likely to churn.
76 | # The colors of the features show which are most important for each prediction.
77 | # Deeper red indicates increased importance for predicting that a customer **will churn**
78 | # while deeper blue indicates increased importance for predicting that a customer **will not**.
79 | #
80 | from flask import Flask, send_from_directory, request
81 | from IPython.display import Javascript, HTML
82 | import random
83 | import os
84 | from churnexplainer import ExplainedModel
85 | from collections import ChainMap
86 | from flask import Flask
87 | from pandas.io.json import dumps as jsonify
88 | import logging
89 | import subprocess
90 | from IPython.display import Image
91 | Image("images/table_view.png")
92 | #
93 | # Clicking on any row will show a "local" interpreted model for that particular customer.
94 | # Here, you can see how adjusting any one of the features will change that customer's churn prediction.
95 | #
96 | Image("images/single_view_1.png")
97 | #
98 | # Changing the *InternetService* to *DSL* lowers the probability of churn.
99 | # **Note**: this obviously does *not* mean that you should change that customer's internet service to DSL
100 | # and expect they will be less likely to churn.
101 | # Imagine if your ISP did that to you.
102 | # Rather, the model is more optimistic about an otherwise identical customer who has been using DSL.
103 | # This information simply gives you a clearer view of what to expect given specific factors
104 | # as a starting point for developing your business strategies.
105 | # Furthermore, as you start implementing changes based on the model, it may change customers' behavior
106 | # so that the predictions stop being reliable.
107 | # It's important to use Jobs to keep models up-to-date.
108 | #
109 | Image("images/single_view_2.png")
110 | #
111 | # There are many frameworks that ease the development of interactive, informative webapps.
112 | # Once written, it is straightforward to deploy them in CML.
113 |
114 |
115 | # This reduces the output to the console window
116 | log = logging.getLogger('werkzeug')
117 | log.setLevel(logging.ERROR)
118 |
# Since we have access in an environment variable, we want to write it to our UI
# Change the line in the flask/single_view.html file.
access_key = os.environ.get('SHTM_ACCESS_KEY')
if access_key is not None:
    # Raw string: `\s` must reach sed untouched; in a normal string literal it
    # is an invalid Python escape sequence and triggers a warning.
    subprocess.call(["sed", "-i",
                     r's/const\saccessKey.*/const accessKey = "' + access_key + '";/',
                     "/home/cdsw/flask/single_view.html"])
125 |
126 |
127 | # Load the explained model
128 | em = ExplainedModel(model_name='telco_linear', data_dir='/home/cdsw')
129 |
130 | # Creates an explained version of a particular data point. This is almost exactly the same as the data used in the model serving code.
131 |
132 |
def explainid(N):
    """Build the explained-prediction payload for customer ``N``.

    The customer's row is fetched with ``dataid``; the ``id`` and
    ``Churn probability`` fields are removed, the remaining fields are cast
    with ``em.cast_dct`` and passed to ``em.explain_dct``.

    Returns a dict with the keys ``data``, ``probability``, ``explanation``
    and ``id`` (``N`` converted to ``int``).
    """
    record = dataid(N)[0]
    # Remove the fields that are not passed on to the explainer.
    for unwanted in ('id', 'Churn probability'):
        record.pop(unwanted)
    features = em.cast_dct(record)
    churn_probability, churn_explanation = em.explain_dct(features)
    payload = {}
    payload['data'] = dict(features)
    payload['probability'] = churn_probability
    payload['explanation'] = churn_explanation
    payload['id'] = int(N)
    return payload
143 |
144 | # Gets the rest of the row data for a particular customer.
145 |
146 |
def dataid(N):
    """Return the row(s) of ``em.data`` for customer ``N`` as record dicts.

    ``N`` is first converted to the dtype of the data frame's index so that a
    string id can be used for the label lookup.
    """
    to_index_type = em.data.index.dtype.type
    matching_rows = em.data.loc[[to_index_type(N)]]
    return matching_rows.reset_index().to_dict(orient='records')
151 |
152 |
153 | # Flask doing flasky things
154 | flask_app = Flask(__name__, static_url_path='')
155 |
156 |
@flask_app.route('/')
def home():
    """Handle the application root; responds with an empty body."""
    return ""
160 |
161 |
@flask_app.route('/flask/<path:path>')
def send_file(path):
    """Serve a file from the local ``flask`` directory.

    The ``<path:path>`` converter is what supplies the ``path`` argument;
    without a variable segment in the rule Flask would call this view with
    no arguments and the request would fail.

    path: file path relative to the ``flask`` directory (may contain slashes).
    """
    return send_from_directory('flask', path)
165 |
166 | # Grabs a sample explained dataset for 10 randomly selected customers.
167 |
168 |
@flask_app.route('/sample_table')
def sample_table():
    """Return JSON with explained predictions for 10 randomly drawn customers.

    Ids are drawn without replacement from ``range(1, len(em.data))`` and each
    one is passed to ``explainid`` as a string.
    """
    chosen_ids = random.sample(range(1, len(em.data)), 10)
    explained_rows = [explainid(str(customer)) for customer in chosen_ids]
    return jsonify(explained_rows)
176 |
177 | # Shows the names and all the categories of the categorical variables.
178 |
179 |
@flask_app.route("/categories")
def categories():
    """Return JSON mapping each feature in ``em.categories`` to an
    ``{position: category}`` dict built by enumerating its categories."""
    numbered = {}
    for feature_name, feature_categories in em.categories.items():
        numbered[feature_name] = dict(enumerate(feature_categories))
    return jsonify(numbered)
184 |
185 | # Shows the names and all the statistical variations of the numerical variables.
186 |
187 |
@flask_app.route("/stats")
def stats():
    """Return the explained model's ``stats`` attribute serialized as JSON."""
    return jsonify(em.stats)
191 |
192 |
# A handy way to get the link if you are running in a session.
# The anchor and the two `{}` placeholders are required: with plain text only,
# `.format()` silently drops the engine id and domain and nothing is clickable.
# NOTE(review): URL shape `https://<engine id>.<domain>` is reconstructed from
# the two format arguments - confirm against a running session.
HTML("<a href='https://{}.{}'>Open Table View</a>".format(
    os.environ['CDSW_ENGINE_ID'], os.environ['CDSW_DOMAIN']))
196 |
197 | # Launches flask. Note the host and port details. This is specific to CML/CDSW
198 | if __name__ == "__main__":
199 | flask_app.run(host='127.0.0.1', port=int(os.environ['CDSW_APP_PORT']))
200 |
--------------------------------------------------------------------------------
/7a_ml_ops_simulation.py:
--------------------------------------------------------------------------------
1 | ## Part 7a - Model Operations - Drift Simulation
2 | #
3 | # This script showcases how to use the model operations features of CML.
4 | # This feature allows machine learning engineers to **measure and manage models
5 | # through their life cycle**, and know how a model is performing over time. As part
6 | # of the larger machine learning lifecycle, this closes the loop on managing
7 | # models that have been deployed into production.
8 |
9 | ### Add Model Metrics
10 | # New metrics can be added to a model and existing ones updated using the `cdsw`
11 | # library and the [model metrics SDK](https://docs.cloudera.com/machine-learning/cloud/model-metrics/topics/ml-tracking-model-metrics-using-python.html)
12 | # If model metrics is enabled for a model, then every call to that model is recorded
13 | # in the model metric database. There are situations in which it's necessary to update or
14 | # add to those recorded metrics. This script shows you how this works.
15 |
16 | #### Update Existing Tracked Metrics
17 | # This is part of what is called "ground truth". Certain machine learning implementations,
18 | # (like this very project) will use a supervised approach where a model is making a
19 | # prediction and the actual value (or label) is only available at a later stage. To check
20 | # how well a model is performing, these actual values need to be compared with the
21 | # prediction of the model. Each time a model end point is called, it provides the response
22 | # from the function, some other details and a unique uuid for that response.
23 | # This tracked model response entry can then be updated at a later date to add the
24 | # actual "ground truth" value, or any other data that you want to add.
25 | #
26 | # Data can be added to a tracked model response using the `cdsw.track_delayed_metrics`.
27 | #
28 | # ```python
29 | # help(cdsw.track_delayed_metrics)
30 | # Help on function track_delayed_metrics in module cdsw:
31 | #
32 | # track_delayed_metrics(metrics, prediction_uuid)
33 | # Description
34 | # -----------
35 | #
36 | # Track a metric for a model prediction that is only known after prediction time.
37 | # For example, for a model that makes a binary or categorical prediction, the actual
38 | # correctness of the prediction is not known at prediction time. This function can be
39 | # used to retroactively to track a prediction's correctness later, when ground truth
40 | # is available
41 | # Example:
42 | # >>>track_delayed_metrics({"ground_truth": "value"}, "prediction_uuid")
43 | #
44 | # Parameters
45 | # ----------
46 | # metrics: object
47 | # metrics object
48 | # prediction_uuid: string, UUID
49 | # prediction UUID of model metrics
50 | # ```
51 |
52 | #### Adding Additional Metrics
53 | # It is also possible to add additional data/metrics to the model database to track
54 | # things like aggregate metrics that aren't associated with one particular response.
55 | # This can be done using the `cdsw.track_aggregate_metrics` function.
56 |
57 | # ```python
58 | # help(cdsw.track_aggregate_metrics)
59 | # Help on function track_aggregate_metrics in module cdsw:
60 | #
61 | # track_aggregate_metrics(metrics, start_timestamp_ms, end_timestamp_ms, model_deployment_crn=None)
62 | # Description
63 | # -----------
64 | #
65 | # Track aggregate metric data for model deployment or model build or model
66 | # Example:
67 | # >>>track_aggregate_metrics({"val_count": 125}, 1585685142786,
68 | # ... 1585685153602, model_deployment_crn="/db401b6a-4b26-4c8f-8ea6-a1b09b93db88"))
69 | #
70 | # Parameters
71 | # ----------
72 | # metrics: object
73 | # metrics data object
74 | # start_timestamp_ms: int
75 | # aggregated metrics start timestamp in milliseconds
76 | # end_timestamp_ms: int
77 | # aggregated metrics end timestamp in milliseconds
78 | # model_deployment_crn: string
79 | # model deployment Crn
80 | # ```
81 | #
82 |
83 | ### Model Drift Simulation
84 | # This script simulates making calls to the model using sample data, and slowly
85 | # introducing an increasing amount of random variation to the churn value so that
86 | # the model will be less accurate over time.
87 |
88 | # The script will grab 1000 random samples from the data set and simulate 1000
89 | # predictions. The live model will be called each time in the loop and while the
90 | # `churn_error` function adds an increasing amount of error to the data to make
91 | # the model less accurate. The actual value, the response value and the uuid are
92 | # added to an array.
93 | #
94 | # Then there is a "ground truth" loop that iterates through the array and updates the
95 | # recorded metric to add the actual label value using the uuid. At the same time, the
96 | # model accuracy is evaluated every 100 samples and added as an aggregate metric.
97 | # Over time this accuracy metric falls due to the error introduced into the data.
98 |
99 |
100 | import cdsw, time, os, random, json
101 | import numpy as np
102 | import pandas as pd
103 | import matplotlib.pyplot as plt
104 | from sklearn.metrics import classification_report
105 | from cmlbootstrap import CMLBootstrap
106 | import seaborn as sns
107 | import copy
108 |
109 |
110 | ## Set the model ID
111 | # Get the model id from the model you deployed in step 5. These are unique to each
112 | # model on CML.
113 |
114 | model_id = "63"
115 |
116 | # Grab the data from Hive.
117 | from pyspark.sql import SparkSession
118 | from pyspark.sql.types import *
119 | spark = SparkSession\
120 | .builder\
121 | .appName("PythonSQL")\
122 | .master("local[*]")\
123 | .getOrCreate()
124 |
125 | df = spark.sql("SELECT * FROM default.telco_churn").toPandas()
126 |
127 | # Get the various Model CRN details
128 | HOST = os.getenv("CDSW_API_URL").split(":")[0] + "://" + os.getenv("CDSW_DOMAIN")
129 | cml = CMLBootstrap()
130 |
131 | latest_model = cml.get_model({"id": model_id, "latestModelDeployment": True, "latestModelBuild": True})
132 |
133 | Model_CRN = latest_model ["crn"]
134 | Deployment_CRN = latest_model["latestModelDeployment"]["crn"]
135 | model_endpoint = HOST.split("//")[0] + "//modelservice." + HOST.split("//")[1] + "/model"
136 |
137 | # This will randomly return True for input and increases the likelihood of returning
138 | # true based on `percent`
def churn_error(item, percent):
    """Return a churn label for ``item`` that is randomly forced to ``True``.

    One draw from ``random.random()`` is compared with ``percent``: when the
    draw is smaller, the result is ``True`` whatever ``item`` is; otherwise
    the result is ``True`` only if ``item`` equals ``'Yes'``.
    """
    forced_positive = random.random() < percent
    return forced_positive or item == 'Yes'
144 |
145 |
146 | # Get 1000 samples
147 | df_sample = df.sample(1000)
148 |
149 | df_sample.groupby('Churn')['Churn'].count()
150 |
151 | df_sample_clean = df_sample.\
152 | replace({'SeniorCitizen': {"1": 'Yes', "0": 'No'}}).\
153 | replace(r'^\s$', np.nan, regex=True).\
154 | dropna()
155 |
156 | # Create an array of model responses.
157 | response_labels_sample = []
158 |
159 | # Make 1000 calls to the model with increasing error
160 | percent_counter = 0
161 | percent_max = len(df_sample_clean)
162 |
163 | for record in json.loads(df_sample_clean.to_json(orient='records')):
164 | print("Added {} records".format(percent_counter)) if (percent_counter%50 == 0) else None
165 | percent_counter += 1
166 | no_churn_record = copy.deepcopy(record)
167 | no_churn_record.pop('customerID')
168 | no_churn_record.pop('Churn')
169 | # **note** this is an easy way to interact with a model in a script
170 | response = cdsw.call_model(latest_model["accessKey"],no_churn_record)
171 | response_labels_sample.append(
172 | {
173 | "uuid":response["response"]["uuid"],
174 | "final_label":churn_error(record["Churn"],percent_counter/percent_max),
175 | "response_label":response["response"]["prediction"]["probability"] >= 0.5,
176 | "timestamp_ms":int(round(time.time() * 1000))
177 | }
178 | )
179 |
180 | # The "ground truth" loop adds the updated actual label value and an accuracy measure
181 | # every 100 calls to the model.
182 | for index, vals in enumerate(response_labels_sample):
183 | print("Update {} records".format(index)) if (index%50 == 0) else None
184 | cdsw.track_delayed_metrics({"final_label":vals['final_label']}, vals['uuid'])
185 | if (index%100 == 0):
186 | start_timestamp_ms = vals['timestamp_ms']
187 | final_labels = []
188 | response_labels = []
189 | final_labels.append(vals['final_label'])
190 | response_labels.append(vals['response_label'])
191 | if (index%100 == 99):
192 | print("Adding accuracy metrc")
193 | end_timestamp_ms = vals['timestamp_ms']
194 | accuracy = classification_report(final_labels,response_labels,output_dict=True)["accuracy"]
195 | cdsw.track_aggregate_metrics({"accuracy": accuracy}, start_timestamp_ms , end_timestamp_ms, model_deployment_crn=Deployment_CRN)
196 |
197 |
198 |
--------------------------------------------------------------------------------
/7b_ml_ops_visual.py:
--------------------------------------------------------------------------------
1 | ## Part 7b - Model Operations - Visualising Model Metrics
2 |
3 | # This is a continuation of the previous process started in the
4 | # `7a_ml_ops_simulation.py` script.
5 | # Here we will load in the metrics saved to the model database in the previous step
6 | # into a Pandas dataframe, and display different features as graphs.
7 |
8 | #```python
9 | # help(cdsw.read_metrics)
10 | # Help on function read_metrics in module cdsw:
11 | #
12 | # read_metrics(model_deployment_crn=None, start_timestamp_ms=None, end_timestamp_ms=None, model_crn=None, model_build_crn=None)
13 | # Description
14 | # -----------
15 | #
16 | # Read metrics data for given Crn with start and end time stamp
17 | #
18 | # Parameters
19 | # ----------
20 | # model_deployment_crn: string
21 | # model deployment Crn
22 | # model_crn: string
23 | # model Crn
24 | # model_build_crn: string
25 | # model build Crn
26 | # start_timestamp_ms: int, optional
27 | # metrics data start timestamp in milliseconds , if not passed
28 | # default value 0 is used to fetch data
29 | # end_timestamp_ms: int, optional
30 | # metrics data end timestamp in milliseconds , if not passed
31 | # current timestamp is used to fetch data
32 | #
33 | # Returns
34 | # -------
35 | # object
36 | # metrics data
37 | #```
38 |
39 |
40 | import cdsw, time, os
41 | import pandas as pd
42 | import matplotlib.pyplot as plt
43 | import numpy as np
44 | from sklearn.metrics import classification_report
45 | from cmlbootstrap import CMLBootstrap
46 | import seaborn as sns
47 | import sqlite3
48 |
49 |
50 | ## Set the model ID
51 | # Get the model id from the model you deployed in step 5. These are unique to each
52 | # model on CML.
53 |
54 | model_id = "63"
55 |
56 | # Get the various Model CRN details
57 | cml = CMLBootstrap()
58 |
59 | latest_model = cml.get_model({"id": model_id, "latestModelDeployment": True, "latestModelBuild": True})
60 |
61 | Model_CRN = latest_model ["crn"]
62 | Deployment_CRN = latest_model["latestModelDeployment"]["crn"]
63 |
64 | # Read in the model metrics dict.
65 | model_metrics = cdsw.read_metrics(model_crn=Model_CRN,model_deployment_crn=Deployment_CRN)
66 |
67 | # This is a handy way to unravel the dict into a big pandas dataframe.
68 | metrics_df = pd.io.json.json_normalize(model_metrics["metrics"])
69 | metrics_df.tail().T
70 |
# Write the data to SQL lite for Viz Apps
# Only done when the database file does not exist yet.
if not os.path.exists("model_metrics.db"):
    conn = sqlite3.connect('model_metrics.db')
    try:
        metrics_df.to_sql(name='model_metrics', con=conn)
    finally:
        # Always release the SQLite handle, even if the export fails.
        conn.close()
75 |
76 | # Do some conversions & calculations
77 | metrics_df['startTimeStampMs'] = pd.to_datetime(metrics_df['startTimeStampMs'], unit='ms')
78 | metrics_df['endTimeStampMs'] = pd.to_datetime(metrics_df['endTimeStampMs'], unit='ms')
79 | metrics_df["processing_time"] = (metrics_df["endTimeStampMs"] - metrics_df["startTimeStampMs"]).dt.microseconds * 1000
80 |
81 | # This shows how to plot specific metrics.
82 | sns.set_style("whitegrid")
83 | sns.despine(left=True,bottom=True)
84 |
85 | prob_metrics = metrics_df.dropna(subset=['metrics.probability']).sort_values('startTimeStampMs')
86 | sns.lineplot(x=range(len(prob_metrics)), y="metrics.probability", data=prob_metrics, color='grey')
87 |
# Plot processing time from the frame filtered on `processing_time`; the
# previous version built `time_metrics` but plotted `prob_metrics`, which drops
# every row that has no `metrics.probability` value.
time_metrics = metrics_df.dropna(subset=['processing_time']).sort_values('startTimeStampMs')
sns.lineplot(x=range(len(time_metrics)), y="processing_time", data=time_metrics, color='grey')
90 |
91 | # This shows how the model accuracy drops over time.
92 | agg_metrics = metrics_df.dropna(subset=["metrics.accuracy"]).sort_values('startTimeStampMs')
93 | sns.barplot(x=list(range(1,len(agg_metrics)+1)), y="metrics.accuracy", color="grey", data=agg_metrics)
94 |
--------------------------------------------------------------------------------
/8_check_model.py:
--------------------------------------------------------------------------------
1 | # # Check Model
2 | # This file should be run in a job that will periodically check the current model's accuracy and trigger the
3 | # model retrain job if it's below the required threshold.
4 |
5 | import cdsw, time, os
6 | import pandas as pd
7 | from sklearn.metrics import classification_report
8 | from cmlbootstrap import CMLBootstrap
9 |
10 | # replace these with the relevant values from the project
11 | model_id = "63"
12 | job_id = "107"
13 |
14 | # Get the various Model CRN details
15 | cml = CMLBootstrap()
16 |
17 | latest_model = cml.get_model({"id": model_id, "latestModelDeployment": True, "latestModelBuild": True})
18 |
19 | Model_CRN = latest_model ["crn"]
20 | Deployment_CRN = latest_model["latestModelDeployment"]["crn"]
21 |
22 | # Read in the model metrics dict.
23 | model_metrics = cdsw.read_metrics(model_crn=Model_CRN,model_deployment_crn=Deployment_CRN)
24 |
25 | # This is a handy way to unravel the dict into a big pandas dataframe.
26 | metrics_df = pd.io.json.json_normalize(model_metrics["metrics"])
27 |
# Most recent aggregate accuracy entry (rows without an accuracy value are
# dropped, the rest ordered by start timestamp).
accuracy_metrics = metrics_df.dropna(subset=["metrics.accuracy"]).sort_values('startTimeStampMs')
latest_aggregate_metric = accuracy_metrics[-1:]["metrics.accuracy"]


if latest_aggregate_metric.empty:
    # Nothing to compare yet; indexing element 0 would raise IndexError.
    print("no accuracy metrics recorded yet, skipping retrain check")
elif latest_aggregate_metric.to_list()[0] < 0.6:
    print("model is below threshold, retraining")
    cml.start_job(job_id,{})
    #TODO redeploy new model
else:
    print("model does not need to be retrained")
37 |
--------------------------------------------------------------------------------
/9_build_project.py:
--------------------------------------------------------------------------------
1 | # Run this file to auto deploy the model, run a job, and deploy the application
2 |
3 | # Install the requirements
4 | !pip3 install -r requirements.txt --progress-bar off
5 | import subprocess
6 | import datetime
7 | import xml.etree.ElementTree as ET
8 | import requests
9 | import json
10 | import time
11 | import os
12 | from IPython.display import Javascript, HTML
13 | from cmlbootstrap import CMLBootstrap
14 |
# Fail fast when Spark is not available. Only a missing SPARK_HOME variable
# counts as "not enabled"; a bare `except:` would also swallow unrelated
# errors such as KeyboardInterrupt.
if "SPARK_HOME" not in os.environ:
    print('Spark is not enabled, please enable spark before running this script')
    raise KeyError('Spark is not enabled, please enable spark before running this script')
print("Spark is enabled")
21 |
22 | run_time_suffix = datetime.datetime.now()
23 | run_time_suffix = run_time_suffix.strftime("%d%m%Y%H%M%S")
24 |
25 |
26 | # Instantiate API Wrapper
27 | cml = CMLBootstrap()
28 |
29 | # Set the STORAGE environment variable
30 | try :
31 | storage=os.environ["STORAGE"]
32 | except:
33 | storage = cml.get_cloud_storage()
34 | storage_environment_params = {"STORAGE":storage}
35 | storage_environment = cml.create_environment_variable(storage_environment_params)
36 | os.environ["STORAGE"] = storage
37 |
38 | # Create the directories and upload data
39 | !hadoop fs -mkdir -p $STORAGE/datalake
40 | !hadoop fs -mkdir -p $STORAGE/datalake/data
41 | !hadoop fs -mkdir -p $STORAGE/datalake/data/churn
42 | !hadoop fs -copyFromLocal /home/cdsw/raw/WA_Fn-UseC_-Telco-Customer-Churn-.csv $STORAGE/datalake/data/churn/WA_Fn-UseC_-Telco-Customer-Churn-.csv
43 |
44 | # This will run the data ingest file. You need this to create the hive table from the
45 | # csv file.
46 | exec(open("1_data_ingest.py").read())
47 |
48 | # Get User Details
49 | user_details = cml.get_user({})
50 | user_obj = {"id": user_details["id"], "username": os.getenv("CDSW_PROJECT_URL").split("/")[6],
51 | "name": user_details["name"],
52 | "type": user_details["type"],
53 | "html_url": user_details["html_url"],
54 | "url": user_details["url"]
55 | }
56 |
57 | # Get Project Details
58 | project_details = cml.get_project({})
59 | project_id = project_details["id"]
60 |
61 | #Get the runtime_id
62 | runtime_id = 14
63 | for ids in cml.get_runtimes()["runtimes"]:
64 | if ids["kernel"] == "Python 3.7" and ids["edition"] == "Standard" and ids["shortVersion"] == "2021.09" and ids["editor"] == "Workbench":
65 | runtime_id = ids["id"]
66 |
67 | #Get runtime addon numbers
68 | addon_val = cml.get_runtimes_addons()[0]['identifier']
69 |
70 | # Create Job
71 | create_jobs_params = {"name": "Train Model",
72 | "type": "manual",
73 | "script": "4_train_models.py",
74 | "timezone": "America/Los_Angeles",
75 | "environment": {},
76 | "kernel": "python3",
77 | "cpu": 1,
78 | "memory": 2,
79 | "nvidia_gpu": 0,
80 | "include_logs": True,
81 | "notifications": [
82 | {"user_id": user_obj["id"],
83 | "user": user_obj,
84 | "success": False, "failure": False, "timeout": False, "stopped": False
85 | }
86 | ],
87 | "recipients": {},
88 | "attachments": [],
89 | "include_logs": True,
90 | "report_attachments": [],
91 | "success_recipients": [],
92 | "failure_recipients": [],
93 | "timeout_recipients": [],
94 | "stopped_recipients": []
95 | }
96 |
97 |
98 | if os.getenv("ML_RUNTIME_EDITION") != None:
99 | create_jobs_params["runtime_id"] = runtime_id
100 | create_jobs_params["addons"] = [addon_val-1,addon_val]
101 | create_jobs_params["kernel"] = ""
102 |
103 |
104 |
105 | new_job = cml.create_job(create_jobs_params)
106 | new_job_id = new_job["id"]
107 | print("Created new job with jobid", new_job_id)
108 |
109 | ##
110 | # Start a job
111 | job_env_params = {}
112 | start_job_params = {"environment": job_env_params}
113 | job_id = new_job_id
114 | job_status = cml.start_job(job_id, start_job_params)
115 | print("Job started")
116 |
117 | # Stop a job
118 | #job_dict = cml.start_job(job_id, start_job_params)
119 | #cml.stop_job(job_id, start_job_params)
120 |
121 |
122 | # Get Default Engine Details
123 | default_engine_details = cml.get_default_engine({})
124 | default_engine_image_id = default_engine_details["id"]
125 |
126 | # Create the YAML file for the model lineage
127 | yaml_text = \
128 | """"Model Explainer {}":
129 | hive_table_qualified_names: # this is a predefined key to link to training data
130 | - "default.telco_churn@cm" # the qualifiedName of the hive_table object representing
131 | metadata: # this is a predefined key for additional metadata
132 | query: "select * from historical_data" # suggested use case: query used to extract training data
133 | training_file: "4_train_models.py" # suggested use case: training file used
134 | """.format(run_time_suffix)
135 |
136 | with open('lineage.yml', 'w') as lineage:
137 | lineage.write(yaml_text)
138 |
139 |
140 | # Create Model
141 | example_model_input = {"StreamingTV": "No", "MonthlyCharges": 70.35, "PhoneService": "No", "PaperlessBilling": "No", "Partner": "No", "OnlineBackup": "No", "gender": "Female", "Contract": "Month-to-month", "TotalCharges": 1397.475,
142 | "StreamingMovies": "No", "DeviceProtection": "No", "PaymentMethod": "Bank transfer (automatic)", "tenure": 29, "Dependents": "No", "OnlineSecurity": "No", "MultipleLines": "No", "InternetService": "DSL", "SeniorCitizen": "No", "TechSupport": "No"}
143 |
144 |
145 | create_model_params = {
146 | "projectId": project_id,
147 | "name": "Model Explainer 2",
148 | "description": "Explain a given model prediction",
149 | "visibility": "private",
150 | "enableAuth": False,
151 | "targetFilePath": "5_model_serve_explainer.py",
152 | "targetFunctionName": "explain",
153 | "engineImageId": default_engine_image_id,
154 | "kernel": "python3",
155 | "examples": [
156 | {
157 | "request": example_model_input,
158 | "response": {}
159 | }],
160 | "cpuMillicores": 1000,
161 | "memoryMb": 2048,
162 | "nvidiaGPUs": 0,
163 | "replicationPolicy": {"type": "fixed", "numReplicas": 1},
164 | "environment": {}}
165 |
166 | if os.getenv("ML_RUNTIME_EDITION") != None:
167 | create_model_params["runtimeId"] = runtime_id
168 |
169 | new_model_details = cml.create_model(create_model_params)
170 | access_key = new_model_details["accessKey"] # todo check for bad response
171 | model_id = new_model_details["id"]
172 |
173 | print("New model created with access key", access_key)
174 |
175 | # Disable model_authentication
176 | cml.set_model_auth({"id": model_id, "enableAuth": False})
177 |
178 | # Wait for the model to deploy.
179 | is_deployed = False
180 | while is_deployed == False:
181 | model = cml.get_model({"id": str(
182 | new_model_details["id"]), "latestModelDeployment": True, "latestModelBuild": True})
183 | if model["latestModelDeployment"]["status"] == 'deployed':
184 | print("Model is deployed")
185 | break
186 | else:
187 | print("Deploying Model.....")
188 | time.sleep(10)
189 |
190 |
# Change the line in the flask/single_view.html file.
# Raw string: `\s` is a sed regex token, not a Python escape sequence.
subprocess.call(["sed", "-i", r's/const\saccessKey.*/const accessKey = "' +
                 access_key + '";/', "/home/cdsw/flask/single_view.html"])

# Change the model_id value in the 7a_ml_ops_simulation.py, 7b_ml_ops_visual.py and 8_check_model.py files
# `str()` guards against a non-string id, which would make the concatenation
# raise TypeError (the deploy wait loop already converts the id the same way).
model_id_expression = 's/model_id =.*/model_id = "' + str(model_id) + '"/'
for script_path in ("/home/cdsw/7a_ml_ops_simulation.py",
                    "/home/cdsw/7b_ml_ops_visual.py",
                    "/home/cdsw/8_check_model.py"):
    subprocess.call(["sed", "-i", model_id_expression, script_path])
202 |
203 |
204 | # Create Application
205 | create_application_params = {
206 | "name": "Explainer App",
207 | "subdomain": run_time_suffix[:],
208 | "description": "Explainer web application",
209 | "type": "manual",
210 | "script": "6_application.py", "environment": {},
211 | "kernel": "python3", "cpu": 1, "memory": 2,
212 | "nvidia_gpu": 0
213 | }
214 |
# Same "variable is set" test as the job and model sections of this script.
# Comparing with "" is also true when the variable is unset (None != ""),
# which would send runtime parameters even when ML_RUNTIME_EDITION is unset.
if os.getenv("ML_RUNTIME_EDITION") is not None:
    create_application_params["runtime_id"] = runtime_id
    create_application_params["addons"] = [addon_val-1,addon_val]
    create_application_params["kernel"] = ""
219 |
220 | new_application_details = cml.create_application(create_application_params)
221 | application_url = new_application_details["url"]
222 | application_id = new_application_details["id"]
223 |
224 | # print("Application may need a few minutes to finish deploying. Open link below in about a minute ..")
225 | print("Application created, deploying at ", application_url)
226 |
227 | # Wait for the application to deploy.
228 | is_deployed = False
229 | while is_deployed == False:
230 | # Wait for the application to deploy.
231 | app = cml.get_application(str(application_id), {})
232 | if app["status"] == 'running':
233 | print("Application is deployed")
234 | break
235 | else:
236 | print("Deploying Application.....")
237 | time.sleep(10)
238 |
# Render a clickable link to the deployed application. The `{}` placeholder is
# required: without it `.format()` discards `application_url`.
HTML("<a href='{}'>Open Application UI</a>".format(application_url))
240 |
241 | # This will run the model operations section that makes calls to the model to track
242 | # metrics and track metric aggregations
243 |
244 | exec(open("7a_ml_ops_simulation.py").read())
245 |
246 | # Change the job_id value in the 8_check_model.py file
247 | subprocess.call(["sed", "-i", 's/job_id =.*/job_id = "' +
248 | str(new_job_id) + '"/', "/home/cdsw/8_check_model.py"])
249 |
250 | # Create the check model Job
251 | # Create Job
252 | create_jobs_params = {"name": "Check Model",
253 | "type": "manual",
254 | "script": "8_check_model.py",
255 | "timezone": "America/Los_Angeles",
256 | "environment": {},
257 | "kernel": "python3",
258 | "cpu": 1,
259 | "memory": 2,
260 | "nvidia_gpu": 0,
261 | "include_logs": True,
262 | "notifications": [
263 | {"user_id": user_obj["id"],
264 | "user": user_obj,
265 | "success": False, "failure": False, "timeout": False, "stopped": False
266 | }
267 | ],
268 | "recipients": {},
269 | "attachments": [],
270 | "include_logs": True,
271 | "report_attachments": [],
272 | "success_recipients": [],
273 | "failure_recipients": [],
274 | "timeout_recipients": [],
275 | "stopped_recipients": []
276 | }
277 |
278 |
279 | if os.getenv("ML_RUNTIME_EDITION") != None:
280 | create_jobs_params["runtime_id"] = runtime_id
281 | create_jobs_params["addons"] = [addon_val-1,addon_val]
282 | create_jobs_params["kernel"] = ""
283 |
284 | new_job = cml.create_job(create_jobs_params)
285 | new_job_id = new_job["id"]
286 | print("Created new job with jobid", new_job_id)
287 |
288 | # Start a job
289 | job_env_params = {}
290 | start_job_params = {"environment": job_env_params}
291 | job_id = new_job_id
292 | job_status = cml.start_job(job_id, start_job_params)
293 | print("Job started")
294 |
295 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Churn Prediction Prototype
2 | This project is a Cloudera Machine Learning
3 | ([CML](https://www.cloudera.com/products/machine-learning.html)) **Applied Machine Learning
4 | Project Prototype**. It has all the code and data needed to deploy an end-to-end machine
5 | learning project in a running CML instance.
6 |
7 | ## Project Overview
8 | This project builds the telco churn with model interpretability project discussed in more
9 | detail [this blog post](https://blog.cloudera.com/visual-model-interpretability-for-telco-churn-in-cloudera-data-science-workbench/).
10 | The initial idea and code comes from the FFL Interpretability report which is now freely
11 | available and you can read the full report [here](https://ff06-2020.fastforwardlabs.com/)
12 |
13 | 
14 |
15 | The goal is to build a classifier model using Logistic Regression to predict the churn
16 | probability for a group of customers from a telecoms company. On top of that, the model
17 | can then be interpreted using [LIME](https://github.com/marcotcr/lime). Both the Logistic
18 | Regression and LIME models are then deployed using CML's real-time model deployment
19 | capability and finally a basic flask based web application is deployed that will let
20 | you interact with the real-time model to see which factors in the data have the most
21 | influence on the churn probability.
22 |
23 | By following the notebooks in this project, you will understand how to perform similar
24 | classification tasks on CML as well as how to use the platform's major features to your
25 | advantage. These features include **streamlined model experimentation**,
26 | **point-and-click model deployment**, and **ML app hosting**.
27 |
28 | We will focus our attention on working within CML, using all it has to offer, while
29 | glossing over the details that are simply standard data science.
30 | We trust that you are familiar with typical data science workflows
31 | and do not need detailed explanations of the code.
32 | Notes that are *specific to CML* will be emphasized in **block quotes**.
33 |
34 | ### Initialize the Project
35 | There are a couple of steps needed at the start to configure the Project and Workspace
36 | settings so each step will run successfully. You **must** run the project bootstrap
37 | before running other steps. If you just want to launch the model interpretability
38 | application without going through each step manually, then you can also deploy the
39 | complete project.
40 |
41 | ***Project bootstrap***
42 |
43 | Open the file `0_bootstrap.py` in a normal workbench python3 session. You only need a
44 | 1 vCPU / 2 GiB instance. Once the session is loaded, click **Run > Run All Lines**.
45 | This file will create an Environment Variable for the project called **STORAGE**,
46 | which is the root of default file storage location for the Hive Metastore in the
47 | DataLake (e.g. `s3a://my-default-bucket`). It will also upload the data used in the
48 | project to `$STORAGE/datalake/data/churn/`. The original file comes as part of this
49 | git repo in the `raw` folder.
50 |
51 | ***Deploy the Complete Project***
52 |
53 | If you just wish to build the project artifacts without going through each step manually,
54 | run the `9_build_project.py` file in a python3 session. Again a 1 vCPU / 2 GiB instance
55 | will be sufficient. This script will:
56 | * run the bootstrap
57 | * then create the Hive Table and import the data
58 | * deploy the model
59 | * update the application files to use this new model
60 | * deploy the application
61 | * run the model drift simulation
62 | Once the script has completed you will see the new model and application are now available
63 | in the project.
64 |
65 | ## Project Build
66 | If you want to go through each of the steps manually to build and understand how the project
67 | works, follow the steps below. There is a lot more detail and explanation/comments in each
68 | of the files/notebooks so it's worth looking into those. Follow the steps below and you
69 | will end up with a running application.
70 |
71 | ### 0 Bootstrap
72 | Just to reiterate: you must run the bootstrap for this project before anything else.
73 | So make sure you run step 0 first.
74 |
75 | Open the file `0_bootstrap.py` in a normal workbench python3 session. You only need a
76 | 1 CPU / 2 GB instance. Then **Run > Run All Lines**
77 |
78 | ### 1 Ingest Data
79 | This script will read in the data csv from the file uploaded to the object store (s3/adls) setup
80 | during the bootstrap and create a managed table in Hive. This is all done using Spark.
81 |
82 | Open `1_data_ingest.py` in a Workbench session: python3, 1 CPU, 2 GB. Run the file.
83 |
84 | ### 2 Explore Data
85 | This is a Jupyter Notebook that does some basic data exploration and visualisation. It
86 | is to show how this would be part of the data science workflow.
87 |
88 | 
89 |
90 | Open a Jupyter Notebook session (rather than a work bench): python3, 1 CPU, 2 GB and
91 | open the `2_data_exploration.ipynb` file.
92 |
93 | At the top of the page click **Cells > Run All**.
94 |
95 | ### 3 Model Building
96 | This is also a Jupyter Notebook to show the process of selecting and building the model
97 | to predict churn. It also shows more details on how the LIME model is created and a bit
98 | more on what LIME is actually doing.
99 |
100 | Open a Jupyter Notebook session (rather than a work bench): python3, 1 CPU, 2 GB and
101 | open the `3_model_building.ipynb` file.
102 |
103 | At the top of the page click **Cells > Run All**.
104 |
105 | ### 4 Model Training
106 | A pre-trained model is saved with the repo and has been placed in the `models` directory.
107 | If you want to retrain the model, open the `4_train_models.py` file in a workbench session:
108 | python3 1 vCPU, 2 GiB and run the file. The newly trained model will be saved in the models directory
109 | named `telco_linear`.
110 |
111 | There are 2 other ways of running the model training process
112 |
113 | ***1. Jobs***
114 |
115 | The **[Jobs](https://docs.cloudera.com/machine-learning/cloud/jobs-pipelines/topics/ml-creating-a-job.html)**
116 | feature allows for ad hoc, recurring and dependent jobs to run specific scripts. To run this model
117 | training process as a job, create a new job by going to the Project window and clicking _Jobs >
118 | New Job_ and entering the following settings:
119 | * **Name** : Train Model
120 | * **Script** : 4_train_models.py
121 | * **Arguments** : _Leave blank_
122 | * **Kernel** : Python 3
123 | * **Schedule** : Manual
124 | * **Engine Profile** : 1 vCPU / 2 GiB
125 | The rest can be left as is. Once the job has been created, click **Run** to start a manual
126 | run for that job.
127 |
128 | ***2. Experiments***
129 |
130 | The other option is running an **[Experiment](https://docs.cloudera.com/machine-learning/cloud/experiments/topics/ml-running-an-experiment.html)**. Experiments run immediately and are used for testing different parameters in a model training process. In this instance it would be used for hyperparameter optimisation. To run an experiment, from the Project window click Experiments > Run Experiment with the following settings.
131 | * **Script** : 4_train_models.py
132 | * **Arguments** : 5 lbfgs 100 _(these are the cv, solver and max_iter parameters to be passed to the
133 | LogisticRegressionCV() function)_
134 | * **Kernel** : Python 3
135 | * **Engine Profile** : 1 vCPU / 2 GiB
136 |
137 | Click **Start Run** and the experiment will be scheduled to build and run. Once the Run is
138 | completed you can view the outputs that are tracked with the experiment using the
139 | `cdsw.track_metrics` function. It's worth reading through the code to get a sense of what
140 | all is going on.
141 |
142 |
143 | ### 5 Serve Model
144 | The **[Models](https://docs.cloudera.com/machine-learning/cloud/models/topics/ml-creating-and-deploying-a-model.html)**
145 | feature is used to deploy a machine learning model into production for real-time prediction. To
146 | deploy the model trained in the previous step, from the Project page, click **Models > New
147 | Model** and create a new model with the following details:
148 |
149 | * **Name**: Explainer
150 | * **Description**: Explain customer churn prediction
151 | * **File**: 5_model_serve_explainer.py
152 | * **Function**: explain
153 | * **Input**:
154 | ```
155 | {
156 | "StreamingTV": "No",
157 | "MonthlyCharges": 70.35,
158 | "PhoneService": "No",
159 | "PaperlessBilling": "No",
160 | "Partner": "No",
161 | "OnlineBackup": "No",
162 | "gender": "Female",
163 | "Contract": "Month-to-month",
164 | "TotalCharges": 1397.475,
165 | "StreamingMovies": "No",
166 | "DeviceProtection": "No",
167 | "PaymentMethod": "Bank transfer (automatic)",
168 | "tenure": 29,
169 | "Dependents": "No",
170 | "OnlineSecurity": "No",
171 | "MultipleLines": "No",
172 | "InternetService": "DSL",
173 | "SeniorCitizen": "No",
174 | "TechSupport": "No"
175 | }
176 | ```
177 | * **Kernel**: Python 3
178 | * **Engine Profile**: 1vCPU / 2 GiB Memory
179 |
180 | Leave the rest unchanged. Click **Deploy Model** and the model will go through the build
181 | process and deploy a REST endpoint. Once the model is deployed, you can test it is working
182 | from the Model Overview page.
183 |
184 | _**Note: This is important**_
185 |
186 | Once the model is deployed, you must disable the additional model authentication feature. In the model settings page, untick **Enable Authentication**.
187 |
188 | 
189 |
190 | ### 6 Deploy Application
191 | The next step is to deploy the Flask application. The **[Applications](https://docs.cloudera.com/machine-learning/cloud/applications/topics/ml-applications.html)** feature is still quite new for CML. For this project it is used to deploy a web based application that interacts with the underlying model created in the previous step.
192 |
193 | _**Note: This next step is important**_
194 |
195 | _In the deployed model from step 5, go to **Model > Settings** and make a note of (i.e. copy) the
196 | "Access Key". It will look something like this (i.e. mukd9sit7tacnfq2phhn3whc4unq1f38)_
197 |
198 | _From the Project level click on "Open Workbench" (note you don't actually have to Launch a
199 | session) in order to edit a file. Select the flask/single_view.html file and paste the Access
200 | Key in at line 19._
201 |
202 | ` const accessKey = "mp3ebluylxh4yn5h9xurh1r0430y76ca";`
203 |
204 | _Save the file (if it has not auto saved already) and go back to the Project._
205 |
206 | From there, go to the **Applications** section and select "New Application" with the following:
207 | * **Name**: Churn Analysis App
208 | * **Subdomain**: churn-app _(note: this needs to be unique, so if you've done this before,
209 | pick a more random subdomain name)_
210 | * **Script**: 6_application.py
211 | * **Kernel**: Python 3
212 | * **Engine Profile**: 1vCPU / 2 GiB Memory
213 |
214 |
215 | After the Application deploys, click on the blue-arrow next to the name. The initial view is a
216 | table of rows randomly selected from the dataset. This shows a global view of which features are
217 | most important for the predictor model. The reds show increased importance for predicting a
218 | customer that will churn and the blues for customers that will not.
219 |
220 | 
221 |
222 | Clicking on any single row will show a "local" interpreted model for that particular data point
223 | instance. Here you can see how adjusting any one of the features will change the instance's
224 | churn prediction.
225 |
226 |
227 | 
228 |
229 | Changing the InternetService to DSL lowers the probability of churn. *Note: this does not mean
230 | that changing the Internet Service to DSL causes the probability to go down, this is just what
231 | the model would predict for a customer with those data points*
232 |
233 |
234 | 
235 |
236 | ### 7 Model Operations
237 | The final step is the model operations which consists of [Model Metrics](https://docs.cloudera.com/machine-learning/cloud/model-metrics/topics/ml-enabling-model-metrics.html)
238 | and [Model Governance](https://docs.cloudera.com/machine-learning/cloud/model-governance/topics/ml-enabling-model-governance.html)
239 |
240 | **Model Governance** is setup in the `0_bootstrap.py` script, which writes out the lineage.yml file at
241 | the start of the project. For the **Model Metrics** open a workbench session (1 vCPU / 2 GiB) and open the
242 | `7a_ml_ops_simulation.py` file. You need to set the `model_id` number from the model created in step 5 on line
243 | 113. The model number is on the model's main page.
244 |
245 | 
246 |
247 | `model_id = "95"`
248 |
249 | From there, run the file. This goes through a process of simulating a model that drifts
250 | over 1000 calls to the model. The file contains comments with details of how this is done.
251 |
252 | In the next step you can interact and display the model metrics. Open a workbench
253 | session (1 vCPU / 2 GiB) and open and run the `7b_ml_ops_visual.py` file. Again you
254 | need to set the `model_id` number from the model created in step 5 on line 53.
255 | The model number is on the model's main page.
256 |
257 | 
258 |
259 |
--------------------------------------------------------------------------------
/cdsw-build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pip3 install -r requirements.txt
--------------------------------------------------------------------------------
/churnexplainer.py:
--------------------------------------------------------------------------------
1 | import datetime, dill, os
2 | import pandas as pd
3 |
4 | from sklearn.pipeline import TransformerMixin
5 | from sklearn.preprocessing import LabelEncoder
6 |
7 |
8 | """
9 | Explained model is a class that has attributes:
10 |
11 | - data, i.e. the features you get for a given dataset from load_dataset. This
12 | is a pandas dataframe that may include categorical variables.
13 | - labels, i.e. the boolean labels you get for a given dataset from
14 | load_dataset.
15 | - categoricalencoder, a fitted sklearn Transformer object that transforms
16 | the categorical columns in `data` to deterministic integer codes, yielding a
17 | plain numpy array often called `X` (leaves non-categorical columns
18 | untouched)
19 | - pipeline, a trained sklearn pipeline that takes `X` as input and predicts.
20 | - explainer, an instantiated LIME explainer that yields an explanation when
21 | its `explain_instance` method is run on an example `X`
22 |
23 | properties:
24 | - default_data
25 | - categorical_features
26 | - non_categorical_features
27 | - dtypes
28 |
29 | and methods for API (which works in terms of dictionaries):
30 | - cast_dct, converts values of dictionary to dtype corresponding to key
31 | - explain_dct, returns prediction and explanation for example dictionary
32 |
33 | and methods for users (who usually have dataframes):
34 | - predict_df, returns predictions for a df, i.e. runs it through categorical
35 | encoder and pipeline
36 | - explain_df, returns predictions and explanation for example dataframe
37 | """
38 |
class ExplainedModel():
    """A trained classifier bundled with its data, encoder and LIME explainer.

    An instance is either built from in-memory components (to be written to
    disk with `save`) or restored from ``<data_dir>/models/<name>/<name>.pkl``
    by passing only ``model_name`` and ``data_dir``.
    """

    def __init__(self, model_name=None, labels=None, data=None,
                 categoricalencoder=None, pipeline=None, explainer=None,
                 data_dir=None, load=True):
        """Create the model wrapper.

        Args:
            model_name: Name of the model; also the name of its directory and
                pickle file. Required.
            labels: pandas Series of labels; its ``name`` is used to derive
                the probability column name.
            data: pandas DataFrame of features (may hold categorical columns).
            categoricalencoder: Fitted encoder exposing ``transform`` and
                ``classes_``.
            pipeline: Trained estimator exposing ``predict_proba``.
            explainer: LIME explainer exposing ``explain_instance`` and
                ``feature_names``.
            data_dir: Root directory that contains the ``models`` folder.
                Passing None makes ``os.path.join`` raise TypeError.
            load: When True and no components were supplied, load the pickled
                model from disk immediately.

        Raises:
            ValueError: If ``model_name`` is None.
        """
        if model_name is None:
            # Without a name there is no path to save to or load from;
            # previously this surfaced as an obscure AttributeError.
            raise ValueError(
                'model_name is required to locate the model on disk')
        self.model_name = model_name

        # Components passed in explicitly take precedence over whatever is
        # stored on disk; otherwise a freshly trained model would be
        # discarded and replaced by the previously pickled one.
        components = (labels, data, categoricalencoder, pipeline, explainer)
        self.is_loaded = any(part is not None for part in components)
        if self.is_loaded:
            self.data = data
            self.labels = labels
            self.categoricalencoder = categoricalencoder
            self.pipeline = pipeline
            self.explainer = explainer

        self.model_dir = os.path.join(data_dir, 'models', self.model_name)
        self.model_path = os.path.join(self.model_dir,
                                       self.model_name + '.pkl')
        # if asked to load and not yet loaded, load model!
        if load and not self.is_loaded:
            self.load()

    def load(self):
        """Restore the pickled attributes from ``model_path`` (no-op if loaded)."""
        if not self.is_loaded:
            # NOTE(security): dill.load can execute arbitrary code; only load
            # model files that come from a trusted source.
            with open(self.model_path, 'rb') as f:
                self.__dict__.update(dill.load(f))
            self.is_loaded = True

    def save(self):
        """Pickle data, labels, encoder, pipeline and explainer to ``model_path``."""
        dilldict = {
            'data': self.data,
            'labels': self.labels,
            'categoricalencoder': self.categoricalencoder,
            'pipeline': self.pipeline,
            'explainer': self.explainer
        }
        # Make sure the target directory exists so that saving under a new
        # model name does not fail.
        os.makedirs(self.model_dir, exist_ok=True)
        with open(self.model_path, 'wb') as f:
            dill.dump(dilldict, f)

    def predict_df(self, df):
        """Return the class-1 probability for every row of ``df``."""
        X = self.categoricalencoder.transform(df)
        return self.pipeline.predict_proba(X)[:, 1]

    def explain_df(self, df):
        """Return ``(probability, explanations)`` for the first row of ``df``.

        ``explanations`` maps feature name to the LIME weight for class 1.
        """
        X = self.categoricalencoder.transform(df)
        probability = self.pipeline.predict_proba(X)[0, 1]
        e = self.explainer.explain_instance(
            X[0], self.pipeline.predict_proba
        ).as_map()[1]
        explanations = {self.explainer.feature_names[c]: weight
                        for c, weight in e}
        return probability, explanations

    def explain_dct(self, dct):
        """Explain a single example given as a ``{feature: value}`` dict."""
        return self.explain_df(pd.DataFrame([dct]))

    def cast_dct(self, dct):
        """Convert each value of ``dct`` to the scalar type of its feature."""
        return {k: self.dtypes[k].type(v) for k, v in dct.items()}

    @property
    def dtypes(self):
        """Map feature name to dtype (category features use their categories' dtype)."""
        if not hasattr(self, '_dtypes'):
            d = self.data[self.non_categorical_features].dtypes.to_dict()
            d.update({c: self.data[c].cat.categories.dtype
                      for c in self.categorical_features})
            self._dtypes = d
        return self._dtypes

    @property
    def non_categorical_features(self):
        """Names of non-category columns, excluding the probability column."""
        columns = self.data.select_dtypes(exclude=['category']).columns
        # errors='ignore': data without the probability column is still usable.
        return list(columns.drop(self.label_name, errors='ignore'))

    @property
    def categorical_features(self):
        """Names of the columns with category dtype."""
        return list(self.data.select_dtypes(include=['category']).columns)

    @property
    def stats(self):
        """Median, mean, min, max and std of every non-categorical feature."""
        def describe(s):
            return {'median': s.median(),
                    'mean': s.mean(),
                    'min': s.min(),
                    'max': s.max(),
                    'std': s.std()}
        if not hasattr(self, '_stats'):
            self._stats = {c: describe(self.data[c])
                           for c in self.non_categorical_features}
        return self._stats

    @property
    def label_name(self):
        """Name of the probability column: ``'<labels.name> probability'``."""
        return self.labels.name + ' probability'

    @property
    def categories(self):
        """Map each categorical feature to the list of classes known to the encoder."""
        return {feature: list(self.categoricalencoder.classes_[feature])
                for feature in self.categorical_features}

    @property
    def default_data(self):
        """Default example: 0th class for categorical features, median for the rest."""
        if not hasattr(self, '_default_data'):
            d = {}
            d.update({feature: self.categoricalencoder.classes_[feature][0]
                      for feature in self.categorical_features})
            d.update({feature: self.data[feature].median()
                      for feature in self.non_categorical_features})
            self._default_data = d
        return self._default_data
161 |
class CategoricalEncoder(TransformerMixin):
    """Encode the category-dtype columns of a DataFrame as integer codes.

    Each categorical column gets its own fitted `LabelEncoder`; all other
    columns are passed through unchanged. `transform` returns a float numpy
    array whose columns follow the column order seen in `fit`.
    """

    def fit(self, X, y=None, *args, **kwargs):
        """Record the column layout of ``X`` and fit one LabelEncoder per
        categorical column. Returns ``self``."""
        self.columns_ = X.columns
        # isinstance check replaces pd.api.types.is_categorical_dtype, which
        # is deprecated in recent pandas releases.
        self.cat_columns_ix_ = {
            c: i for i, c in enumerate(X.columns)
            if isinstance(X[c].dtype, pd.api.types.CategoricalDtype)
        }
        self.cat_columns_ = pd.Index(list(self.cat_columns_ix_.keys()))
        self.non_cat_columns_ = X.columns.drop(self.cat_columns_)
        self.les_ = {c: LabelEncoder().fit(X[c])
                     for c in self.cat_columns_}
        self.classes_ = {c: list(self.les_[c].classes_)
                         for c in self.cat_columns_}
        return self

    def transform(self, X, y=None, *args, **kwargs):
        """Return ``X`` (columns in fitted order) as a float array with the
        categorical columns replaced by their integer codes."""
        # Copy so the in-place column assignment below can never write into
        # (or fail on a read-only view of) the caller's DataFrame.
        data = X[self.columns_].values.copy()
        for c, i in self.cat_columns_ix_.items():
            data[:, i] = self.les_[c].transform(data[:, i])
        return data.astype(float)

    def __repr__(self):
        return('{}()'.format(self.__class__.__name__))
--------------------------------------------------------------------------------
/flask/ajax-loader.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/flask/ajax-loader.gif
--------------------------------------------------------------------------------
/flask/churn_vis.css:
--------------------------------------------------------------------------------
1 | @import url('https://fonts.googleapis.com/css?family=Open+Sans');
2 |
3 |
4 | body {
5 |
6 | margin: 0 auto;
7 | font-family: 'Open Sans', sans-serif;
8 | font-size: 12px;
9 | }
10 |
11 | table {
12 | border: 0px solid black;
13 | /* border-collapse: collapse;*/
14 | }
15 |
16 | tr {
17 | cursor: pointer;
18 | }
19 |
20 | th, td {
21 | padding: 4px;
22 |
23 | }
24 | .header {
25 | font-family: 'Open Sans', sans-serif;
26 | font-weight: 300;
27 | font-size: 35px;
28 | text-align: center;
29 | padding-top: 20px;
30 | vertical-align: top;
31 | line-height: 55px;
32 | }
33 | #loader {
34 | padding-left: 330px;
35 | padding-top: 100px;
36 | }
37 |
38 | .churn_div {
39 | font-size: 15px;
40 | padding-bottom: 0;
41 | }
42 |
43 | .explanation {
44 | width: 680px;
45 | margin: 0 auto;
46 | font-family: "Open Sans", sans-serif;
47 | font-size: 10pt;
48 | font-weight: 300;
49 | padding-bottom: 20px;
50 | padding-top:10px;
51 | }
52 |
53 | h1 {
54 | width: 500px;
55 | padding-top:8px;
56 | padding-left: 20px;
57 | float: left;
58 | font-family: "Open Sans", sans-serif;
59 | font-size: 15pt;
60 | font-weight: 300;
61 |
62 | }
63 |
64 | input {
65 | width: 70px;
66 | }
67 |
68 | .submit_div {
69 | float:right;
70 | padding: 0 10px 0 10px;
71 | }
72 |
73 | .input_div {
74 | float:left;
75 | padding: 5px 10px 0 10px;
76 | }
77 |
78 | .inner_div {
79 | float:left;
80 | padding: 5px 5px 5px 5px;
81 | margin: 0 2px 0 2px;
82 | }
83 |
84 |
85 | div {
86 | /* float:left;*/
87 | padding:10px 10px 0 10px;
88 | }
89 |
90 | .main_div {
91 | clear:both;
92 | }
93 |
94 | #pred_value {
95 | float:left;
96 | }
97 |
98 | #loader {
99 | /*background-color: #fff;*/
100 | /*opacity: 0.9;*/
101 | position: absolute;
102 | padding: 100px 10px 10px 300px;
103 | width: 400px;
104 | height: 500px;
105 | }
106 |
107 |
108 | /* I got the button CSS from http://www.lab.tommasoraspo.com/simple-web-buttoms/ */
109 |
110 | .button {
111 | float: left;
112 | cursor: pointer;
113 | margin: 0 5px;
114 | text-align: center;
115 | /*display: inline-block;*/
116 | text-decoration: none;
117 | font: bold 12px/12px HelveticaNeue, Arial;
118 | padding: 8px 11px;
119 | color: #555;
120 | border: 1px solid #dedede;
121 | -webkit-border-radius: 3px;
122 | -moz-border-radius: 3px;
123 | border-radius: 3px;
124 | }
125 | .button.white {
126 | background: #f5f5f5;
127 | filter: progid: DXImageTransform.Microsoft.gradient(startColorstr='#f9f9f9', endColorstr='#f0f0f0');
128 | /* IE */
129 | background: -webkit-gradient(linear, left top, left bottom, from(#f9f9f9), to(#f0f0f0));
130 | /* WebKit */
131 | background: -moz-linear-gradient(top, #f9f9f9, #f0f0f0);
132 | border-color: #dedede #d8d8d8 #d3d3d3;
133 | color: #555;
134 | text-shadow: 0 1px 0 #fff;
135 | -webkit-box-shadow: 0 1px 1px #eaeaea, inset 0 1px 0 #fbfbfb;
136 | -moz-box-shadow: 0 1px 1px #eaeaea, inset 0 1px 0 #fbfbfb;
137 | box-shadow: 0 1px 1px #eaeaea, inset 0 1px 0 #fbfbfb;
138 | }
/* Hover state of the white button: slightly darker, inverted gradient. */
.button.white:hover {
    background: #f4f4f4;
    filter: progid: DXImageTransform.Microsoft.gradient(startColorstr='#efefef', endColorstr='#f8f8f8');
    /* IE */
    background: -webkit-gradient(linear, left top, left bottom, from(#efefef), to(#f8f8f8));
    /* WebKit */
    background: -moz-linear-gradient(top, #efefef, #f8f8f8);
    border-color: #c7c7c7 #c3c3c3 #bebebe;
    text-shadow: 0 1px 0 #fdfdfd;
    -webkit-box-shadow: 0 1px 1px #ebebeb, inset 0 1px 0 #f3f3f3;
    -moz-box-shadow: 0 1px 1px #ebebeb, inset 0 1px 0 #f3f3f3;
    box-shadow: 0 1px 1px #ebebeb, inset 0 1px 0 #f3f3f3;
}
--------------------------------------------------------------------------------
/flask/churn_vis.js:
--------------------------------------------------------------------------------
1 | //This is the javascript code that builds and updates the bar graph
2 |
/**
 * Callback invoked with fresh data for the page.
 *
 * Stores the payload on `window.my_data` and logs it to the console. No chart
 * is drawn here.
 *
 * @param {*} data - The payload handed to the updater.
 */
window.updater = function(data) {
  // Assign through `window` explicitly: a bare `my_data = data` creates an
  // implicit global and throws a ReferenceError in strict mode.
  window.my_data = data;
  console.log(data);
};
--------------------------------------------------------------------------------
/flask/env_vars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/flask/env_vars.png
--------------------------------------------------------------------------------
/flask/single_view.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |