├── .gitignore ├── .project-metadata.yaml ├── 0_bootstrap.py ├── 1_data_ingest.py ├── 2_data_exploration.ipynb ├── 3_model_building.ipynb ├── 4_train_models.py ├── 5_model_serve_explainer.py ├── 6_application.py ├── 7a_ml_ops_simulation.py ├── 7b_ml_ops_visual.py ├── 8_check_model.py ├── 9_build_project.py ├── README.md ├── cdsw-build.sh ├── churnexplainer.py ├── flask ├── ajax-loader.gif ├── churn_vis.css ├── churn_vis.js ├── env_vars.png ├── single_view.html └── table_view.html ├── images ├── data.png ├── disable_auth.png ├── model_accuracy.png ├── model_id.png ├── single_view_1.png ├── single_view_2.png └── table_view.png ├── lineage.yml ├── models └── telco_linear │ └── telco_linear.pkl ├── raw ├── WA_Fn-UseC_-Telco-Customer-Churn-.csv └── telco-data │ ├── _SUCCESS │ └── part-00000-bfdb203d-eea4-4b80-bda3-d369976e785a-c000.csv └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | R 2 | node_modules 3 | *.pyc 4 | __pycache__ 5 | .* 6 | !.gitignore -------------------------------------------------------------------------------- /.project-metadata.yaml: -------------------------------------------------------------------------------- 1 | name: ML Churn Demo 2 | description: Prototype to demonstrate building a churn model on CML 3 | author: Cloudera Engineer 4 | specification_version: 1.0 5 | prototype_version: 1.0 6 | date: "2020-09-29" 7 | api_version: 1 8 | 9 | environment_variables: 10 | AWS_ACCESS_KEY: 11 | default: "AWS ACCESS KEY" 12 | description: "Access Key ID for accessing S3 bucket" 13 | prompt_user: true 14 | AWS_SECRET_KEY: 15 | default: "AWS SECRET KEY" 16 | description: "Secret Access Key for accessing S3 bucket" 17 | prompt_user: true 18 | HADOOP_DATA_SOURCE: 19 | default: "S3 URL FOR DATASET" 20 | description: "S3 URL to large data set" 21 | prompt_user: true 22 | MODEL_REPLICAS: 23 | default: "3" 24 | description: "Number of model replicas, 3 is standard for redundancy" 25 | 
prompt_user: false 26 | TASK_TYPE: 27 | default: NOT_OVERRIDEN 28 | prompt_user: false 29 | 30 | tasks: 31 | - type: create_job 32 | name: Install dependencies 33 | entity_label: install_deps 34 | script: 0_bootstrap.py 35 | arguments: None 36 | short_summary: Job to install dependencies and download training data. 37 | environment: 38 | TASK_TYPE: CREATE/RUN_JOB 39 | kernel: python3 40 | 41 | - type: run_job 42 | entity_label: install_deps 43 | short_summary: Running install dependencies training job. 44 | long_summary: >- 45 | Running the job to install dependencies. 46 | 47 | - type: create_job 48 | name: Train Churn Model 49 | entity_label: train_model 50 | script: 4_train_models.py 51 | arguments: None 52 | short_summary: Job to train models. 53 | environment: 54 | TASK_TYPE: CREATE/RUN_JOB 55 | kernel: python3 56 | 57 | - type: run_job 58 | entity_label: train_model 59 | short_summary: Run model training job. 60 | long_summary: >- 61 | Running the job to train models. 62 | 63 | - type: create_model 64 | name: Create Churn Model API Endpoint 65 | entity_label: telco_churn_model 66 | description: This model api endpoint predicts churn 67 | short_summary: Create the churn model prediction api endpoint 68 | access_key_environment_variable: SHTM_ACCESS_KEY 69 | # default_resources: 70 | # cpu: 1 71 | # memory: 2 72 | default_replication_policy: 73 | type: fixed 74 | num_replicas: 1 75 | 76 | # auth_enabled: false 77 | - type: build_model 78 | name: Build Telco Churn Model Endpoint 79 | entity_label: telco_churn_model 80 | comment: Build churn model 81 | examples: 82 | - request: '{"StreamingTV":"No","MonthlyCharges":70.35,"PhoneService":"No","PaperlessBilling":"No","Partner":"No","OnlineBackup":"No","gender":"Female","Contract":"Month-to-month","TotalCharges":1397.475,"StreamingMovies":"No","DeviceProtection":"No","PaymentMethod":"Bank transfer 
(automatic)","tenure":29,"Dependents":"No","OnlineSecurity":"No","MultipleLines":"No","InternetService":"DSL","SeniorCitizen":"No","TechSupport":"No"}' 83 | response: "" 84 | target_file_path: 5_model_serve_explainer.py 85 | target_function_name: explain 86 | kernel: python3 87 | environment_variables: 88 | TASK_TYPE: CREATE/BUILD/DEPLOY_MODEL 89 | 90 | - type: deploy_model 91 | name: telco_churn_model 92 | entity_label: telco_churn_model 93 | cpu: 1 94 | gpu: 0 95 | environment_variables: 96 | TASK_TYPE: CREATE/BUILD/DEPLOY_MODEL 97 | 98 | - type: start_application 99 | name: Application to serve Churn front app UI 100 | subdomain: churn 101 | script: 6_application.py 102 | environment_variables: 103 | TASK_TYPE: START_APPLICATION 104 | kernel: python3 105 | -------------------------------------------------------------------------------- /0_bootstrap.py: -------------------------------------------------------------------------------- 1 | # # Part 0: Bootstrap File 2 | # You need to run this at the start of the project. It will install the requirements, create the 3 | # STORAGE environment variable and copy the data from 4 | # raw/WA_Fn-UseC_-Telco-Customer-Churn-.csv into /datalake/data/churn of the STORAGE 5 | # location. 6 | 7 | # The STORAGE environment variable is the Cloud Storage location used by the DataLake 8 | # to store hive data. 
On AWS it will be s3a://[something], on Azure it will be 9 | # abfs://[something] and on a CDSW cluster, it will be hdfs://[something] 10 | 11 | # Install the requirements 12 | !pip3 install -r requirements.txt --progress-bar off 13 | 14 | # Create the directories and upload data 15 | 16 | from cmlbootstrap import CMLBootstrap 17 | from IPython.display import Javascript, HTML 18 | import os 19 | import time 20 | import json 21 | import requests 22 | import xml.etree.ElementTree as ET 23 | import datetime 24 | 25 | try: 26 | os.environ["SPARK_HOME"] 27 | print("Spark is enabled") 28 | except: 29 | print('Spark is not enabled, please enable spark before running this script') 30 | raise KeyError('Spark is not enabled, please enable spark before running this script') 31 | 32 | run_time_suffix = datetime.datetime.now() 33 | run_time_suffix = run_time_suffix.strftime("%d%m%Y%H%M%S") 34 | 35 | # Instantiate API Wrapper 36 | cml = CMLBootstrap() 37 | 38 | # Set the STORAGE environment variable 39 | try : 40 | storage=os.environ["STORAGE"] 41 | except: 42 | storage = cml.get_cloud_storage() 43 | storage_environment_params = {"STORAGE":storage} 44 | storage_environment = cml.create_environment_variable(storage_environment_params) 45 | os.environ["STORAGE"] = storage 46 | 47 | # Upload the data to the cloud storage 48 | !hadoop fs -mkdir -p $STORAGE/datalake 49 | !hadoop fs -mkdir -p $STORAGE/datalake/data 50 | !hadoop fs -mkdir -p $STORAGE/datalake/data/churn 51 | !hadoop fs -copyFromLocal /home/cdsw/raw/WA_Fn-UseC_-Telco-Customer-Churn-.csv $STORAGE/datalake/data/churn/WA_Fn-UseC_-Telco-Customer-Churn-.csv 52 | 53 | -------------------------------------------------------------------------------- /1_data_ingest.py: -------------------------------------------------------------------------------- 1 | # Part 1: Data Ingest 2 | # A data scientist should never be blocked in getting data into their environment, 3 | # so CML is able to ingest data from many sources. 
4 | # Whether you have data in .csv files, modern formats like parquet or feather, 5 | # in cloud storage or a SQL database, CML will let you work with it in a data 6 | # scientist-friendly environment. 7 | 8 | # Access local data on your computer 9 | # 10 | # Accessing data stored on your computer is a matter of [uploading a file to the CML filesystem and 11 | # referencing from there](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-accessing-local-data-from-your-computer.html). 12 | # 13 | # > Go to the project's **Overview** page. Under the **Files** section, click **Upload**, select the relevant data files to be uploaded and a destination folder. 14 | # 15 | # If, for example, you upload a file called `mydata.csv` to a folder called `data`, the 16 | # following example code would work. 17 | 18 | # ``` 19 | # import pandas as pd 20 | # 21 | # df = pd.read_csv('data/mydata.csv') 22 | # 23 | # # Or: 24 | # df = pd.read_csv('/home/cdsw/data/mydata.csv') 25 | # ``` 26 | 27 | # Access data in S3 28 | # 29 | # Accessing [data in Amazon S3](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-accessing-data-in-amazon-s3-buckets.html) 30 | # follows a familiar procedure of fetching and storing in the CML filesystem. 31 | # > Add your Amazon Web Services access keys to your project's 32 | # > [environment variables](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-environment-variables.html) 33 | # > as `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. 34 | # 35 | # To get the access keys that are used for your CDP DataLake, you can follow 36 | # [this Cloudera Community Tutorial](https://community.cloudera.com/t5/Community-Articles/How-to-get-AWS-access-keys-via-IDBroker-in-CDP/ta-p/295485) 37 | 38 | # 39 | # The following sample code would fetch a file called `myfile.csv` from the S3 bucket, `data_bucket`, and store it in the CML home folder. 
40 | # ``` 41 | # # Create the Boto S3 connection object. 42 | # from boto.s3.connection import S3Connection 43 | # aws_connection = S3Connection() 44 | # 45 | # # Download the dataset to file 'myfile.csv'. 46 | # bucket = aws_connection.get_bucket('data_bucket') 47 | # key = bucket.get_key('myfile.csv') 48 | # key.get_contents_to_filename('/home/cdsw/myfile.csv') 49 | # ``` 50 | 51 | 52 | # Access data from Cloud Storage or the Hive metastore 53 | # 54 | # Accessing data from [the Hive metastore](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-accessing-data-from-apache-hive.html) 55 | # that comes with CML only takes a few more steps. 56 | # But first we need to fetch the data from Cloud Storage and save it as a Hive table. 57 | # 58 | # > Specify `STORAGE` as an 59 | # > [environment variable](https://docs.cloudera.com/machine-learning/cloud/import-data/topics/ml-environment-variables.html) 60 | # > in your project settings containing the Cloud Storage location used by the DataLake to store 61 | # > Hive data. On AWS it will be `s3a://[something]`, on Azure it will be `abfs://[something]` and on 62 | # > an on-prem CDSW cluster, it will be `hdfs://[something]` 63 | # 64 | # This was done for you when you ran `0_bootstrap.py`, so the following code is set up to run as is. 65 | # It begins with imports and creating a `SparkSession`. 
66 | 67 | import os 68 | import sys 69 | from pyspark.sql import SparkSession 70 | from pyspark.sql.types import * 71 | 72 | 73 | 74 | spark = SparkSession\ 75 | .builder\ 76 | .appName("PythonSQL")\ 77 | .master("local[*]")\ 78 | .getOrCreate() 79 | 80 | # **Note:** 81 | # Our file isn't big, so running it in Spark local mode is fine but you can add the following config 82 | # if you want to run Spark on the kubernetes cluster 83 | # 84 | # > .config("spark.yarn.access.hadoopFileSystems",os.environ['STORAGE'])\ 85 | # 86 | # and remove `.master("local[*]")\` 87 | # 88 | 89 | # Since we know the data already, we can add schema upfront. This is good practice as Spark will 90 | # read *all* the Data if you try to infer the schema. 91 | 92 | schema = StructType( 93 | [ 94 | StructField("customerID", StringType(), True), 95 | StructField("gender", StringType(), True), 96 | StructField("SeniorCitizen", StringType(), True), 97 | StructField("Partner", StringType(), True), 98 | StructField("Dependents", StringType(), True), 99 | StructField("tenure", DoubleType(), True), 100 | StructField("PhoneService", StringType(), True), 101 | StructField("MultipleLines", StringType(), True), 102 | StructField("InternetService", StringType(), True), 103 | StructField("OnlineSecurity", StringType(), True), 104 | StructField("OnlineBackup", StringType(), True), 105 | StructField("DeviceProtection", StringType(), True), 106 | StructField("TechSupport", StringType(), True), 107 | StructField("StreamingTV", StringType(), True), 108 | StructField("StreamingMovies", StringType(), True), 109 | StructField("Contract", StringType(), True), 110 | StructField("PaperlessBilling", StringType(), True), 111 | StructField("PaymentMethod", StringType(), True), 112 | StructField("MonthlyCharges", DoubleType(), True), 113 | StructField("TotalCharges", DoubleType(), True), 114 | StructField("Churn", StringType(), True) 115 | ] 116 | ) 117 | 118 | # Now we can read in the data from Cloud Storage into Spark... 
119 | 120 | storage = os.environ['STORAGE'] 121 | 122 | telco_data = spark.read.csv( 123 | "{}/datalake/data/churn/WA_Fn-UseC_-Telco-Customer-Churn-.csv".format( 124 | storage), 125 | header=True, 126 | schema=schema, 127 | sep=',', 128 | nullValue='NA' 129 | ) 130 | 131 | # ...and inspect the data. 132 | 133 | telco_data.show() 134 | 135 | telco_data.printSchema() 136 | 137 | # Now we can store the Spark DataFrame as a file in the local CML file system 138 | # *and* as a table in Hive used by the other parts of the project. 139 | 140 | telco_data.coalesce(1).write.csv( 141 | "file:/home/cdsw/raw/telco-data/", 142 | mode='overwrite', 143 | header=True 144 | ) 145 | 146 | spark.sql("show databases").show() 147 | 148 | spark.sql("show tables in default").show() 149 | 150 | # Create the Hive table 151 | # This is here to create the table in Hive used by the other parts of the project, if it 152 | # does not already exist. 153 | 154 | if ('telco_churn' not in list(spark.sql("show tables in default").toPandas()['tableName'])): 155 | print("creating the telco_churn database") 156 | telco_data\ 157 | .write.format("parquet")\ 158 | .mode("overwrite")\ 159 | .saveAsTable( 160 | 'default.telco_churn' 161 | ) 162 | 163 | # Show the data in the hive table 164 | spark.sql("select * from default.telco_churn").show() 165 | 166 | # To get more detailed information about the hive table you can run this: 167 | spark.sql("describe formatted default.telco_churn").toPandas() 168 | 169 | # Other ways to access data 170 | 171 | # To access data from other locations, refer to the 172 | # [CML documentation](https://docs.cloudera.com/machine-learning/cloud/import-data/index.html). 173 | 174 | # Scheduled Jobs 175 | # 176 | # One of the features of CML is the ability to schedule code to run at regular intervals, 177 | # similar to cron jobs. This is useful for **data pipelines**, **ETL**, and **regular reporting** 178 | # among other use cases. If new data files are created regularly, e.g. 
hourly log files, you could 179 | # schedule a Job to run a data loading script with code like the above. 180 | 181 | # > Any script [can be scheduled as a Job](https://docs.cloudera.com/machine-learning/cloud/jobs-pipelines/topics/ml-creating-a-job.html). 182 | # > You can create a Job with specified command line arguments or environment variables. 183 | # > Jobs can be triggered by the completion of other jobs, forming a 184 | # > [Pipeline](https://docs.cloudera.com/machine-learning/cloud/jobs-pipelines/topics/ml-creating-a-pipeline.html) 185 | # > You can configure the job to email individuals with an attachment, e.g. a csv report which your 186 | # > script saves at: `/home/cdsw/job1/output.csv`. 187 | 188 | # Try running this script `1_data_ingest.py` for use in such a Job. 189 | 190 | -------------------------------------------------------------------------------- /3_model_building.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 3: Model Building" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook explores building the **customer churn model** and an **interpretability model** to explain each prediction.\n", 15 | "In addition to making a prediction of whether a customer will churn, we will also be able to answer the question, \"why are they expected to churn?\"\n", 16 | "\n", 17 | "The following work will look fairly standard to anyone having trained machine learning models using python Jupyter notebooks.\n", 18 | "The CML platform provides a **fully capable Jupyter notebook environment** that data scientists know and love.\n", 19 | "\n", 20 | "If you haven't yet, run through the initialization steps in the README file and Part 1. \n", 21 | "In Part 1, the data is imported into the `default.telco_churn` table in Hive. 
All data accesses fetch from Hive.\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Load data\n", 29 | "\n", 30 | "We again start by creating a `SparkSession` to fetch the data using Spark SQL, only this time we convert to a pandas `DataFrame` since we saw earlier that there are only 7k records in the dataset.\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 6, 36 | "metadata": { 37 | "scrolled": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "root\n", 45 | " |-- customerID: string (nullable = true)\n", 46 | " |-- gender: string (nullable = true)\n", 47 | " |-- SeniorCitizen: string (nullable = true)\n", 48 | " |-- Partner: string (nullable = true)\n", 49 | " |-- Dependents: string (nullable = true)\n", 50 | " |-- tenure: double (nullable = true)\n", 51 | " |-- PhoneService: string (nullable = true)\n", 52 | " |-- MultipleLines: string (nullable = true)\n", 53 | " |-- InternetService: string (nullable = true)\n", 54 | " |-- OnlineSecurity: string (nullable = true)\n", 55 | " |-- OnlineBackup: string (nullable = true)\n", 56 | " |-- DeviceProtection: string (nullable = true)\n", 57 | " |-- TechSupport: string (nullable = true)\n", 58 | " |-- StreamingTV: string (nullable = true)\n", 59 | " |-- StreamingMovies: string (nullable = true)\n", 60 | " |-- Contract: string (nullable = true)\n", 61 | " |-- PaperlessBilling: string (nullable = true)\n", 62 | " |-- PaymentMethod: string (nullable = true)\n", 63 | " |-- MonthlyCharges: double (nullable = true)\n", 64 | " |-- TotalCharges: double (nullable = true)\n", 65 | " |-- Churn: string (nullable = true)\n", 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "from pyspark.sql import SparkSession\n", 72 | "\n", 73 | "spark = SparkSession\\\n", 74 | " .builder\\\n", 75 | " .appName(\"PythonSQL\")\\\n", 76 | " .master(\"local[*]\")\\\n", 77 | " .getOrCreate()\n", 78 | "\n", 79 | 
"spark_df = spark.sql(\"SELECT * FROM default.telco_churn\")\n", 80 | "spark_df.printSchema()\n", 81 | "df = spark_df.toPandas()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "**Note:** If you don't have the Hive table, you can read the csv from the CML filesystem using pandas directly:\n", 89 | "\n", 90 | "```python\n", 91 | "import pandas as pd\n", 92 | "import os\n", 93 | "\n", 94 | "data_dir = '/home/cdsw'\n", 95 | "df = pd.read_csv(os.path.join(data_dir, 'raw', 'WA_Fn-UseC_-Telco-Customer-Churn-.csv'))\n", 96 | "```\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "## Basic feature engineering\n", 104 | "\n", 105 | "\n", 106 | "Next we munge the data into appropriate types for later steps. \n", 107 | "In particular, we want to convert all the binary and string columns into pandas `Categorical` types.\n", 108 | "\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 7, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "import subprocess, glob, sys\n", 118 | "import dill # a better pickle\n", 119 | "import pandas as pd\n", 120 | "import numpy as np\n", 121 | "import matplotlib.pyplot as plt\n", 122 | "\n", 123 | "data_dir = '/home/cdsw'" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 8, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "idcol = 'customerID' # ID column\n", 133 | "labelcol = 'Churn' # label column\n", 134 | "cols = (('gender', True), # (feature column, Categorical?)\n", 135 | " ('SeniorCitizen', True),\n", 136 | " ('Partner', True),\n", 137 | " ('Dependents', True),\n", 138 | " ('tenure', False),\n", 139 | " ('PhoneService', True),\n", 140 | " ('MultipleLines', True),\n", 141 | " ('InternetService', True),\n", 142 | " ('OnlineSecurity', True),\n", 143 | " ('OnlineBackup', True),\n", 144 | " ('DeviceProtection', True),\n", 145 | " ('TechSupport', True),\n", 
146 | " ('StreamingTV', True),\n", 147 | " ('StreamingMovies', True),\n", 148 | " ('Contract', True),\n", 149 | " ('PaperlessBilling', True),\n", 150 | " ('PaymentMethod', True),\n", 151 | " ('MonthlyCharges', False),\n", 152 | " ('TotalCharges', False))" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/html": [ 163 | "
\n", 164 | "\n", 177 | "\n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " 
\n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | "
genderSeniorCitizenPartnerDependentstenurePhoneServiceMultipleLinesInternetServiceOnlineSecurityOnlineBackupDeviceProtectionTechSupportStreamingTVStreamingMoviesContractPaperlessBillingPaymentMethodMonthlyChargesTotalCharges
id
0Female0YesNo1.0NoNo phone serviceDSLNoYesNoNoNoNoMonth-to-monthYesElectronic check29.8529.85
1Male0NoNo34.0YesNoDSLYesNoYesNoNoNoOne yearNoMailed check56.951889.50
2Male0NoNo2.0YesNoDSLYesYesNoNoNoNoMonth-to-monthYesMailed check53.85108.15
3Male0NoNo45.0NoNo phone serviceDSLYesNoYesYesNoNoOne yearNoBank transfer (automatic)42.301840.75
4Female0NoNo2.0YesNoFiber opticNoNoNoNoNoNoMonth-to-monthYesElectronic check70.70151.65
\n", 337 | "
" 338 | ], 339 | "text/plain": [ 340 | " gender SeniorCitizen Partner Dependents tenure PhoneService \\\n", 341 | "id \n", 342 | "0 Female 0 Yes No 1.0 No \n", 343 | "1 Male 0 No No 34.0 Yes \n", 344 | "2 Male 0 No No 2.0 Yes \n", 345 | "3 Male 0 No No 45.0 No \n", 346 | "4 Female 0 No No 2.0 Yes \n", 347 | "\n", 348 | " MultipleLines InternetService OnlineSecurity OnlineBackup \\\n", 349 | "id \n", 350 | "0 No phone service DSL No Yes \n", 351 | "1 No DSL Yes No \n", 352 | "2 No DSL Yes Yes \n", 353 | "3 No phone service DSL Yes No \n", 354 | "4 No Fiber optic No No \n", 355 | "\n", 356 | " DeviceProtection TechSupport StreamingTV StreamingMovies Contract \\\n", 357 | "id \n", 358 | "0 No No No No Month-to-month \n", 359 | "1 Yes No No No One year \n", 360 | "2 No No No No Month-to-month \n", 361 | "3 Yes Yes No No One year \n", 362 | "4 No No No No Month-to-month \n", 363 | "\n", 364 | " PaperlessBilling PaymentMethod MonthlyCharges TotalCharges \n", 365 | "id \n", 366 | "0 Yes Electronic check 29.85 29.85 \n", 367 | "1 No Mailed check 56.95 1889.50 \n", 368 | "2 Yes Mailed check 53.85 108.15 \n", 369 | "3 No Bank transfer (automatic) 42.30 1840.75 \n", 370 | "4 Yes Electronic check 70.70 151.65 " 371 | ] 372 | }, 373 | "execution_count": 9, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "df = df.replace(r'^\\s$', np.nan, regex=True).dropna().reset_index() # drop blank rows\n", 380 | "df.index.name = 'id' # name the index\n", 381 | "data, labels = df.drop(labelcol, axis=1), df[labelcol] # separate out the labels\n", 382 | "data = data[[c for c, _ in cols]] # only use the columns named in `cols`\n", 383 | "data = data.replace({'SeniorCitizen': {1: 'Yes', 0: 'No'}}) # Change 1/0 to Yes/No to match the other binary features\n", 384 | "\n", 385 | "# convert the categorical columns to pd.Categorical form\n", 386 | "for col, iscat in cols:\n", 387 | " if iscat:\n", 388 | " data[col] = pd.Categorical(data[col])\n", 
389 | "labels = (labels == 'Yes') # convert labels from str to bool\n", 390 | "\n", 391 | "data.head()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "## Machine learning model\n", 399 | "\n", 400 | "This step follows a fairly standard ML workflow, which is to create a pipeline to:\n", 401 | "\n", 402 | "* Encode the categorical features as numeric\n", 403 | "* Normalize the numeric features\n", 404 | "* Train a classification model using these processed features\n", 405 | "\n", 406 | "We use *one-hot encoding*, *standardization*, and *logistic regression with cross-validation* for the three steps.\n", 407 | "Then we can evaluate the model's performance.\n", 408 | "\n", 409 | "Note: `CategoricalEncoder` and, later, `ExplainedModel` are helper classes pulled and edited from the original CFFL [interpretability report code](https://ff06-2020.fastforwardlabs.com/).\n", 410 | "You can inspect `churnexplainer.py` to see what they do under the hood.\n", 411 | "CML lets you continue to write modular code to keep things segregated and clean." 
412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 10, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "from sklearn.model_selection import train_test_split\n", 421 | "from sklearn.metrics import classification_report\n", 422 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", 423 | "from sklearn.pipeline import Pipeline\n", 424 | "from sklearn.linear_model import LogisticRegressionCV, LogisticRegression\n", 425 | "from sklearn.compose import ColumnTransformer\n", 426 | "\n", 427 | "from churnexplainer import CategoricalEncoder # convert Categorical columns into numeric" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 11, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "ce = CategoricalEncoder()\n", 437 | "X = ce.fit_transform(data) # Categorical columns now have values 0 to num_categories-1\n", 438 | "y = labels.values\n", 439 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 440 | "\n", 441 | "cat_cols = list(ce.cat_columns_ix_.values()) # indices of the categorical columns (now numeric)\n", 442 | "ct = ColumnTransformer(\n", 443 | " [('ohe', OneHotEncoder(), cat_cols)],\n", 444 | " remainder='passthrough'\n", 445 | ")\n", 446 | "clf = LogisticRegressionCV(cv=5,solver='lbfgs', max_iter=100)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 23, 452 | "metadata": {}, 453 | "outputs": [ 454 | { 455 | "name": "stderr", 456 | "output_type": "stream", 457 | "text": [ 458 | "/home/cdsw/.local/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. 
Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", 459 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", 460 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", 461 | " warnings.warn(msg, FutureWarning)\n" 462 | ] 463 | }, 464 | { 465 | "name": "stdout", 466 | "output_type": "stream", 467 | "text": [ 468 | "train 0.8077360637087599\n", 469 | "test 0.7912400455062572\n", 470 | " precision recall f1-score support\n", 471 | "\n", 472 | " False 0.84 0.89 0.86 1300\n", 473 | " True 0.62 0.52 0.56 458\n", 474 | "\n", 475 | " accuracy 0.79 1758\n", 476 | " macro avg 0.73 0.70 0.71 1758\n", 477 | "weighted avg 0.78 0.79 0.79 1758\n", 478 | "\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "pipe = Pipeline([('ct', ct), # 1. Encode the categorical features as numeric\n", 484 | " ('scaler', StandardScaler()), # 2. Normalize the numeric features\n", 485 | " ('clf', clf)]) # 3. Train a classification model using these processed features\n", 486 | "pipe.fit(X_train, y_train)\n", 487 | "train_score = pipe.score(X_train, y_train)\n", 488 | "test_score = pipe.score(X_test, y_test)\n", 489 | "print(\"train\",train_score)\n", 490 | "print(\"test\", test_score) \n", 491 | "print(classification_report(y_test, pipe.predict(X_test)))" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "### Compare with Random Forest\n", 499 | "Just for a comparison, lets compare this model to a Random Forest model.\n", 500 | "This is simpler since Random Forests do not need the categorical features encoded with a `OneHotEncoder`." 
501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 14, 506 | "metadata": {}, 507 | "outputs": [ 508 | { 509 | "name": "stdout", 510 | "output_type": "stream", 511 | "text": [ 512 | "train 0.9981039059537353\n", 513 | "test 0.7895335608646189\n", 514 | " precision recall f1-score support\n", 515 | "\n", 516 | " False 0.83 0.90 0.86 1300\n", 517 | " True 0.63 0.47 0.54 458\n", 518 | "\n", 519 | " accuracy 0.79 1758\n", 520 | " macro avg 0.73 0.69 0.70 1758\n", 521 | "weighted avg 0.78 0.79 0.78 1758\n", 522 | "\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "from sklearn.ensemble import RandomForestClassifier\n", 528 | "clf_rf = RandomForestClassifier(n_estimators=100)\n", 529 | "pipe_rf = Pipeline([('scaler', StandardScaler()),\n", 530 | " ('clf', clf_rf)])\n", 531 | "pipe_rf.fit(X_train, y_train)\n", 532 | "train_score = pipe_rf.score(X_train, y_train)\n", 533 | "test_score = pipe_rf.score(X_test, y_test)\n", 534 | "print(\"train\",train_score)\n", 535 | "print(\"test\", test_score)\n", 536 | "print(classification_report(y_test, pipe_rf.predict(X_test)))" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "### Plot ROC Curve\n", 544 | "\n", 545 | "We can also generate an ROC Curve to visualize the model's performance and calculate the AUROC" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 24, 551 | "metadata": {}, 552 | "outputs": [ 553 | { 554 | "name": "stdout", 555 | "output_type": "stream", 556 | "text": [ 557 | "Logistic: AUROC=0.834\n" 558 | ] 559 | }, 560 | { 561 | "data": { 562 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAtgAAAHpCAYAAAC4HUuQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAWJQAAFiUBSVIk8AAAHvBJREFUeJzt3f+rZfdd7/HXJDOj/R4kjVB7i+ZS1EhscxRyTXMtEvpDJXiFaz8/tOK1FGJABvqT+Ru8v1iJFOy113DjF/Ke5oLSNmJIqG0I5gfP/BA7hVpLrTcFk2grNJPpTJK5P8w54549+8zsc/Znf1l7PR5QmLNm7VmrfHqmz/OZz/qsY5cuXQoAANDHTeu+AQAA2CYCGwAAOhLYAADQkcAGAICOBDYAAHQksAEAoCOBDQAAHQlsAADoSGADAEBHAhsAADoS2AAA0JHABgCAjo6v+wYO66mnnrq07nsAAGAc7rvvvmOH/YwZbAAA6GhwM9j7dnZ2Vnq93d3dtVyX1TLO42Cct58xHgfjPA7rGuf96x6FGWwAAOhIYAMAQEcCGwAAOhLYAADQkcAGAICOBDYAAHQksAEAoCOBDQAAHQlsAADoSGADAEBHAhsAADoS2AAA0JHABgCAjo73+oNaa7+Q5ERVffmA3/+ZJL+U5HyStyT5XFW90Ov6AACwCRYO7Nba/Unek+TLSd5/wDnHknygqv5g4tgDST6z6PUBAGCT9Fgi8oWq+nRV/f11zvlAkqemjn2jtfbuDtcHAICNsfAMdlVdmuO091TVM1PHnk/yviT/7yjX3d3dPcrHjuzcuXNruS6rZZzHwThvP2M8DsZ5c3zqzIU8//IbS73GZzOcce62BvsGZs2Uv5zknSu6PgDAIK0iXulrVYF9jaq61Fq7+aif39nZ6Xk7N7T/0/Gqr8tqGedxMM7bzxiPw6aN86nTZ/PMN7+37ttYm3tvvyUPf+SO7n/uusZ5kX8ZWVVgX7OMZO/BRz+OAQBbYZlxvax4ZTlWFdizQvqdSV5a0fUBgC22SbPHZx66Z923wJqtKrD/ubX23qr6h4lj70vy9RVdHwBYg5WE75PPLvfPP4R7b79l3bfABlhJYFfVM621B5NMBvZPV9WTq7g+ALB6q55VtoyCTbG0wG6t3Z0kVfXc3qGvtNZOJXk1yTuSPL6sawPAGGzSsojr2baH3+BGugZ2Vf3JxK+fm/q9ryb5as/rAcAYDSWsE7PKjNPatukDAOZ3UFQLWNg8AhsANpiwhuER2ACwgWaFtaiGYRDYALBBhDUMn8AGgCXo8SCisIZhEtgA0Emv3T2ENQybwAaATiztABKBDQCHMs8s9ZmH7lnR3QCbSGADQPou7wDGTWADMGpHCWtLP4DrEdgAbKyFZ5WffPZQpwtnoAeBDcDG6rFkYx7CGuhJYAOwkU6dPnvl14d9aHB3dzdJsrOz0/WeAOYhsAFYm3mWgHhoEBgagQ3ASniYEBgLgQ1AV4cNaRENbBuBDcCBeu0NvU9MA2MgsAG4oufLVoQ0MFYCG2BErIMGWD6BDbDlrIkGWC2BDbAFDhPRAhpguQQ2wBaYZy9pUQ2wGgIbYIsc9o2HAPQnsAEGpvfWeQD0JbABNthh11YDsH4CG2CDzBvU1lQDbC6BDbBits0D2G4CG2AFbKMHMB4CG6ADs9IA7BPYAIewyA4eohpgHAQ2wBzmCWsBDUAisAEOdFBUC2kArkdgA0wR1gAsQmADRFQD0I/ABkiuiWthDcBRCWxglD515kKef/LZa46feeieNdwNANtEYAOjcnkpyPmZv3fv7bes+G4A2EYCG9hKttUDYF0ENrCVrhfXd956Uz5518ns7IhrAPoT2MBWm7Wmend3dw13AsBY3LTuGwDo7dTps+u+BQBGzAw2MHjX28MaAFZNYAODdb0HGT3ACMC6CGxgkKbjWlADsCkENjA4k3EtrAHYNAIbGIRZy0HENQCbyC4iwMYT1wAMiRlsYONZDgLAkAhsYCPM82pzcQ3AEAhsYC3mCepJ9rQGYCgENrBy13sxjFlqAIZOYAMrZYs9ALadXUSAlRLXAGw
7M9jA0s1aEiKuAdhWZrCBpZu1hzUAbCsz2MDC5t0R5MxD96zgbgBgvQQ2cGSH2WrPrDUAYyGwgUOxxR4AXJ/ABg5l1npqYQ0A/0FgA9d10Iy19dQAMJvABq5xo7XV1lMDwMEENnCVWXFtGQgAzE9gA6IaADoS2DBC8ywBEdcAcDQCG0bkemEtqgGgD4ENIzIZ14IaAJZDYMNInDp99sqvbbEHAMuzcGC31k4keSDJhSQ3Jfl2VT0x47xfTvLje+e9PckTVfW1Ra8P3Njk0hBb7AHAcvWYwW5JHqmqV5KktXZ/a+1dVfWdKye09pNJ3qiqT08c+2Rr7RtVdbHDPQAzTK+5tiwEAJavR2Cf2I/rPV9M8rEkj04c+/mq+tOpz30pyU8leb7DPQB7DnqQUVwDwGrctMiHW2snk3x/8lhVvZHktalTj7XWpmP+Tbm8XAToaNZ+1mceukdcA8CKLDqDfVuSl2Ycnw73zyf5RGvtM1V1qbX2llye1X74qBfe3d096keP5Ny5c2u5Lqu1TeP82Q/98N6vzm/Ff5+etmmcmc0Yj4NxHochjvNCM9i5HOivzzh+bPKLqvpekq8m+dPW2u8k+aMkjy14bWDKp874RyEAWLdFZ7BfS3LzjU5qrb0/yW1V9dG9r08m+e29Ge1Xrv/p2XZ2do7ysSPb/6lp1ddltYY8zqdOn83zL59PcnlZyM6OJSEHGfI4Mx9jPA7GeRzWNc6LzJgvGtgvJrl7jvN2qup/739RVRdaa59O8t+T/NmC9wCjNeuBRg8zAsB6LbREpKr297S+orV2U5KTU6de8+/WVfWDRa8PYyeuAWDz9Ajci621t018fX+Sp6fOebW19mOTB1prH07ydx2uD6M0/WZGO4UAwGbosQ/2Y0kebK2dT3IiyQtV9a3W2t1JUlXPVdXjrbWP77318fUkP5Tk697kCIc36+UxAMDmWDiw997EeM12e1X13NTXf7zotWDsvJkRADZfjxlsYEm8lREAhkdgwwY5KKgniWsA2GwCG9bsRlEtqAFgWAQ2rJF9rAFg+whsWJPJuBbVALA9vOgF1kRcA8B2EtiwBpMviRHXALBdLBGBFbjednsAwHYxgw0rYC9rABgPM9iwJLNmrc88dM+a7gYAWBUz2LAEB22/BwBsPzPYsAR2CAGA8TKDDUskrgFgfAQ2dDa5BR8AMD4CGzqbXB4CAIyPwIYlsTwEAMZJYAMAQEcCGzqy/hoAENjQkfXXAIDAhiWw/hoAxsuLZmABs97YCACMmxlsWMCsuLY8BADGzQw2HMH0zPWZh+5Z490AAJvEDDYcwWRcm7EGACaZwYYFmLkGAKYJbJiTBxoBgHlYIgJzmo5rS0MAgFnMYMMhWRYCAFyPGWyYg1egAwDzEthwA5Nrry0LAQBuRGDDDUzGtVegAwA3IrBhTuIaAJiHhxxhBlvyAQBHJbBhz+WoPr/31fmrfs/aawBgXgIb9sza59qyEADgsAQ25Opt+D77oR/Ozs7OGu8GABgygc1ozVpnfeetnvsFABYjsBmtWUtCPv6fzx9wNgDAfAQ2ozf56vPd3d013gkAsA0ENqNjCz4AYJkENqMxK6xtvwcA9CawGYXpuLYFHwCwLAKbUdiPa2ENACybPcnYepN7XItrAGDZzGCztWYtCwEAWDYz2Gwla64BgHUxg81WsuYaAFgXM9hsHWuuAYB1EthsncnZawCAVRPYbBWz1wDAuglstorZawBg3QQ2W8PsNQCwCQQ2W8PsNQCwCQQ2W8HsNQCwKeyDzaB5WyMAsGnMYDNo3tYIAGwaM9hshTMP3bPuWwAASGIGmwGbXHcNALApBDaDNLn22rprAGCTCGwGZzqurbsGADaJwGZQxDUAsOkENoMhrgGAIRDYDIK4BgCGQmAzCOIaABgK+2Cz0abf1CiuAYBNZwabjeU16ADAEJnBZmNZFgIADNHCgd1aO5HkgSQXcnlG/NtV9cQB596R5ENJziX566r6p0W
vz3aafEujuAYAhqTHDHZL8khVvZIkrbX7W2vvqqrvXHVSaz+X5I6q+v0O12TLeUsjADBUPdZgn9iP6z1fTHLfjPM+WFWPdrgeI2L2GgAYmoUCu7V2Msn3J49V1RtJXps6784kf7vItRiPyeUhAABDs+gSkduSvDTj+HS47yT5i9baR5Oc3Lvu41X13aNeeHd396gfPZJz586t5bpj86kzF/L8y28kSe689SbjzFIY5+1njMfBOI/DEMd50SUix5O8PuP4samv35rkV5J8rqoeSfLHST6x94AkXDEZ15+86+Sa7wYA4PAWncF+LcnNc5x3LMmfV9XFJKmq11trn03y4SR/eZQL7+zsHOVjR7b/U9Oqrzs6Tz6bJPk/n/gva7m8cR4H47z9jPE4GOdxWNc4LzJjvugM9otJbp3jvO/ux/W+veUh71jw+mwRa68BgG2wUGBX1YUkb5881lq7KZfXWU96tbX25qnz3pTZy0sYKVvzAQDboMc2fRdba2+b+Pr+JE9PnfNXST42dezX944zYqdOn81dv/ts7vrdZ68cszUfADBkPV4081iSB1tr55OcSPJCVX2rtXZ3klTVc1V1rrX2pdbabyf5QZI3JTlTVf/W4foMzKnTZ6/MVk8zew0ADN3Cgb23tvrhGcefm/r6H5L8w6LXY9hmxfW9t99i1hoA2Bo9ZrBhbpPrrEU1ALCNeqzBhkMT1wDAtjKDzUpcb901AMA2MYPNSkzGtQcZAYBtZgabpZt8gcyZh+5Z450AACyfGWyWzgtkAIAxEdisjAcbAYAxENgAANCRwAYAgI485MjS2JoPABgjgU13B70OHQBgDAQ2XU3HtVeiAwBjI7DpanJLPmENAIyRhxxZCnENAIyVwAYAgI4ENt1MvhIdAGCsBDbdeCU6AIDAppPJ2WvrrwGAMRPYdGH2GgDgMoFNV2avAYCxE9gAANCRwAYAgI4ENguzPR8AwH/wqnSO7NTps1cebkw84AgAkJjBZgHTce0BRwAAM9gc0eSykDMP3bPGOwEA2CwCm0OxLAQA4PosEWFus+LashAAgKuZwWYuk3EtrAEADiawmWl6tnqfuAYAuD5LRJhJXAMAHI0ZbK5hhxAAgKMzg81VptdaAwBwOAKbKzzICACwOIHNFeIaAGBxAptriGsAgKMT2CS5+sFGAACOTmCTJB5sBADoRGBz1ey15SEAAIsR2CNnWz4AgL4E9sjZOQQAoC+BTRJLQwAAehHYI2bnEACA/gT2iFl7DQDQn8DG8hAAgI4ENgAAdCSwAQCgI4E9Uh5wBABYDoE9Ql4uAwCwPMfXfQOszmRYJ14uAwCwDGawR0JcAwCshhnsEZheEiKsAQCWxwz2CIhrAIDVEdhbbnK3EHENALB8AnvL2S0EAGC1BPYWM3sNALB6AnuLmb0GAFg9gT0CZq8BAFZHYAMAQEcCGwAAOhLYW2ryAUcAAFZHYG+h6Tc3AgCwOgJ7C3lzIwDA+gjsLSauAQBWT2BvGWuvAQDWS2BvGWuvAQDWS2BvKctDAADW4/iif0Br7USSB5JcyOVg/3ZVPXGDz/xqkrdW1Z8sen3+g+UhAADr12MGuyV5pKr+V1X9YZKbW2vvOvDk1n40HcKea1keAgCwfj0C+0RVvTLx9ReT3Hed81uSxztclwNYHgIAsD4LBXZr7WSS708eq6o3krx2wPkfSvJ0VV1a5LoAALCpFl2qcVuSl2YcvybcW2tvSfITVfXkgtdMkuzu7vb4Y+Z27ty5tVz3KIZwj5tqSOPM0Rnn7WeMx8E4j8MQx3nRJSLHk7w+4/ixGcc+nuTRBa/HAT515sK6bwEAgCw+g/1akptvdFJr7eeSfK2qXl3welfs7Oz0+qPmsv9T06qvO49Tp8/m+ZfPJ7n8gOPOjjXYR7XJ40w/xnn7GeNxMM7jsK5xXmTGfNHAfjHJ3XOc94tJvtta+82JY7/QWjue5M+qyvTrEZ06ffaq3UM84AgAsF4LBXZVXWi
tvX3yWGvtpiQnp877venPttZesw/24sQ1AMBm6bFN38XW2tsmvr4/ydMd/lxuYPLFMuIaAGAz9Hjhy2NJHmytnU9yIskLVfWt1trdSVJVz3W4BjN4sQwAwOZZOLCr6mKSh2ccv25YWx7Sj9lrAIDN0WOJCAAAsEdgAwBARwJ7oCYfcAQAYHMI7IHygCMAwGYS2ANkez4AgM0lsAfI7DUAwOYS2ANm9hoAYPMIbAAA6EhgAwBARwJ7YGzPBwCw2QT2gJw6fdYDjgAAG05gD8hkXHvAEQBgMwnsgbD3NQDAMAjsAbA0BABgOAT2hpuOa7PXAACbTWBvOHENADAsAnsgxDUAwDAI7A1mz2sAgOER2BvKg40AAMMksDeQBxsBAIZLYG8YcQ0AMGwCe8OIawCAYRPYG0pcAwAMk8AGAICOjq/7Brhscu01AADDJbDX6KCoti0fAMBwCew1uF5YW3sNADBsAnsNJuNaVAMAbBeBvUZnHrpn3bcAAEBnAnuFPMgIALD9bNO3QtNLQwAA2D5msNfA0hAAgO1lBntFTp0+u+5bAABgBQT2iuwvD7E0BABguwnsFbMlHwDAdhPYAADQkcAGAICOBDYAAHQksAEAoCOBDQAAHQlsAADoSGCvgJfMAACMh8BeAS+ZAQAYD4G9Ql4yAwCw/QQ2AAB0JLCXzPprAIBxEdhLZv01AMC4COwVsf4aAGAcjq/7BrbVqdNnr8xeAwAwHmawl2Qyri0PAQAYDzPYS3bmoXvWfQsAAKyQGewlsHMIAMB4CewlsHMIAMB4CewlsnMIAMD4CGwAAOhIYAMAQEcCGwAAOhLYndlBBABg3AR2Z3YQAQAYN4G9JHYQAQAYJ4ENAAAdCWwAAOhIYHfkAUcAAAR2Rx5wBABAYC+BBxwBAMZLYAMAQEcCGwAAOjq+6B/QWjuR5IEkF3I52L9dVU/MOO/+JO9OcjHJm5M8VlUvLnp9AADYJAsHdpKW5JGqeiW5HNKttXdV1XeunNDa+5P8e1V9fuLYbyX5ww7XBwCAjdFjiciJ/bje88Uk902dc2dVfWXq2Pc7XBsAADbKQoHdWjuZqVCuqjeSvDZ16hdmfPzmRa4NAACbaNElIrcleWnG8avCvar+bfLr1tpNWTCwd3d3F/n4oZ07d27u66763ujnMOPMcBnn7WeMx8E4j8MQx3nRJSLHk7w+4/ixG3zu15L85YLXBgCAjbPoDPZrOeRMdGvtZ5P8S1X96yIX3tnZWeTjh7b/U9N1r/vkszc+h4021zgzeMZ5+xnjcTDO47CucV5kxnzRGewXk9w678mttR9JcldV/c2C1wUAgI20UGBX1YUkb588tre++uT0uXvHfyPJo4tcEwAANlmPbfouttbeNvH1/UmennHeJ5I8urfLCAAAbKUeL5p5LMmDrbXzSU4keaGqvtVauztJquq51toHk9yV5FJrbfKzZ6rq7zrcAwAAbISFA7uqLiZ5eMbx5yZ+/TdJrLsGAGDr9VgiAgAA7BHYnZw6fXbdtwAAwAYQ2J08883vJUnuvf2WNd8JAADrJLA7e/gjd6z7FgAAWCOBDQAAHQlsAADoSGADAEBHAhsAADoS2B3Yog8AgH0CuwNb9AEAsE9gd2SLPgAABDYAAHQksAEAoCOBDQAAHQlsAADoSGADAEBHAhsAADoS2AAA0JHABgCAjgQ2AAB0JLABAKAjgQ0AAB0JbAAA6EhgAwBARwIbAAA6EtgLOnX67LpvAQCADSKwF/TMN7+XJLn39lvWfCcAAGwCgd3Jwx+5Y923AADABhDYAADQkcAGAICOBDYAAHQksAEAoCOBDQAAHQnsBdgDGwCAaQJ7AfbABgBgmsDuwB7YAADsE9gAANCRwD4i668BAJhFYB+R9dcAAMwisBdk/TUAAJMENgAAdCSwj8D6awAADiKwj8D6awAADiKwF2D9NQAA0wQ2AAB0JLABAKAjgQ0AAB0JbAAA6EhgAwBARwIbAAA6EtgAANC
RwAYAgI4ENgAAdCSwD+nU6bPrvgUAADaYwD6kZ775vSTJvbffsuY7AQBgEwnsI3r4I3es+xYAANhAAhsAADoS2AAA0JHABgCAjgQ2AAB0JLABAKAjgX0InzpzYd23AADAhhPYh/D8y28ksQc2AAAHE9hHYA9sAAAOIrABAKAjgQ0AAB0JbAAA6Oj4on9Aa+1EkgeSXMjlYP92VT0x47yfSfJLSc4neUuSz1XVC4tef1XsIAIAwDwWDuwkLckjVfVKkrTW7m+tvauqvnPlhNaOJflAVf3BxLEHknymw/VXwg4iAADMo8cSkRP7cb3ni0numzrnA0memjr2jdbauztcf6XsIAIAwPUsFNittZNJvj95rKreSPLa1Knvqap/nDr2fJKfWuT6AACwaRZdInJbkpdmHJ8O91kh/3KSdx71wru7u0f96ELWdV1W49y5c0mM87YzztvPGI+DcR6HIY7zoktEjid5fcbxYzf6YFVdSnLzgtcHAICNsugM9muZL5IvTR/Ye/DxjaNeeGdn56gfPZLPZnct12W19n86Ns7bzThvP2M8DsZ5HNY1zovMmC86g/1iklvnOG9WSL8zs5eXAADAYC0U2FV1IcnbJ4+11m5KcnLq1H9urb136tj7knx9kesDAMCm6bFN38XW2tsmvr4/ydOTJ1TVM7l2676frqp/6nB9AADYGD1eNPNYkgdba+eTnEjyQlV9q7V2d5JU1XN7532ltXYqyatJ3pHk8Q7XBgCAjbJwYFfVxSQPzzj+3NTXX03y1UWvBwAAm6zHEhEAAGCPwAYAgI4ENgAAdCSwAQCgI4ENAAAdCWwAAOhIYAMAQEcCGwAAOhLYAADQkcAGAICOBDYAAHQksAEAoKNjly5dWvc9HMpTTz01rBsGAGCw7rvvvmOH/YwZbAAA6GhwM9gAALDJzGADAEBHAhsAADoS2AAA0JHABgCAjgQ2AAB0JLABAKAjgQ0AAB0JbAAA6EhgAwBARwIbAAA6Or7uG9g0rbUTSR5IciGXfwD5dlU9MeO8n0nyS0nOJ3lLks9V1QurvFeO7hDjfH+Sdye5mOTNSR6rqhdXea8czbxjPPWZX03y1qr6kxXcIh0cZpxba3ck+VCSc0n+uqr+aWU3ykIO8Xf2Lyf58b3z3p7kiar62gpvlQW11n4hyYmq+vIBvz+I/hLY12pJHqmqV5LLgdVae1dVfefKCa0dS/KBqvqDiWMPJPnMyu+Wo5pnnN+f5N+r6vMTx34ryR+u/G45ihuO8VUnt/aj8XfiEM01zq21n0tyR1X9/jpukoXN83f2TyZ5o6o+PXHsk621b1TVxdXfMoexN6H1niRfTvL+A84ZTH9ZInKtE/vfwHu+mOS+qXM+kOSpqWPfaK29e6l3Rk/zjPOdVfWVqWPfX+5t0dE8YzypJXl8ubfEEsw7zh+sqkdXdE/0N884/3xV/dXUsS8l+all3hjdfKGqPl1Vf3+dcwbTXwJ7QmvtZKYCqqreSPLa1Knvqap/nDr2fHwTD8IhxvkLMz5+87Lui34OMcb7538oydNVdWkFt0cn845za+3OJH+7wlujo0N8Px9rrU3/K9Sbcnm5CBtuzr9/B9Nf/jn0arcleWnG8ekfRGb9YPJyknd2vyOWYa5xrqp/m/y6tXZTBPZQzPu9nNbaW5L8RFU9ufS7ord5x3knyV+01j6a5GQu/3/f41X13SXfH33MO86fT/KJ1tpnqurS3vf2z1fVw0u/Q1ZlMP1lBvtqx5O8PuP4sRt9cO8nL/E1DEcd519L8pf9b4clOMwYfzyJpQPDNO84vzXJr+Tyw1CPJPnjXA6xE8u9PTqZa5yr6ntJvprkT1trv5Pkj5I8tvzbY502tb8E9tVey3yDdM0/Y+wtvH+j+x2xDPOO8xWttZ9N8i9V9a/LuSU6m2uM9x58+1pVvbr8W2IJ5v1ePpbkz6vqQpJU1etJPpvkw0u8N/qZ9/v5/Uluq6qPVtX/TPI/knxsbyab7TC
Y/hLYV3sxya1znDdrIN+Z2f+ExeaZd5yTJK21H0lyV1X9zfJuic7mHeNfTPKfWmu/uf+fJP9179cnl3qH9DDvOH93eheJveUh71jKXdHbvOO8U1X/d/+LvR+oPp3kvy3rxli5wfSXwJ6w98349slje+tup/+P9p9ba++dOva+JF9f4u3RySHGef/4b8QSgkGZd4yr6veq6pHJ/yT5yt6vPRi14Q7xvfxqa+3NU+e9KbOXHbBhDjHO13zPVtUPonW2yWD6y//ornWxtfa2ia/vT/L05AlV9Uyu3R7op720YFBuOM57PpHk0b0n1hmWeceYYZtnnP8qycemjv363nGGYZ5xfrW19mOTB1prH07yd8u+OVZjSP1lF5FrPZbkwdba+SQnkrxQVd9qrd2dJFX13N55X2mtnUryai7/M6P9c4flhuPcWvtgkruSXGqtTX72TFX5C3vzzfu9zLDdcJyr6lxr7Uuttd9O8oNc3rrtzPROQWy0ecb58dbax/ceXn09yQ8l+bo3OQ7bUPvr2KVLtn0FAIBeLBEBAICOBDYAAHQksAEAoCOBDQAAHQlsAADoSGADAEBHAhsAADoS2AAA0JHABgCAjgQ2AAB0JLABAKAjgQ0AAB0JbAAA6EhgAwBARwIbAAA6EtgAANDR/wcTWcaQAppCuAAAAABJRU5ErkJggg==\n", 563 | "text/plain": [ 564 | "" 565 | ] 566 | }, 567 | "metadata": { 568 | "needs_background": "light" 569 | }, 570 | "output_type": "display_data" 571 | } 572 | ], 573 | "source": [ 574 | "from sklearn.metrics import roc_curve\n", 575 | "from sklearn.metrics import roc_auc_score\n", 576 | "from matplotlib import pyplot\n", 577 | "\n", 578 | "logistic_regression_probabilities = pipe.predict_proba(X_test)\n", 579 | "logistic_regression_probabilities = logistic_regression_probabilities[:, 1]\n", 580 | "logistic_regression_auc = roc_auc_score(y_test, logistic_regression_probabilities)\n", 581 | "print('Logistic: AUROC=%.3f' % (logistic_regression_auc))\n", 582 | "logistic_regression_fpr, logistic_regression_tpr, _ = roc_curve(y_test, logistic_regression_probabilities)\n", 583 | "pyplot.plot(logistic_regression_fpr, logistic_regression_tpr, label='Logistic')\n", 584 | "pyplot.show()" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "We find an AUC of 0.83. 
Not bad for a quick exercise without fine tuning.\n" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "## Interpretability model\n", 599 | "We use [lime](https://github.com/marcotcr/lime) (Local Interpretable Model-Agnostic Explanations) to explain the predictions.\n", 600 | "It is a method of determining which feature has the greatest effect on the predicted value,\n", 601 | "and is explained in depth in the [FFL report](https://ff06-2020.fastforwardlabs.com/).\n", 602 | "For more information, refer to the [lime documentation](https://lime-ml.readthedocs.io/en/latest/lime.html)." 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 16, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "from lime.lime_tabular import LimeTabularExplainer\n", 612 | "\n", 613 | "data[labels.name + ' probability'] = pipe.predict_proba(X)[:, 1]\n", 614 | "\n", 615 | "# List of length number of features, containing names of features in order\n", 616 | "# in which they appear in X\n", 617 | "feature_names = list(ce.columns_)\n", 618 | "\n", 619 | "# List of indices of columns of X containing categorical features\n", 620 | "categorical_features = list(ce.cat_columns_ix_.values())\n", 621 | "\n", 622 | "# Dict mapping index -> [cat1, cat2...], where each index\n", 623 | "# is that of a categorical column in X, and the list of strings are the\n", 624 | "# possible values it can take\n", 625 | "categorical_names = {i: ce.classes_[c]\n", 626 | " for c, i in ce.cat_columns_ix_.items()}\n", 627 | "class_names = ['No ' + labels.name, labels.name]\n", 628 | "explainer = LimeTabularExplainer(ce.transform(data),\n", 629 | " feature_names=feature_names,\n", 630 | " class_names=class_names,\n", 631 | " categorical_features=categorical_features,\n", 632 | " categorical_names=categorical_names) \n", 633 | "\n" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 
| "source": [ 640 | "## Explaining a Single Prediction\n", 641 | "\n", 642 | "Let's look at how one specific prediction would be interpreted.\n", 643 | "Lime explains the prediction by giving every feature a weight from -1 to 1.\n", 644 | "Features with weights closer to -1 have a stronger impact in coming up with a 0 prediction result (will not churn) and vice versa." 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 17, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "data": { 654 | "text/html": [ 655 | "
\n", 656 | "\n", 669 | "\n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | "
id4809
genderFemale
SeniorCitizen0
PartnerNo
DependentsNo
tenure1
PhoneServiceYes
MultipleLinesNo
InternetServiceNo
OnlineSecurityNo internet service
OnlineBackupNo internet service
DeviceProtectionNo internet service
TechSupportNo internet service
StreamingTVNo internet service
StreamingMoviesNo internet service
ContractMonth-to-month
PaperlessBillingNo
PaymentMethodMailed check
MonthlyCharges19.9
TotalCharges19.9
Churn probability0.220148
\n", 759 | "
" 760 | ], 761 | "text/plain": [ 762 | "id 4809\n", 763 | "gender Female\n", 764 | "SeniorCitizen 0\n", 765 | "Partner No\n", 766 | "Dependents No\n", 767 | "tenure 1\n", 768 | "PhoneService Yes\n", 769 | "MultipleLines No\n", 770 | "InternetService No\n", 771 | "OnlineSecurity No internet service\n", 772 | "OnlineBackup No internet service\n", 773 | "DeviceProtection No internet service\n", 774 | "TechSupport No internet service\n", 775 | "StreamingTV No internet service\n", 776 | "StreamingMovies No internet service\n", 777 | "Contract Month-to-month\n", 778 | "PaperlessBilling No\n", 779 | "PaymentMethod Mailed check\n", 780 | "MonthlyCharges 19.9\n", 781 | "TotalCharges 19.9\n", 782 | "Churn probability 0.220148" 783 | ] 784 | }, 785 | "execution_count": 17, 786 | "metadata": {}, 787 | "output_type": "execute_result" 788 | } 789 | ], 790 | "source": [ 791 | "data.sample().T # reminder of the features" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": 18, 797 | "metadata": {}, 798 | "outputs": [ 799 | { 800 | "name": "stdout", 801 | "output_type": "stream", 802 | "text": [ 803 | "('tenure > 55.00', -0.2764138466515261)\n", 804 | "('MonthlyCharges > 89.86', -0.24321978003513584)\n", 805 | "('InternetService=Fiber optic', 0.2096249701592442)\n", 806 | "('TotalCharges > 3794.74', 0.2031826086609449)\n", 807 | "('StreamingMovies=Yes', 0.08274884799449057)\n", 808 | "('StreamingTV=Yes', 0.07781839117828696)\n", 809 | "('PhoneService=Yes', 0.04962121848245511)\n", 810 | "('MultipleLines=Yes', 0.04446637536101756)\n", 811 | "('Contract=One year', -0.04392535067270691)\n", 812 | "('TechSupport=No', 0.04173749428961184)\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "exp = explainer.explain_instance(ce.transform(data.sample())[0],pipe.predict_proba)\n", 818 | "for cols in exp.as_list():\n", 819 | " print(cols)" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": 19, 825 | "metadata": {}, 826 | "outputs": [ 827 | { 
828 | "data": { 829 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1AAAAIwCAYAAACImIrfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAWJQAAFiUBSVIk8AAAIABJREFUeJzs3Xm8XVV99/HPTgKEMCOCIk6Viog4pFasUEES0eLYan84wAOOLVXRVh/b56l9OmifTpa2zsgsOP2wKghOTRAsiEyXwTJJHgoqKFOYEwhJ9vPHXgd2Dufcu29yk5Ob+3m/Xnlxz15rr7323ifhfO9ae52qrmskSZIkSRObNeoOSJIkSdJ0YYCSJEmSpI4MUJIkSZLUkQFKkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JEBSpIkSZI6MkBJkiRJUkdzRt0BSZI0PVRV9STgvUAFXFDX9ekj7pIkbXBVXdej7oMkaR1UVfWxuq4/Mup+jKeqqgOAOXVdLxp1XwaZDtcQoKqqC4ED6rpePoJjbwYcAxxV1/X9G/r4w1RVdQRwXl3XS0bdl2GqqnoicCSwGTCbZgbQauCYuq7/X1/dafFelGYyR6AkSdrIVFX1V8DH6rpe2Vf00rquHxpBlwD2BXJjCk/TQVVVzwT+CPjLuq7vaW2fC3y0qqrT6rq+aGQdlDRpPgMlSdI0McLwBPBk4L9HePxpp6qqCngf8KF2eAKo6/pB4H8B76mqys9j0jTiX1hJktTFbGDVqDsxzewPfHvASCIAZfvZwG9t0F5JWidO4ZOkGaSqqlcDLweW0zyPsQXw2bqurxpQ9wjgeTTPamwJ3A/8bd80pM1ppiftSrOwwKzS9j/1/8a9Y/+eCryztDOv/PlsXdeXl/IXAK+o6/rvB+z7J8CP67r+UVVVTwMWApcC/wOoy5/ZwNF1Xf9sgn7sDLyf5v+Ts2mu09U0z6ysbtX7WF3XH6mq6pXAK4AV5Th3luOsatWdBRwO7NXqSw18vK7rX5Y6+wOvAV4CbFNVVQ18o67r89vH6+vrvkDw6D3dEji1rusf9dX7GPAXNPfrGaWvWwDn1nX9zXGuxW7AB0q/96uq6m5geV3Xf9Gq0+l9VfpwbLm2q4Gr67o+YZxjPxt4N7CynNds4OS6ri8Ytk/Zb3/g9cDDZZ8tgRPqur6kr96TgaNKX2YBc4Frgc/13btn07wvV9K8z7eiee7qS+P1AzgA+NgEdRYBeww4hy7vqaHPS/WXlWt/NPBhYHPg5rqu/7n3fCJwDfCeUraS5h4eXdf1zyfovzTjGKAkaYaoquoPgYfqun5/a9tc4J+qqvpiXdc/bm1/H3BdXdcntbY9BfhUVVVva/1G/ZM0H7Kua9V7BnA88MZJ9u/pwJuAv+61XxYu+HhVVcfVdf2Tuq4vq6rqoKqqXtR+bqSqqucBW/eFhhcCjwM+2As9VVXNA/6lqqoP1XV935B+7AT8X5ppV3e3tr8W+Mvyp13/EGBuXdd/3Nq2D/CHwKdbVf+OJgyd2Kq3C/D5qqoOqev6wbquzwXOHecZqP6+vp7mw/cft85xNvCRqqp2HhCM/ho4o67rT7fa+GBVVc+r6/qKQceo6/oXwIeGLdYwmfcVsA1NePpIXdfLJji3PWjC74da74c5wN9VVbVq2HNDVVW9Cdi+735sDnyuqqp/7oW6su1/0bw/lrfqvhz4KPC/y+udgbcDH27fj6qqDq+q6r11XX9qnNOoJ7qHdV3fDNzcdw5d31OTMRv4P8BH67q+s69sF5rg/hd1Xd9bjrcV8E80gVtSi1P4JGkGKOFkl/aHd3jkOYz3A39UntforRi2VV3X3++r+zPgZOC1pd5+wOJ2eCr1/h9wQfmt/WS8FfjH9gfOuq4fBj4EvKtV7+PAe8sHvN6H9Q/SBJS2Z9R1/Q/tEaPyof2faZ5LGeYPgD9vh6ey7xnAbn115wG/Vtf1yX11L2zXLSMdd/SFCeq6vhX4
AnDgOP0ZqKqqbYADB5zjqrqu/xr4nVKnZx/gov5RGOATwBsme/zSh87vq+L1wN9MFJ6Kd9N8oG+/H1bShJ5Dx9lv37quP9fXnxXA/6QJZD3PBb7Xv6JhXdf/AWzd2vRS4JT+IFTu+TMnOIe1+UV1p/fUWngl8IkB4QmakdE/74WncrwHaML889fhmNImyQAlSTPDm4CBvykvH77PpFllDZoPucOmJl1AM0UL4EbgO0Pq/QR4YtfOle8X+ll7elKrfw/TTB/svV4F/A3NKBE0I0J/X+q1nTboWHVd/xTYcZzufKMEm0FuLSM8PdsBpwyp2+7PAzRBaZBJXauW1wEnjlN+fKnTs7qu6zP7Kw24bpMxmfcVNCNYdw+q31ZV1dbA0kF9K0Hm8nF2/+yQ/vQHh1uApw1p40Otn39OM+VxonqDVBOUD9L1PTVZV9d1fcOQsnPqwasr/hR4+jocU9okOYVPkmaGeUN+89xzHvB75b+7DXtGqPxW+qvl51+M094KJvf/mBcALx1n1OopVVXN6o201HW9pKqqa6qq+hfgv+u6vnrAPqsHbOuZPaxgSFs9K1jzQ/Evx7kOj/ySsq7rpZNos6tfB744TvnFrBmgLh6n7tr+QnUy7yvovorfHjTP5Aw03nNTE9y/h1v1bqmq6sGqqj5CM8J0U6tsRevnC6uq+vOqqj4AfKmu69sG1ZtCnd5Ta2G8az9sGfX7aAKdpBYDlCTNDBN9QL8D2KH83Pkb1sv0rJcDL+PRRQkAngIM/ZA7wI7AP9d1PfRD8wCn0jzw/5xJ7NMz0fM3WwKH0ISUmkf/f7kfEy8KMF67vwW8mkcf0ofmA+qla9Hc7Lquh96ruq7rsgjF+jSZ99VkbEXz4X2tlKmFb6UZLd2cR/vZntJIXdfHVFX1eOB1ZbGM1cBXB0xL/dvyDGCU+g/ShK7xfokgaRNlgJKkmWGiD7o7AL1Rkk4fuss0q8/SjIL8eb3m6nQHMLn/x9xJs5LfZALU/wF+t/z3TyaxH4xzPaqqeg7N8zL/1F5Eo5T91SSP09tvNs21WkSzSMaKVtnTaFYMnFIl3K7vZccn876ajPsZf5rlUGU1xs2BT/VPFywr0a2hruvbgeNK+Vzgf1ZVdWNd16f01fsZZbpiVVXbAX9ZVdW367peNE53VldVtUU9zvd3lYVE9p6gnWHWd0CWNIDPQEnSzPBAWV1umBcDvVXYbi6LHjxGVVVzqqr6jfLyfTQren23HZ7W0uVMYiGFsmz2FXVdXwz8qKqq35vk8cb74P9e4N11Xf/XJNsczyHA1+q6zimc9vXTqlnWfZjnAtdP0bGGmcz7ajJ+Cuw5rLCsSDdo+27ADnVd/32XZ636lZUQP8qaz20NqncPzcIlEy2+cQ7NCO14fotm+feptsXEVSStDQOUJM0MX2XIcsRlpOJVwPll0+k0058GeQ6wd/l5K4Z/QJ/UiEpZynm3Mj1qUB9ntX7eiea7oL5Y9v0a8NtVVT2hb7ffYICyyuDt43Rn6aARg9KHSa+WV+wCjA0pG3StHmLiD8CnA28bp/wI4IwJe7ZuJvO+6qwsaLBTWcZ+kLcM2f54hgS2qqr2ohnl7L3+yDjtt5c1/+shfaxppvKN5xzgNWX59WEW0CzOsjZW91ajbCtLtE/5qKakhgFKkmaAsrT4nVVVvb29vXyA/BhwUu95mrr5UtcHq6p6RV/d7WlGZ7JsOpfm+3HadWZXVfXe8rK9FHQXHwE+Xb53p93mQppllnv+qvyhb9tf9m17YlVV7WWre1O0/hQ4Zpx+3Fu+V6q93zbAPwDXMfnzAvg+cGT/xqqq3kzzob+/zSU0X2I8VBkFObeqqj/rC5hVVVV/DPxne1nq9WEy76u1cCzwt+2QU87t/cC3h+xzFXBQVVVrhM+qqubThK72dMJvAx/sW2adqqpeyJojlBdXVdV+//XqHQzc1r+9rYzM/ivN96et8SxYVVWbV1X1UeDL6zCCewrwF+1zKO/x
fwSuXMs2JU3AZ6Akafp7WVVVfz+k7Ja6rj8BUNf1p6uqem1VVZ+hWUThYZpnRU7sn65W1/W/VlX1tqqq/o1mcYjVNKMif9r7Dp+6rv+jqqq3VFX1idLeLJoPnqfSfLCcaOrSGuq6/nlVVUcBHy6LH6ykWS3v8t73DJVAdFb/ym91Xd9TVdVpVVW9q67rY8vmM2mmI/4bzbNAq0v/Pl5WExzmn4E/q6rqsNKHWeXc/5Hmu5SeBExqelhd11dVVbVbVVWfplkcYVb5cyZNIP3jvl1OB44uUxOvrev6uCHt/nvVfB/XJ6uqWk5zT+fSTBec9MjP2pjM+2qS7V5bVdXJwD9XVdV7D84BTqvreuCITV3XK6qq+jjNly8/QHO/Z9E8W/cRmvDcqztW3md/V/7bW3DiF8CHW/XOrKrqZVVV/WOrD1vSBJRhf+/afbqmPHv1ZyUM9t6HNXBs/4IVk1FWo7wAOL6qqvta5/sJ1vzOK0lTqFr7XwxJkrRx6i3MMCx4SJK0tpzCJ0mSJEkdGaAkSZIkqSMDlCRJkiR15DNQkiRJktSRI1CSJEmS1JEBSpIkSZI6MkBJkiRJUkcGKEmSJEnqyAAlSZIkSR0ZoCRJkiSpozmj7oC0vi1evNi1+iVJkvQYCxYsqCa7jyNQkiRJktSRI1CaMebPnz/qLsxYY2NjgPdgJvLez1ze+5nLez9zTad73+vr2nAESpIkSZI6MkBJkiRJUkcGKEmSJEnqyAAlSZIkSR0ZoCRJkiSpIwOUJEmSJHVkgJIkSZKkjgxQkiRJktSRAUqSJEmSOjJASZIkSVJHBihJkiRJ6sgAJUmSJEkdGaAkSZIkqSMDlCRJkiR1ZICSJEmSpI4MUJIkSZLUkQFKkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHc0ZdQekDWWHHXccdRdmrAWj7oBGxns/cy0AFi9aNOpuSNKUcwRKkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JEBSpIkSZI6MkBJkiRJUkcGKEmSJEnqaM6oO6DpKSIeB/w5cHVr868y88xS/g6g6tvtPzPzugFtvYMmzFfA3ZmZrbLdgN8DlgFbAosz8+r+NiRJkqQNwQC1gUTEoZl56qj7MYV2AU7NzLEh5Q91Od+IeA3w7cz8ZXk9PyJemJmXlCqvysxPtOq/mzVDmyRJkrTBOIVPa2tn4LYpaOdxvfAEUALZXgAR8RTg+r76Z0fES6bguJIkSdKkOQK1AUTEq4EDI2JrYElmLirb3wJsBywHNgNOyMxVEXEocDFwYCnbGvhKZt5R9ltjNKtMc9s9M88p+94M7AncmJnfjogdgMNppsFVwNWZ+Z8D+nkYcH5m3tDhtB4P/Gic8rpDGwAPD9i2ovz3WcBl7YLMXBIRvznBsSVJkqT1wgC1AWTmmRGxfV/oeQ3w415YiYidgTcASROm9s3Mz5ayCvgfwMkdDrcNsENmfqa17TDgE5m5urQXEbFzZvaPIJ0KvCoiXgt8f4JnjTYH9o+IJ9GEpata0+4AtisBcTOa0PaNzLxnQDv9z0nBoyOjOwF3jFMuSZIkbVAGqNHZvj3Sk5m3RUQvGOwMfLZVVkfEoJGaQeYC3+q9iIhn04wqrW7V+QbwWuDf2ztmZg30FoH4nYg4mGbRhjVGgYr/Bm7KzP8o9Q+IiN/KzAtK+a+A08uI2hzgfRHxycxc2eEceqFqVunTsHJJ0kZs2bJljI0Ne1RWm6ply5YBeO9noJly7w1Qo7NPRMzu27Zt+e/NmXlvX1nXUZfbM7MdtvYCtomIvfvqTfT80vnAk4DfoG8aHUBm/qjv9TkRcThwQXn99VbZyog4ETgYOKOvqfGm+q2KiGpIiJIkSZI2OAPU6FyYmadsgONUNKvlrZiwJhAROwK/T/Mc0pcz84FJHGvo6FJm3h0R2wwoGi8c3UEzje/2SewjSdpI
zJs3j/nz54+6G9rAeqMP3vuZZzrd+3UZJfNZktHZYh327Q8QW45T9yfAb3ZpNCKOAF4FnJSZJ44XniLijcP6FRFPiIj9+urPAlYN2GfQdZhb/nst8Ny+dp4J/HxYvyRJkqT1yQC14SyPiHmt17f2T6sr4aCLrfpev2ZYxcy8BnhJRGzWOs7jy8p8/U7OzFMy86EOfZhTRqt6bc6jPJuUmb8CntdX/5XAY1b+A+4oC1H02pkPXFPa+TnQf01eMaQdSZIkab1zCt+Gcw7w/ohYkpmnZea3IuKNEfFimulyc3ns80HDXBAR76dZlnwuE3+x7DHAkRFxH80o0cOZ+cX+SpN81iiBt0dEb5/NgBNb5d+JiCNpzm0W8LPMvBkgIo7qfTluuQ7vKisNzgIe6Jva+K2I+ABwP81y7uf4TJQkSZJGpaprP4tq07Z48eIaYMHChaPuiiTNKIsXLZoWz0Joak2n52A0tabTve/1dcGCBZNe3dkpfJIkSZLUkQFKkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JEBSpIkSZI6MkBJkiRJUkdzRt0BaUO5a+nSUXdhxhobGwNg/vz5I+6JNjTv/czVu/eStKlxBEqSJEmSOjJASZIkSVJHBihJkiRJ6sgAJUmSJEkdGaAkSZIkqSMDlCRJkiR1ZICSJEmSpI4MUJIkSZLUkV+kK21Edthxx1F3Yb1YMOoOaGS89zPXAmDxokWj7oYkTTlHoCRJkiSpIwOUJEmSJHVkgJIkSZKkjgxQkiRJktSRAUqSJEmSOjJASZIkSVJHBihJkiRJ6sgAJUmSJEkdGaAkSZIkqSMDlCRJkiR1ZICSJEmSpI7mjOKgEfEZ4KOZ+csh5U8D/iQzj1oPxz40M09d2/JWvc2Bw4FVQAXMBb6XmUsm0850FRFvBralOf/tgS9n5s2lbGvg3cC9wObALZn5zSHtHAw8DVhR2vtOZl7TKp8DHAZsAfxqWDuSJEnShjCSAAVcDBwMHD+k/OXAJVNxoIg4AFiSmb+YivZa/gA4PjOXtY717oj4VWbeP8XH2qhExEuBazPzsvK6Aj4IfLxUeQfw6cx8qJS/LiKemZk/7WtnD2B1Zn6mte0DEbEkMx8um/4ncEJm3rp+z0qSJEma2Kim8D0MzC4fvNcQEbOBGli9wXvVURkhW9IOT8UpNMFwU/eUXngCyMwauD4ito2IbWlGnB5q1T8DeNmAdl6Ymd/t23YO8Cx4ZHTqG4YnSZIkbSxGNQIFcDbNh+qz+7YfBHwP2L+3ISJ2BN4GLKcJV/f1pseVEaZfAgtL+WbAhZl5eUTsD7wWuC0irsvMb5R9dgF+H3iAR6fe3dDuRES8MzOP69v2OuAi4KXAl/tPKDOXR8QVrfpDjxMRLwKeTzMFbhbwX5l5QSk7FLgZ2BO4MTO/HRE70EyLu48mXF4DPDEzv1L22R94DrAM2AY4NTOXlrK382hYfhzw+cy8q+/cnkRzP76WmQ/2n1ufQe+bLcq5PBu4ol2QmXVEDGqziog5mbmytW1LYGn5+SmZ+e0J+iJJkiRtMCNbRKI8K7T7gKLdMvPnfdv+CPhkZn4mMz8LjEXEm1rlbwY+l5knZOYxNMGEzDyXZvTj1F54Kl6fmZ/KzBNLey8d0I8bIuLpfdt2Lc9tVa0pZv3ndd1Ex4mIzYBnZ+bnM/P4zDwW2D0ievdjG2CHcr69APFG4Oiy7XM0gexxpb1nANtm5qcz80Tg08AhpWwBsDgzjyuB8F96ZX39vhk4Ezg8It5RnmMa5ocREb0XEbFTOf4DNOFu0PuqHrDtTOAdvZHIiNiKZlSqdw1XRcRvRsThEfG2iFg4Tp8kSZKk9W6UI1AAt0fETpl5B0BE7Eoz8vKIiNgb+EFmruhty8yry4hLz/mZuar1egXDzeaxo16D6p8DHAH8d+nHdkBv1GbVgPqTOc5q4Et9ZTcC
uwK/oAlH3+rftx3aMvN7EfHk8vIlmXlKq2xVRNxepkOuphmV65WtiIgc1OHMvBs4pkzDe2vZ/0tle7veDRHxgog4Hvh/wB7AkaX4auD1wLW9+iUY7gN8of94EXEV8MWIuBx4AfD+VpX9aEYTTy7tPDciXpuZZwzq/0TGxsbWZrcNasGoOyBJU2jZsmXT4t9eTa1ly5onHLz3M89MufejXsb8LOB3Wq970/fa9gQuH7Bve6GGa/rKxjuvVZl5/UT1M3M1PLJAAsCrgMlMJxt6nBL2toyIIyLinRFxBPBKHg20t7fDUhnhuW3AMXrT4p5a2nrkD83Kdttm5g+AF0fEoRHx3HL8pQPaekRm3gucD2xX2llDRBwI3JGZ78jM/wu8FzgyImZl5nJgizIlsDfa9l7gsgHtPB/YOTPfkpn/SLOq4VvLSBTA5Zl5ZatfVwI7jtd3SZIkaX0a6QhUGQ3ZDB4NKn0jST2DRog2G7Btqv2QZtrducB2JVgA1AOe3QFg0GpzA+psQ/M802fKtDciYr9xdlnN+Itq3NQegerXel7suRHxAeCsAeGu17cX0oz8XJmZfzekyadl5gmt9u+LiK8BB9CMup0MvLmMZFU0z4vtP6Cd+X3trChL3L+BZoRuUGjsMvo30Pz589d2V0nSWpg3b57/9s5AvdEH7/3MM53u/bqMko16Ch/A+RHxEpppaz8YUH4NzdSui/q2r/cAlZlLIuKAiFhCM7Wu50KaYLXGFL2I2AJ4HjBugKIJE5/vhadi7jj9WBoRjx+nvS2GFUTEvN5qgWUE58qIOBK4vq/erjQLXlycmf86Qf8HBdq7gK3LcWpaUxQj4teA/ufaBraTmQ+1ngV7zCqNNN8rJUmSJI3EqKfw9RZd2AN4ambeNKD8J8D+5XkcACLiWTz6PNJE7qf5gta1dT/NAg6PTC0sC2DsPWChhbcC/ctyD3IXsFvvRQleb2D8ULh5b7Su7PMioBeqFkXEW9qVI+KZ5cffLe33ts9i8GjWLzPz3zLzRx36v3kZRWt7HfCYfSNiS+DlmXnhgHaW96b6ter/DnBpeflwRDyhVbYdG0folyRJ0gy1sXwYvZvxF344AXhfRPSWA7+3t7BAB1cAH46IfcoKdZN1FvBX7UUsis8C74yIh2kCyeY0q93dN1GDmXl++dLdF9GMsswBPgNsP85uXwc+UK7BauAm4NbS3o0R8YQysrSMJoj1nhv7GnBERKwox9mevsUcShuDVskb5lSa1fMeplldb0vgot5iIAAR8eayvQZObG3fpxzvwsz897K63mY0U/O2AH6amdeUOl8tz27NK7tvARwziX5KkiRJU6qq68l8bp55ymjXzpn5w1H3pa08M/bGzDxt1H3Z2C1evLiG6TEfd4cdXSND0qZj8aJF0+LfXk2t6fQcjKbWdLr3vb4uWLBg0CMj4xr5FL5p4GXAf466ExGxT2tFQGi+OLjLdDtJkiRJU2RjmcK30SlLfu8P/Nckp7etLzcB74+I3vLtN5Qvv5UkSZK0gRighuitWDfqfvRk5q+AiVbHkyRJkrQeOYVPkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JEBSpIkSZI6mjPqDkh61F1Ll466C+vF2NgYAPPnzx9xT7Shee9nrt69l6RNjSNQkiRJktSRAUqSJEmSOjJASZIkSVJHBihJkiRJ6sgAJUmSJEkdGaAkSZIkqSMDlCRJkiR15PdASZKk9WLheQvhvFH3QiPjvZ+5JnHvlx41/b4D0xEoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JEBSpIkSZI6MkBJkiRJUkcGKEmSJEnqyAAlSZIkSR0ZoCRJkiSpIwOUJEmSJHU0Z112johDM/PUqerMhhIRuwCHAA8AmwHzgLMz8/IpPs5bgO9n5h1T2W7fMT4MLB1Q
dElmXh4RR2XmJ0rdjf5+RcQs4I8y81Oj7oskSZLUb50C1GRExAHAksz8xYY6ZuvY/cHhUODozKxbdY6MiJsz8/apOm5mfmmq2hrHLeOFol542pi1709mrgYMT5IkSdoozbgpfBGxPXB9OzwVxwIHj6BLkiRJkqaJKRuBKiNMvwQWAstppsZdWKaR7Q+8FrgtIq7LzG+UfV4DPBlYBmwNHJ+ZyyPiUOAqYD/gtsz8atn2K+DpwMPAFpl5TOv4+wPPKW1tA5wKPAQcBrwoIrYGzip9fHp//zNzZUScXtqaC7wTWAmsohnlOauUHUoTPOcClwC/kZnH9l2LtwFfAN7cG1mJiHnAkcCK0q+7MvO0UjYLeHu5Zg8BD07V6FXf6NtWEfEemus3D7gxM7/ZqvsWYDsevX8nZOaqcs43A3uWfb494DiHAdsDDwLbAidl5p2l7IjSXkVz3e7KzFMi4gnA63n0/nwlM+9u9zkiDgL2Au4v+349M2+eimsjSZIkTdZUT+F7M/DRzFwFj3xwvjwzz42IitYUvoh4MU04+lZ5PQ94C3B8aWvvzPxkq+1n0gSZY0v9J0fEgsxcHBHPALbNzE+XstnAuzPzs8DnIuL+9jS3iLgiIt5H84H9kSl7mXl3+fEI4LjMfLDUPyAi9srMq4B9gE9n5rWlbK+I2DIzl7f6ulkJHv3X5rOZuazs94KIeFFmXkQzpfDrmbm0lD07Ig7MzLMj4pXAE/qu8wO98DVJrwQiMx8ux3lDRDwxM39ZwuyPM/OGUrYz8AYgaQLpDpn5mUGNRsSbgEsz8+ryeg7wv4G/KVUWAkdm5n2l/EUR8RuZeSkD7k+r3Z2B7TPzX1rb/hD43FqcO2NjY2uzm6bAsmXLAO/BTOS9n7l6916SxjMd//8w1VP4zu+Fp2LFOHV/PTMv7L0oweKB8nI28L2++rMz8+xW/Z8Du5SXL+kFsVK2Cri9BKnHyMxzgOOAA8qzT4f06kbEdsDNvfDUqr93efmLXngqvgu8qvciIl4IDHonPNwLT6XNy4C5JViu6oWnUnY1sHP5+buZeVJ8DmdTAAAgAElEQVTfn3Z4elJE/GHfn98bdN7Amb3wVHwTWFB+3r4Xnspxb+PR98dc4FsMt0MvPJV9VwKLIuK5ZdPZvfBUyi8Cnj1Oez0HAf/et+3qQRUlSZKkDWGqR6Cu6Xs9XkDbq4xQte1a/rsqM2+doO12+08d0NZONFPJ7hp08DJi1JtC90TggxHxKWAP4FkR8bi+XXrhZ43pY5l5e5mK1vOi/pGaiNgJuG1AH35YRlmeGxGbDTm3idw8iZX1Huo7/qqI6D0Lts+AwLlt+e/tfcFr3HaLS2im513J4CC9asC2x+gL5GTmD7vsN8j8+fPXdleto95vl7wHM4/3fuaajr9VlrThjer/D+vyb9QGW4VvgKsy85Qpauumrm1FxLaZeW97W5nCdjTNlLUbgW9m5vWTOP5tZWn0O4HVA8rr8meQCliUmf8xpL+vAp7Yt/mBzPzyJPo3nl5Qu3Ad7seggNQfCPsNHB3sM+haSpIkSSMzygC1xYja2i8iruxfTr0sIrElcC3N1LHJBKizaJ7fuhlY1F+YmXeWkaY1RMSzaUbWnjKs4d7iFetDGXHqhZR1uR9zB2x7Ps1CIMN0GWGrImJWWdocaK5Ze7qgJEmStCFtyGXM7+fRKWEAP4mIhe0KEfHMtWx7UVlBblhb7fP8Ac3KfGsoi1rcmJn3ALtExLatsi0jYrdhB8/MB2jObY/MXDKk2uZloYxem7sDO5fl1FdGxJNaZbNK+VQ7uDxz1fMaHg18t0bE3u3Kk7gfd7f7W6YjvrQsugFwYERs3irfh2ZqX8+w9+F/AL/ft+0lHfskSZIkTbkNOQJ1BfDhiNgnM0/MzAsj4qCI+AOapa+3AM5Zm4Yz88aIeEJEHEnzrNJmwOWtKneU1dsuyMwrIuIrEfEhmuW6VwKbAz/LzNNL/c8D74iIh2hGaGYD
J0/QjauBp45T/mXgj0qby4F7MzNL2ReAw8uI0Cqaa7E+voT3QuDIiOgtY35D71mzzPxWRLyxBMkVNKNKZ3RpNDO/FhFvLYF4Vmn7860qP6a5nqtKu0v7ntt6OCLeTTOVcY2FLCLiroj4E+C+0vb31+7UJUmSpHVX1fWwR3OkqdH3XVQb3OLFi2vwIfZRciGBmct7P3ONjY2x8LyFE1eUNKMtPWrpxJXWg97/nxYsWFBNUPUxNuQUPkmSJEma1gxQkiRJktSRAUrr3Sin70mSJElTyQAlSZIkSR0ZoCRJkiSpIwOUJEmSJHVkgJIkSZKkjgxQkiRJktSRAUqSJEmSOjJASZIkSVJHBihJkiRJ6mjOqDsgSZI2TYv2W8T8+fNH3Q1tYGNjYwDe+xloptx7R6AkSZIkqSMDlCRJkiR1ZICSJEmSpI4MUJIkSZLUkQFKkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHflFupIkab1YeN5COG/UvdDIeO9nrta9X3rU0tH1Yz1xBEqSJEmSOjJASZIkSVJHBihJkiRJ6sgAJUmSJEkdGaAkSZIkqSMDlCRJkiR1ZICSJEmSpI4MUJIkSZLUkQFKkiRJkjoyQEmSJElSRwYoSZIkSepozigPHhH7Anu2Nr0Y+HH5+eHMPHk9HPPQzDy1/FwBhwKbl+ItgB9n5lgpPwBYkpm/mOp+bAwi4oU01/whYDPggsy8LCJ2AN4wYJd9gD/OzPsj4gnAW4F7gC2Bn2TmOUOO8wHgksw8r2/7k4FXDNjlxZn5zgHtvBR4fmZ+ous5SpIkSVNppAEqM88Hzu+9jogHe+Gmq3YgWgtHAGdm5u2t9g6JiNs21dDUExHbAntl5qda294eEVdl5l3AcQP2eSgz7y8vDwM+npl1KXtXRFyemXf37fMc4G4GyMyf9x+nhNrlA469JbA7sHQSpylJkiRNqRk7ha98IK/b4QkgM78KHDiaXm1QLwPO6Nt2MbDHoMoRsQdwbfl5T+BHvfBUfAl4Td8+s4ADgHMm2a8fDNh+GPDlSbQjSZIkTbmRjkB1FRGHAdsDDwLbAidl5p0R8WrgwIjYmmaq3aKI2Ibmw/aDZffVmXnSgGZfDPxwyCEvbP28VUS8h2ZUZDPgwsy8vPRrT+ClwEqaMHpTZn6/lB0KXAXsB9yWmV+NiC2A99BMmVsJ3AJslZlfKfs8r7S3DJgHnJGZN5Wy3y/XoAZ2AE7LzBv7rtM2QABfy8x7hl3P4oy+AEQ55rD99snML5Sfn0Nf+MrMB0pgajsEOI1mil9Xu2Xm2e0NEfFc4PrMXB4Rk2hKkiRJmlobfYCKiDcBl2bm1eX1HOB/A3+TmWdGxPZ9U/h+HzgmM1eV+jtHxO9k5nf6mn4CcB4DZOZ1rZdvBj7aau8I4PJS9tuZeUyrrwdHxONbo1p7Z+Yn26cDHJuZ95X6LwBeX37enub5nk+22nsP8OmI2B24NTNPa5W9D2i3TWbeFxFfAg6JiHnNprxjyDmuEZ5KCH1RX397ZXNZc1rdagaPXj7SZkQ8iSa83hoRTxvUhwHH2RW4uW/bLODAzPzXLm2MZ2xsbF2b0FpatmwZ4D2Yibz3M1fv3kua2TbFf/+nwxS+HXrhCSAzVwKLyqjEIF/phZ1S/zaaEZt+szPz4Q7HP7/dHrCi9XP/s1c/4dEpcLOB7w045n2tvl0G/Ly8PHhAe5dFxFNogslmfWVfHNTZzFxeRtyOB14VER8owWSgiJgfEX8GnAicPqTa7wDfbb2+BHhJXztbA3u3Nr0JyGHHHeJA4Oy+bW/FqXuSJEnaSGz0I1A00936XUIzcnNlf0FmLouIVwK7AatowsegKWSrI2JWZq6e4PjX9L1uh87VEfEWmqlvK2mC2mWlbFVm3tq37yoeqzfVcGfgsL4pavOAOzPzuoh4TkQcTvMc0sWZOe5iCpn5UEScS7Oa3rNopgsOqjcGjEXEbOAPImLQqNW2fcHvpoh4bURc
nJn3RsRWwLt75x4RrwK+O2CK4FDl+HV7n7JK38oB13GtzJ8/fyqa0Vro/fbJezDzeO9nrk3xt86SJm9j/fd/Xf6Nmg4BasWAbf2jMY+IiHfSjBp9t7Xt0AFVbwOeSN+UsVL/mZn50/E6VaaWvR84LjPvLNt2o1kpbphBAarnzsw8ZVhhZp5ejrE7TdC5NDMvGtK3PWiWB78ROHpQkImIzdojcJm5KiI+R/P82MmtenszIKgCxwCHlue6VgLH8uiS5POBx0fEb5bXOwE7RMRTMvNLQ05xIbCob9tBNCH1iNa23yrTOM+YKERKkiRJU206BKi5A7Y9n2aBhkEezMz+UaMtBtS7gGYUa9BUuBcB4wYoYC/gO73wNE5f28abMllHxNzMfLC/ICLmZeYygMxcAiyJiCOBi/rqbQ28A7iuw3clHU7fEuKZuToiVvbVmz/o+7gycwVwQuvYv035Dq/M/Ghfv55GszjEwGfOiif0jzRl5vH9lSJi5TosWy9JkiStk+nwDNTdZdQFaEZOgJdmZi9ALS8LJvS0fyYi3ghs3d9oZj4AzC1fCNuu/3vAuV36BTyptV9F89zP0NExYFVZKa+3z6/x6BcJf4sm/LT78szy40ERsWNfW9WA9h/IzH9rj76N47KIWONLbCNif+DS1uutgPv7d+wXETsBT13b786KiKfSjJZJkiRJG7WNfgQqM78WEW+NiIU0gW8e8PlWlXOA90fEkrJK3VVldOZBYHOakaZhoeYE4G0llK0s9S4pX/A6Ub9+HhH7limDq2lGuU4CXjjObv8OvCciVpTj3UUZScvMeyLi7LLy3gPlXG+kGQk7HTi8TBucRbOU+1kD+tT5maPMvDQiDoqI9/Loc2Y3Zea1rWoHDzoOPBJk30qzWMZyhixqMWTfo/pGyA4AvjCkuiRJkrTRqOq682durQcRcUj58l6tJ4sXL65h432IcSZwIYGZy3s/c42NjbHwvIWj7oakEVt61Mb5yHrv/08LFiwYNKtrXNNhCt8moywZvnnr9fOY+FkrSZIkSRuJjX4K3ybmeuDIiHiAZnn1OzPzmyPukyRJkqSODFAbUPkupX8bdT8kSZIkrR2n8EmSJElSRwYoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JEBSpIkSZI68ot0JUnSerFov0XMnz9/1N3QBjY2NgbgvZ+BZsq9dwRKkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JFfpCtJktaLhecthPNG3QuNZ+lRS0fdBWnacQRKkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JEBSpIkSZI6MkBJkiRJUkcGKEmSJEnqaM6oO9BFROwCHAI8AGwGzAPOzszLR9qxASLiqMz8xBS08xngo5n5yyHlTwP+JDOPWsfjvBf4TGauXpd2xmn/Q8CnMvPBAWUfyMx/XR/HlSRJktaHaRGggEOBozOz7m2IiCMj4mZgL2BJZv5iZL1rmYrwVFwMHAwcP6T85cAl63qQzPzUurYxgZOBtwGfbW+MiNcC31nPx5YkSZKm1EY/hS8itgeub4en4liagLGpehiYHRFVf0FEzAZqYL2MGk2lzLyd5jwe19sWEZsBu2fmdaPrmSRJkjR502EE6n7g6f0bM3NlRNwIvA64LSKuy8xvRMShwFXAfsBtmfnViJgLvBNYCawCbsnMswAiYk/gpaVsFnBTZn6/lB0KfK8cYwWwNXARcB+wb9lnO+CszLyht09mntravwI2pwk8t2fmt3rnEBGHA9sCD5V+PZiZX2yd5tnAy8p/2w4q/dq/1daONCM9y8ux7svMUyNiFnBEZp7QbiAi3pWZx/b1d7zr9JxyTR8GtgR+lpln9N+XIU4A3gf8Q3n9P4BHzrP0
8e000zMfKtfhS6XsyTTXfzmwRTmvUzoeV5IkSZpSG32AKkHpioh4H/CVMqLRKzu3jND0T+HbOzM/2Xp9BHBc7zmciDggIvbKzKuA387MY3oVI+LgiHh86zivzszjWuXvAZa1A0lEvBO4YUD3Xwt8IDNvKfV+NyK2y8x7ImIf4L8y89JSth3wKVrBIjOXRMSBPDZA7ZaZ34mI9rY/Av4xM1eU9p4dEW/KzK9ExBr3OSK2oAmE/ca7Tvtk5udabRwSETtn5m0R8UaacNl2a2Z+p5zHsoj4VUTsDtwBzMvMW1t1DwW+nplLW30/MDPPBg5qTzOMiP0iYn5mjg3ovyRJkrRebfQBCiAzz4mIC4FXR8ROwFLga5m5akD12cB3ey9KMLm5vYhBae9NNCNVp/bt/xNgD+D20tZ5feUPA1/v27Z8SNev74Wn4irg12meXdozM09q9emeiDh3QBu3R8ROmXlHOZ9dgZvbFSJib+AHvfBU2rs6InojVBdFxG9l5gXl9cG0rlFpY6LrNLuvX9+kvH8y82tDzr/tVOAvgXuBdhCrgFW98NTq+3PLy/5ppucD23c43mOMjZm5RmXZsmWA92Am8t7PXL17r43b+vi76d/7mWum3PtpEaAAMnM5cBpARDwR+GBEDFoAYVXf6MYewLPaz+AUvX/ZV0fEW2hW9lsJ7ABc1mrr+v79MvOevm2PeU6puGbAMXfttT2g/mNWqgPOolmBsDdt7aDWzz17lnr97gfIzMvLanu9ALVr3zWCia/TuRHxLuBXwLmZeS/NdLtOMnNVRFwK7JKZ97eKHg88tzwX1dYLTmdFxB/QBNofliB5V9fjSpIkSVNpow9QEbFt+bD+iMz8ZUQcDbwB6A8C/SrgmwOCUO/Zm/fTTFu7s2zbDdh9Sjo/vkEB6jEyc0UvXPQWlBgy8jZoSl47lDwcEZsD2wB3Dqg79DqVY14HXBcR2wCvKFMRjy/9OqS02/arzDyzb9sVwG4DjrsoM/9jyHFvAY4pz2ctiIhn0CyLPukFNObPnz/ZXTRFer+J8h7MPN77mWtT/w30pmJ9/N307/3MNZ3u/br8G7XRByhgv4i4sn+Z8vJs1JYd9r+WZtRmUDDYC/hOLzwVc9e+q5MymRUQz4+Il9D07QcDyq8BXkCzwEVbO0B9j2bq3g7AoCl3Q69TWfVvdmauyMz7gK9FxIER8czM/GlmfnUS59LvNuApwwojYl5mLitTC8+KiF8DFgLfX4djSpIkSWtlo1/GnCYwHNa/MSJeDNxIM01t22E7l+l2u0TEI3UiYssy0nQ38KTW9gp4E2sGj/Xl2oh4JJ73RlgGVSyjP3sAT83MmwaU/wTYvwSdXnvPojXVLTNvBJ4BbF9CUH8b412nWTTTCNu2YvizX52V5elXRkT7PswqC04AvLlvl21ovlBZkiRJ2uA2+hGozFweEV+JiA/RfGBfSbMs+M8y8/Qyve3DEbFPZp44pJnPA++IiIdovjtpNnByGcXat6yit5pmmeyTgBeu59MiMy+JiEMjYl+aZ4kGLVjRdjeDp+n1nAC8LyIeoBmpujczT+6rcw/jT3kc7zrdUK5TXdq/NTN/Pk5bk/EF4PASAFfR3IcvlbJL+u7Pit7UQUmSJGlDq+q6//tpNSoRccg6TofTAIsXL65heszH3VRNpznRmlre+5lrbGyMhectHHU3NIGlRy2duNIk+fd+5ppO977X1wULFgxbDG6o6TCFb5MUEbuWL4ntvZ5HM8oiSZIkaSO10U/h21Rl5i0R8bYyBXElzXS5k0bbK0mSJEnjMUCN0DjPbEmSJEnaCDmFT5IkSZI6MkBJkiRJUkcGKEmSJEnqyAAlSZIkSR0ZoCRJkiSpIwOUJEmSJHVkgJIkSZKkjgxQkiRJktSRX6QrSZLWi0X7LWL+/Pmj7oYkTSlHoCRJkiSpIwOUJEmSJHVkgJIkSZKkjgxQkiRJktSRAUqSJEmSOjJASZIkSVJHBihJkiRJ6sgA
JUmSJEkd+UW6kiRpvVh43kI4b9S9EMDSo5aOugvSJsMRKEmSJEnqyAAlSZIkSR0ZoCRJkiSpIwOUJEmSJHVkgJIkSZKkjgxQkiRJktSRAUqSJEmSOjJASZIkSVJHBihJkiRJ6sgAJUmSJEkdGaAkSZIkqSMDlCRJkiR1NGfUHdiYRMQuwCHAA8BmwDzg7My8fKQdGyAijsrMT6xjG/sCe7Y2vRj4cfn5YeCnwAOZeeWAfZ8BPD0zF43T/m7AyzPzxAFlLwPu2hivrSRJkjSMAWpNhwJHZ2bd2xARR0bEzcBewJLM/MXIeteyruGptHE+cH7vdUQ8mJmntl5XwLuAxwQo4ADgpAna/0VEbB8R22Tmfa12ZwEvyMyj1+0MJEmSpA3LKXxFRGwPXN8OT8WxwMEj6NLIDbgWbXMyc1WHZk4A3tG37RDg39e6Y5IkSdKIOAL1qPuBp/dvzMyVEXEj8Drgtoi4LjO/ERGHAlcB+wG3ZeZXI2Iu8E5gJbAKuCUzzwKIiD2Bl5ayWcBNmfn9UnYo8L1yjBXA1sBFwH3AvmWf7YCzMvOG3j690aKyfwVsDtTA7Zn5rd45RMThwLbAQ6VfD2bmFztelysj4rntaXxl+t71XXbOzHsiYllEPCkzb46IecAumXlTq73nlWuzjGba5Bm98oj4fWD7cl47AKdl5o0d+y5JkiRNKQNUUYLSFRHxPuArmXl7q+zcMp2tfwrf3pn5ydbrI4DjMvNBgIg4ICL2ysyrgN/OzGN6FSPi4Ih4fOs4r87M41rl7wGWZeYJrW3vBG4Y0P3XAh/IzFtKvd+NiO1KeNkH+K/MvLSUbQd8CugUoDLzxxHRP41vf+Dk0t7uNCGy3/cy85fl55OAPwU+Cry9t2/Zf3vg+e3rWM7906XtWzPztFbZ+4D2Ne9sbGxsbXbTFFi2bBngPZiJvPczV+/ea+OwIf8O+vd+5pop994A1ZKZ50TEhcCrI2InYCnwtSFT1WYD3+29KMHk5l54arX3JpqRqlP79v8JsAdwe2nrvL7yh4Gv921bPqTr1/fCU3EV8OvAJcCemXlSq0/3RMS5Q9rp6pHpe5m5BFgyXuXMXBERP42IVwArM/OuVvHBPPbaXBYRT6EZddqsr6zryJkkSZI05QxQfTJzOXAaQEQ8EfhgRHxqQNVVmXlr6/UewLMi4nF99Xq/glsdEW+hmaK2kmY62mWttvqnxC3LzHv6tlVDun3NgGPu2mt7QP0HB2wbz5UR8bzMvKKMCnWavtcnaa7rW/u27wwcFhHtbfOAOzPzuoh4TpmCeC1wcWYuXYtjAzB//vy13VXrqPebKO/BzOO9n7k29d9ATzcb8u+gf+9nrul079fl3ygDVBER22bmve1tmfnLiDgaeANw6+A9H1EB3xwQhHqrzr2fZnrfnWXbbsDuU9L58XVZ6GFcmXlhmT54Bc2zSu0peL9OM6Wv37fbo2KZWUfENzPzob56d2bmKeMc+/RynN2BP4iISzPzonU4HUmSJGmtGaAetV9EXNm/THl5NmrLDvtfCxzE4NGZvYDv9MJTMXftuzopU73S4uz2lMYSGNdmRKqnjoi57amPPRExLzOXleMsAZZExJE0C2xIkiRJG5zLmD/qB8Bh/Rsj4sXAjTSr9G07bOcy3W6XiHikTkRsWUaa7gae1NpeAW/isc/3rA/XRsQj46hlpcAFa9HOFRHxRpov151K36JvmfOIeGb58aCI2LGv/rBpjJIkSdJ65whUkZnLI+IrEfEhmsUaVtIsC/6zzDw9IjYDPhwR+2TmiUOa+Tzwjoh4CFhNszjEyWUUa98yDW41sAXNynQvXM+nRWZeEhGHRsS+NMuYD1qwoks7F0fEnwIxYeXJtXtPRJxdVt57gCbU30gT1E4HDi9TIGfRBNizpvL4kiRJ0mRUdT3ed6VqUxQRh2TmV0fdjw1l8eLFNUyPBxo3VdPpoVJNLe/9zDU2NsbC8xaOuhsqlh611mswTZp/72eu
6XTve31dsGDBpGc3OYVvExcRu0bEk1uv59GMgkmSJEmaJKfwbeIy85aIeFuZgriSZgrfSaPtlSRJkjQ9GaBmgHGe2ZIkSZI0CU7hkyRJkqSODFCSJEmS1JEBSpIkSZI6MkBJkiRJUkcGKEmSJEnqyAAlSZIkSR0ZoCRJkiSpIwOUJEmSJHXkF+lKkqT1YtF+i5g/f/6ouyFJU8oRKEmSJEnqyAAlSZIkSR0ZoCRJkiSpIwOUJEmSJHVkgJIkSZKkjgxQkiRJktSRAUqSJEmSOjJASZIkSVJHfpGuJElaLxaetxDOG3UvRmPpUUtH3QVJ64kjUJIkSZLUkQFKkiRJkjoyQEmSJElSRwYoSZIkSerIACVJkiRJHRmgJEmSJKkjA5QkSZIkdWSAkiRJkqSODFCSJEmS1JEBSpIkSZI6MkBJkiRJUkdzRt2BUYuIDwNL/3979xplZ1XnefxbJCEQLuEiUZQR7VZREFujDTTYDZiIouBlxD+IEVFRGh1plz2r16w1L2bWmhezpteM3aN4QWgEAcW/2Cg0QmuCCGEExOKmIBcRBFQChktIQRKSmhdnH3h4cqrqqapz6lB1vp+1slJn7+fZzz5n10nqV3s/+1SKFgB3ZuaqiFiRmef1oU8vBo4F1pf+LAKuyMybunyd44EfZeYj3Wy30v4ngYsz86EOdZ8D/m9mjvbi2pIkSVIvDHyAAn5fD0kR8e6I2KdfHQJWAF+ohouIOCUiHszMh7t1kcz8VrfaGsM3gc8B/6taGBFvBm42PEmSJGm2cQlfB5l5KXBAP64dEbsAd3UIF2cA7+pDl6YsMzcA90bEa2tVyzPzJ/3okyRJkjQdzkCNbRSgzES9DXgK2BG4oL3kLSL2Bj4IPEkrjD6QmReXuhXAj0v9emA74N8z855SvyvwUWAEGAJuy8yrS1uvrHcmM5+JiB+Uc7cDTgKeATbTmkW7tHLdbcr1bgDenJlnVNuKiI/Rmh36UHv2LSIWAacAG4ENwKOZ+d1Stw3wcVrLCTcAT09i9iqB/wb899LW0cBltf4cDywur/EC4KzM3BwRi8trtJ7W9+p84CvOXEmSJKlfDFAdRMR8WiFkHnBIZn61lA8BJwDnRMS2wIeB/9n+gT4iDo+IwyuzK+/LzNMq7Z4I3FMefgT4YmZuKXUREUsyc01E3BwRn6UV1p5dspeZj5UvTwTOzMyny7mHRcR+mfkr4EDgy5n561K3X0Rsn5lPVZ7ighJQqk/7Q8BXM3OknPemiDggM6+ntaTwXzNzbanbNyLelplXRMQ7gZfUXsL17fCVmaMRsToiDgWuAV6fmZdUXpOjgWsrwXIJ8AFawet95blsLnV7A0cDF281aA0MDw9P5TR1wcjICOAYDCLHfnC1x35QDfL3vO/7wTUoY+8Svpoy2/I54FJgCXBhu64EpU3l4TuAc6qzISU4tZerzQOuqDW/sVxjX+CadngqLgL+urRzJXAmcFi59+nYiJhXzl0MPNgOT5Xj9y8PH2iHp+Jy4N2V5/cWoNN39aZ2eCpt3ghsV0Lj5nZ4KnW3ldeGzLw8M8+u/fluteHMXAm8HfgYUN+UY5d2eCrHruG578t5tXbuA1Z36LskSZI0I5yBgj3LzBC0ltItAM7PzD+VTRueqB3f/uF+j8x8sEN7T5a/N2fmXWOcux+wU0TsX6tf0/6izBi1l9DtCfx9RJwG7AO8NiJ2r53bDj/P61NmPhwR1RmiAzLzK9VjIuJF1WtXzr2qzAi9ISIWjPFcmroQ+GBm3l8rP7AdDit2rpxzUkQ8Cvy/zHygGuQma+nSpVM9VdPU/k2UYzB4HPvBNdd/Az2RQf6e930/uGbT2E/n3ygDFPxhiluVbxyjvB40OhkCzsvMrdqIiJ3roS0z/xARX6C1tO1e4Psdwtl41pSt0f8EbOlQP1r+jNXXlZn5406VEfFuYM9a8frM/Ha1IDNviojXd2jiusw8t1Pb5XU4vSyp/JuIOI7Wkr6nOh0v
Sf3+cLQAABnWSURBVJIk9ZoBauoejYiXZOYfa+VNAtStwF/Suieo7q0RcUtmPlAtLJtIbA/8GjgCmEyAuhQ4ntbs1Mp6ZZltW1IvL0sNbwdePlbD7c0rpmHhWBURsSgzRzLzGeCKiLieVoic8c/mkiRJksB7oKbjx7RCybPKRgm/7nz4czLzduDg6rK4iNij7Mz3E1obTDxPRBwE3JuZjwMvjoidK3XbR8Re41xvPa1lcftk5t1jHLZt2Ymv3eargCXlHq9nIuJllbptSn03PFRfyhgRrylffqh27CLgaSRJkqQ+cQZqijJzY0RcFBF/R+v+o+2B+zPzooZNnA6cEhHraC2f25SZ5wNExAUR8Z9pbev9DLAt8LvM/EE59+vAJyJiA60lefOAcya43m3A3uPUfxv4dGnzKeCJzMxS903go+Vepc20Zo268iG8mXlJRBxTAuJGWtuvt3fZuyIiPkVr445ty3VP69ySJEmS1HtDo6N+pI7mtlWrVo3C7Lihca6aTTeVqrsc+8E1PDzM8tXL+92Nvll76pT3PJr1fN8Prtk09u2+Llu2bGiy57qET5IkSZIaMkBJkiRJUkMGKEmSJElqyAAlSZIkSQ0ZoCRJkiSpIQOUJEmSJDVkgJIkSZKkhgxQkiRJktSQAUqSJEmSGjJASZIkSVJDBihJkiRJamh+vzsgSZLmppVvXcnSpUv73Q1J6ipnoCRJkiSpIQOUJEmSJDVkgJIkSZKkhgxQkiRJktSQAUqSJEmSGjJASZIkSVJDBihJkiRJasgAJUmSJEkN+UG6kiSpJ5avXg6r+92L3lp76tp+d0HSDHMGSpIkSZIaMkBJkiRJUkMGKEmSJElqyAAlSZIkSQ0ZoCRJkiSpIQOUJEmSJDVkgJIkSZKkhgxQkiRJktSQAUqSJEmSGjJASZIkSVJDBihJkiRJasgAVRERX4mIPcepf0VEfLFBO4dFxF5j1L0oIo5v2J8V0z0uIk5t0kYvRMRbIuLIMeqOG++1liRJkl6IDFDP93PgXePUvx24YbKNVgNOZj6Smd+aQt+mJDMnDHw9vPYNwNKImFctj4gdgF0z8w/96ZkkSZI0NQao59sEzIuIoXpFCQGjwJYZ79Xs9m3gw7WyjwLn9aEvkiRJ0rTM73cHXoCuAA4vf1cdAfw7cCi0ZpUy83khoF4WEa8GlgEHRMSOwBmZubl9XEQcBgwB+wLPAIuByzPzlnqnImJXWsFjpJxzW2ZePdGTqfapzIStB3Yv13um1t+jgf9QrrEj8C+Z+VRELAA+BTxNK3QvBr6YmRsnun5m3hMRx0TEDpm5PiJeCoxk5rrKdQ8FXl+uuxNwXmauLXUf57mgvzvw9cx8dKLrSpIkSb1ggKrJzLsj4m1sHaD2yszLImIybd0F3BURT9bDVsUbM/Of2g8i4hRgqwAFfIRWaNlSjouIWJKZaxp3CA4GvpqZF5U2/ioi9snMOyLiIGBNZl5S6hYBxwP/AvxH4JzMfLLU7QIcC5wbEW8E3tjhWt+rhKSzgE8C/wycAPyfyvP9c2DnzPxyeTyPVlj7akQsA1Zl5n2lblvg48DXJvGcJUmSpK4xQHX2cES8KDMfASizJg/26Fo/qD2+JiL+IjNvbhdExL7ANe3wVFwEvAf43iSu9Whm3lp5PAwcBdwBvDozz21XZOZIRKwvD7ehNevVrnssIi4tX98E3DTeRTPzkYjYHBHvAO7IzE2V6oNr190cEQ+XILUFWFCp2xgROYnn+zzDw8NTPVXTNDIyAjgGg8ixH1ztsZ/r/N7emu/7wTUoY2+A6uxSygxLeXxE5etuqy+D+w2wHLi5UrYfsFNE7F87djKzTwC3Vx9k5oaI2L59jYg4sXb8S8vfFwIfjYgR4OeZeVd7id0knAl8MzM/WCvfu8N1X0RrVuonEbGizI7dkpm3TOG6kiRJUtcYoDooMx0LANobSmTm5hm6/Hxam1VUDdG6L2jCe46m4VfVmaCqMmN0Znkt
DoyI99IKQ2siYimwtPNp+UTlwVMRUZ9tA7hvrOuW89r3b70hIj4HXFqWRk7a0qWduqmZ0P5NlGMweBz7wTXXfwPd5vf21nzfD67ZNPbT+TfKADW2ayLiYGA74Ccd6p8XciJiIZVlbtPwZ8Bva2W3An8JXNOF9seycKyKiFiUmSOZOQpcGxE/p3Uv0hmZOUxrKWDPrgtQNta4pdwjNqUAJUmSJE2X25iPITPvAPYB9m5vYlCzQ+3xUYwdoMZ7nd9Ze3xg7T4lMvN24OD2rBhAROxRdubrllsjYnm1ICJeU748tra1+0JaW753w8r6BwtXrvv+Ekzb5dvgNvKSJEnqI2egxvcYW9+j1HZZRHweeIJWoLgJeOUYx24fEZ8Evp+ZD9fq/lhmVbYAOwOXj9HG6cApEbGO1uzXpsw8v9S9NCJOqh3/cGZ2WjLXUWZeFxFHRMTJtLYrXwhcWap/AJwcERtofc/sAny5adsTXPfeiHhJeQ1GaG0a0d6U4kLgxIjYWLnuN7txXUmSJGkqhkZH67fbaKaUz4G6OzMf6Hdf5rJVq1aNwuxYjztXzaY10eoux35wDQ8Ps3z18okPnOXWnureRnW+7wfXbBr7dl+XLVs26VtwXMInSZIkSQ0ZoCRJkiSpIe+B6qPMvLLffZAkSZLUnDNQkiRJktSQAUqSJEmSGjJASZIkSVJDBihJkiRJasgAJUmSJEkNGaAkSZIkqSEDlCRJkiQ1ZICSJEmSpIb8IF1JktQTK9+6kqVLl/a7G5LUVc5ASZIkSVJDBihJkiRJasgAJUmSJEkNGaAkSZIkqSEDlCRJkiQ1ZICSJEmSpIYMUJIkSZLUkAFKkiRJkhryg3QlSeqTXXfbrd9d6JllwKqVK/vdDUnqOmegJEmSJKkhA5QkSZIkNWSAkiRJkqSGDFCSJEmS1JABSpIkSZIaMkBJkiRJUkMGKEmSJElqyAAlSZIkSQ0ZoCRJkiSpIQOUJEmSJDVkgJIkSZKkhub3uwP9FhEHAfsDm2kFyqeAb2XmaF871gMRMQ/4MK1x31L+vi4zb+1rxyRJkqRZYqADVES8FliSmWdUyl4GHAd8uwvtHwbcnZkPTLetMdpfkZnnTeKUk2iFw3WVNo6NiMcy8/7u91CSJEmaWwZ9Cd/hmXlxtSAzHwS27VN/eiYiXg7cXw1PRQJH9qFLkiRJ0qwz0DNQwKYxyldHxLzM3BwRuwEfo7W0bxRY1571KTNMfwCWl/oFtJbE3RQRhwLvAdZExB2ZeVFErAB+BbwVWJOZ34mI1wF/AzxDK9Del5k/anckIj4B7Ag8CSwEzgB2B94HHBAROwIXZOZjEzzXw+gwq5aZoxGxuXK9FcAQrRA5CjycmZdU6o8HFlee71mZubnaZkSclJln1so+Rmv2a8NYbUTETsBHgKfLaVsy8+xy/iuANwCvBh4vz/nJCZ6zJEmS1FUGqA4y8zeVh58G/jEzNwJExL4RcVxmXlDqPwT8j3aIiIgTgZsy86cRMcTWS/j2z8wvVR7/dWae3n4QEe+KiD0y8+GIOAK4OjPvLHU7AZGZ5wNfi4gnK2FuR+CYDk/n5sy8ERjKzLECY7X8PcDnMvP3pd33R8TizHw8Io4Grs3Me0rdEuADtGaxqp6MiB1rAWe7Ep7Ga+ODwOmV13JJRByZmZeVNo4CTp6L96dJkiRpdhj0ALV5vMqI2B/4STs8AWTmbWV2qe2a2gzMRsY2D7i8Vla/h+lWYB/gYWDP6mxUZq6LiI73KpWwcvY41x7vuW6pfH1XOzwVv6I163MDsEs7+JRrromITstAf0hrWeB34dl7zW4vdeO1cUH1tSx1u1baXTWd8DQ8PDzVUzVNIyMjgGMwiBz78S3rdwd6bGRkxLEfQL7vB9egjP2gB6iJvA64tEN5dWbl9lrdePeVbc7Mh2plW8qStkW0lvHtCtzYPr7eQGZeNW6PxzY0Tt28ytf15zMCvLR8fWDZya9q53pjmflERCyuFB1K
a+nhuG1k5khEvBPYi9ZzHwW2rxz34DjPQZIkSeq5QQ9Q9R/kAYiIPwfuLQ87zSgt6MbFy8zL3wFnZuafStlewKvKIVvGOrdDWzvS2j2w7sbM/AWwOSLmZ+YzHY5p+nyuy8xzGx57T0S8MjN/S2v5YPu5jNlGRJxEa0bv8krZiobXm9DSpUu71ZQmqf2bKMdg8Dj2g23RokWO/QDyfT+4ZtPYT2eWbNAD1MIxyv8qM38TEbcDbwKur9V3JUAB+wGXtcNTsV3l663GJyL2zczb6uVlCd+Z9fKKn9HaSGJlrb0hxgiSHYz1enVyJXBiRLykXLtJG09nZn0GbDLXlCRJknpq0LcxXx0R768WRMSLaS2lo3zA7KHVJWflfp5HG7b/JB2WuFU8Brys0vYQrVmkdkD7Y0TsU6lfCLy5cn7j8SszQftExC61qg8Al3U4pZOHyn1hz4qI14xxvfaM05sy8+aGbSyqlR9DawdCSZIk6QVhoGegyoYQiyPiMzy3dfbTwLcqh50FfDYi1tOaHXoiM89peImbgX+IiAMz8xsdrn9/RBxSlq5toTXbcjbwllL/o4g4qdwXtI7W1uJnV5rYFBGfAlZWN2YYx+nAJ0pQ20Br5un6zPxdkyeTmZdExDERcRCtpY3bARePc8o1tHbOa9rGryLiFFpjsC2tmatuzfZJkiRJ0zY0OuqO0OqNiHgH8OvMvK+f/Vi1atUozI71uHPVbFoTre5y7Me362679bsLPbVq5UrHfgD5vh9cs2ns231dtmzZeButdTToS/jUW6/qd3iSJEmSummgl/CpNyLiEOBAYFW/+yJJkiR1kwFKXZeZ19C6/0mSJEmaU1zCJ0mSJEkNGaAkSZIkqSEDlCRJkiQ1ZICSJEmSpIYMUJIkSZLUkAFKkiRJkhoyQEmSJElSQwYoSZIkSWrIACVJkiRJDc3vdwckSRpUj65d2+8u9Mzw8HC/uyBJPeEMlCRJkiQ1ZICSJEmSpIYMUJIkSZLUkAFKkiRJkhoyQEmSJElSQwYoSZIkSWrIACVJkiRJDfk5UJIkqSeWr14Oq/vdi+5ae+rc/ewuSc04AyVJkiRJDRmgJEmSJKkhA5QkSZIkNWSAkiRJkqSGDFCSJEmS1JABSpIkSZIaMkBJkiRJUkMGKEmSJElqyAAlSZIkSQ0ZoCRJkiSpIQOUJEmSJDVkgJIkSZKkhub3uwMvdBFxCPC6StFBwLXl602Zec4U212RmeeNU/8q4EhgBFgIbA98LzPvncr1Xkgi4mvA5zNzpEPduK+LJEmS1E8GqAlk5jXANe3HEfF0r3/Aj4htgSMz80u18v8SEV/IzI29vP5kTSH03ACsAL7eoy5JkiRJPeESvhem1wE/7VB+PnD4DPelF54G7oyI/fvdEUmSJGkynIHqgog4HlgMPAUsAM7KzM0RMQ/4NDBEaynelsw8q3LePsDbynk7Ahdk5iPAH4B3ALdUr5OZ90fEE+XcrWZ92mURsQK4HTgQ2ALsApyfmfeX404s/RwCtgMezcxzK+28F9i79Gsn4F/bSwdL29uU824A3gIcFBE7Aldl5m1NXrPMvDIiPhsRv8zM0TFe1zeW12d96e8vM/PKJu1LkiRJvWCAmqaIOBq4NjPvKY+XAB8AEvgg8O0SioiIvSLi3Zl5Ka1AcEhmfrXUDQEnAOdk5pqIWBcRJwGZmU+0r5eZjzfo1hDwisz8SqXtk4GvlfrlwCmZua7UHxARb87MX0TEocC6zPxi5Tn+14j435m5gVYo+3Jm/rpUD0fESDXMRcQxtAJh1UOZeVmtLIEAvlN/AhGxO/A3mfmFStnxEfHayrUnZXh4eCqnqQtGRlq3uzkGg8exH1ztsZ9r/F6emO/7wTUoY+8SvunbpR2eADJzDc+9rgvb4anUPQCsKw+XABdW6kaBTZXH3we+C7wnIj5dgtpk/KDW9gMRsWspuqIdnkr99cC+5eG+mXlFra1zgLeXrx+YKMBk5oWZeXbtTz08
kZkPAUMR8eIOzRwFnFE7/lvAYeNdW5IkSeolZ6Cm78CyVK9q5/L35vrBmXlV+fLB6sxSsU3t2MeB86C1K19E/D3wT5m5ZYI+jWbmM7Wy3wIvBx4FOm1C0e7rhg59fiAi2vdePTjBtScrgc8AX6qVz8vMpzocv36qF1q6dOlUT9U0tX8T5RgMHsd+cM3V30D7vTwx3/eDazaN/XT+jTJATd911fuHaiYKOh1FxM71cJWZd0fEWcA7gR9Oodn5QMd7jYp2CBxrh78FTS8UEcfSuneq6o+Z+W/1YzNzS0RcFRGH1e5vmnY/JEmSpG4zQE3fwnHqtnp9I2LfBhstHBMR36zPImXmoxGxR3n4vDAUEQtp3fs0lr2Bq8apb89+bdehzy8DHpqgz9V+bnVP0wTH3xwRfxsR11aKt4mIBZn57LLGci9XfbZPkiRJmjHeAzV9D9W3446I15QvN1UCT3tjhH0atHk5cFK9sOyOd315uEOt+iieC1DbRMTbK+cNAXtm5mOl6G3ls6ba9Qfy3I5/d0bEwbW2PwL8eJz+duP76LxynbZLgRNrxxxH5+3dJUmSpBnhDNQ0ZeYlEXFMRBxEa9nZdsDF7WrgMxEBrXt3RoFvNGjz9xHxs4j4PK3PTNpCa6br5sy8vRx2Wal/otTdBLyy1G0BNkTEKeWai2l9hlTbtcAnImJz6e/a9i56mXlVRBxdOXcn4DsTfHjv3RHxn4CbMnP1RM9vjOf8ZEQ8AOxeHv8pIm4o7W4Etgduzcw7p9K+JEmS1A1Do6Pj3Raj2ajTZ0RNpn6uWbVq1SjMjhsa56rZdFOpusuxH1zDw8MsX728393ourWnru13F17wfN8Prtk09u2+Llu2bLxbYDpyCZ8kSZIkNWSAkiRJkqSGDFBz0ETL8wZp+Z4kSZLUTQYoSZIkSWrIACVJkiRJDRmgJEmSJKkhA5QkSZIkNWSAkiRJkqSGDFCSJEmS1JABSpIkSZIaMkBJkiRJUkPz+90BSZI0N61860qWLl3a725IUlc5AyVJkiRJDRmgJEmSJKkhA5QkSZIkNWSAkiRJkqSGDFCSJEmS1JABSpIkSZIaMkBJkiRJUkMGKEmSJElqyAAlSZIkSQ0ZoCRJkiSpIQOUJEmSJDVkgJIkSZKkhgxQkiRJktSQAUqSJEmSGjJASZIkSVJDBihJkiRJasgAJUmSJEkNGaAkSZIkqSEDlCRJkiQ1ZICSJEmSpIaGRkdH+90HqadWrVrlN7kkSZK2smzZsqHJnuMMlCRJkiQ15AyUJEmSJDXkDJQkSZIkNWSAkiRJkqSGDFCSJEmS1JABSpIkSZIaMkBJkiRJUkMGKEmSJElqyAAlSZIkSQ0ZoCRJkiSpIQOUJEmSJDVkgJIkSZKkhub3uwOS5p6IWAycCKwHtgVuzMyfjXHs8cBOwCZgEfCNzFw/Q11Vl01m7MvxuwKRmafPTA/VbRGxAPgUsJHWL2Z/l5mXdThuP+Bw4GlgB+DCzHxwJvuq7mo69uXYRcAHgEsy87GZ66V6YRLv+6OAvXju//jvZOaamexrLxigJPXCMcBpmbkZICJOiIgbM/Pp6kERcQTwi8y8ozzeFlgBnDXTHVbXNB37PYDjgIeBLTPfTXVRAGe3f/EREUdFxEsz8/fPHhAxBBySmadVyj4FfH3Ge6tuajL2C4FPAuuAx4EdAQPU7Ndk7N8IPJ6Z/1YpOxmY9b8wcwmfpK6KiO2A9e0foIvvA0d2OPzF7fAEkJkbgQ097qJ6ZJJj/0hmfikzLwCempEOqlcW1GaNfwgsqx1zCLCqVnZ3ROzV056p1yYc+8zckJmnZeY5GJzmkibv+/0z8+pa2ZO97dbMMEBJ6raXA7+pFmTmE8B2HY69tEPZvF50SjOi8dhn5uhMdUq9U2aNn/cDUWZuAZ6pHfryzPxNrexW4LU97J56aBJjrzlmEmM/Z/+PN0BJ6rYlQKf1zVv9e5OZa6uPI2IHWuupNTs1HnvNGUtoLcOs
q495p++BR4A9ut4jzZSmY6+5p9HYd/g/fhvmSIDyHihJUxIR/wjsViu+C7gO2Lz1GQw1aPY44Lxpdk091qOx1+w0nymOeWaORsSc+GFqQE157DXrTXXsjwEu7n53Zp4BStKUZOY/dCqPiLcyhd8wRcRhwPWZ6T1QL3DdHnvNas/QbMy3WrJZNpZwA5HZq+nYa+6Z9NhHxBuAhzLzT73p0sxymlVStz1Ea3q/sYj4M2CXzLy1N13SDJn02GvWWwO8qMFxnYLSHnReBqTZoenYa+6Z1NhHxG7AmzLzp73r0swyQEnqtvuBV1ULImJnxthdr3w2yJGZ+f0Z6Jt6a1Jjr9mv7Jy5c7Ws3Oewbe3Q+yPi1bWyvwDu7GH31EOTGHvNMZMZ+1J+AnDuzPRuZhigJHVV+byfHSOiukT4GFpbnHZyMnBmzzumnpvC2Gtu2BQRO1UeHwVcUT0gM1ez9RbHr8vM+3rdOfXUhGOvOavp2H8COLfs0jdneA+UpF5I4LMRsY7WFta/rHzY3nuBmzLzvog4Adgd+EhEVM+/uvr5UJpVGo19PzuorvsO8LcR8TSwAHgwM++NiAMBMvO6ctzVEfFZWp/7tRj4Xl96q25qOvaaeyYc+4g4FHgTMFr7P/7GzPzFjPe4i4ZGR/0oDkmSJElqwiV8kiRJktSQAUqSJEmSGjJASZIkSVJDBihJkiRJasgAJUmSJEkNGaAkSZIkqSEDlCRJkiQ1ZICSJEmSpIYMUJIkSZLUkAFKkiRJkhoyQEmSJElSQwYoSZIkSWrIACVJkiRJDRmgJEmSJKkhA5QkSZIkNWSAkiRJkqSG/j8Q5gq6gSUXTQAAAABJRU5ErkJggg==\n", 830 | "text/plain": [ 831 | "" 832 | ] 833 | }, 834 | "metadata": { 835 | "needs_background": "light" 836 | }, 837 | "output_type": "display_data" 838 | } 839 | ], 840 | "source": [ 841 | "exp.as_pyplot_figure()\n", 842 | "plt.tight_layout()" 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "We see that one of the features that contributes most strongly to the positive prediction is the short tenure of the customer." 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "## Saving the model\n", 857 | "Now that we've done all this work to build the models, we want to be able to use them later.\n", 858 | "The `ExplainedModel` class is a handy wrapper for using the `CategoricalEncoder`, the `Pipeline` object which *is* the churn model, and the Lime Explainer.\n", 859 | "Here, we use it to save these trained models for use in later parts of the Project." 
860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": 20, 865 | "metadata": {}, 866 | "outputs": [], 867 | "source": [ 868 | "from churnexplainer import ExplainedModel\n", 869 | "explainedmodel = ExplainedModel(data=data, labels=labels, model_name='telco_linear',\n", 870 | " categoricalencoder=ce, pipeline=pipe,\n", 871 | " explainer=explainer,data_dir=data_dir)\n", 872 | "explainedmodel.save()" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": 21, 878 | "metadata": {}, 879 | "outputs": [], 880 | "source": [ 881 | "spark.stop()" 882 | ] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "metadata": {}, 887 | "source": [ 888 | "## Wrap up\n", 889 | "We've now covered all the steps to **building a machine learning model** including interpretability\n", 890 | "and saved our work for use in later sections.\n", 891 | "\n", 892 | "In the next part of the series we will explore how to use the **Experiments** feature of CML\n", 893 | "for when we want to test lots of combinations of hyperparameters to fine tune our models.\n" 894 | ] 895 | } 896 | ], 897 | "metadata": { 898 | "kernelspec": { 899 | "display_name": "Python 3", 900 | "language": "python", 901 | "name": "python3" 902 | }, 903 | "language_info": { 904 | "codemirror_mode": { 905 | "name": "ipython", 906 | "version": 3 907 | }, 908 | "file_extension": ".py", 909 | "mimetype": "text/x-python", 910 | "name": "python", 911 | "nbconvert_exporter": "python", 912 | "pygments_lexer": "ipython3", 913 | "version": "3.6.9" 914 | } 915 | }, 916 | "nbformat": 4, 917 | "nbformat_minor": 4 918 | } 919 | -------------------------------------------------------------------------------- /4_train_models.py: -------------------------------------------------------------------------------- 1 | # Part 4: Model Training 2 | 3 | # This script is used to train an Explained model and also how to use the 4 | # Jobs to run model training and the Experiments feature of CML to facilitate 
model 5 | # tuning. 6 | 7 | # If you haven't yet, run through the initialization steps in the README file and Part 1. 8 | # In Part 1, the data is imported into the `default.telco_churn` table in Hive. 9 | # All data accesses fetch from Hive. 10 | # 11 | # To simply train the model once, run this file in a workbench session. 12 | # 13 | # There are 2 other ways of running the model training process 14 | # 15 | # ***Scheduled Jobs*** 16 | # 17 | # The **[Jobs](https://docs.cloudera.com/machine-learning/cloud/jobs-pipelines/topics/ml-creating-a-job.html)** 18 | # feature allows for ad hoc, recurring and dependent jobs to run specific scripts. To run this model 19 | # training process as a job, create a new job by going to the Project window and clicking _Jobs > 20 | # New Job_ and entering the following settings: 21 | # * **Name** : Train Model 22 | # * **Script** : 4_train_models.py 23 | # * **Arguments** : _Leave blank_ 24 | # * **Kernel** : Python 3 25 | # * **Schedule** : Manual 26 | # * **Engine Profile** : 1 vCPU / 2 GiB 27 | # The rest can be left as is. Once the job has been created, click **Run** to start a manual 28 | # run for that job. 29 | 30 | # ***Experiments*** 31 | # 32 | # Training a model for use in production requires testing many combinations of model parameters 33 | # and picking the best one based on one or more metrics. 34 | # In order to do this in a *principled*, *reproducible* way, an Experiment executes model training code with **versioning** of the **project code**, **input parameters**, and **output artifacts**. 35 | # This is a very useful feature for testing a large number of hyperparameters in parallel on elastic cloud resources. 36 | 37 | # **[Experiments](https://docs.cloudera.com/machine-learning/cloud/experiments/topics/ml-running-an-experiment.html)** 38 | # run immediately and are used for testing different parameters in a model training process. 39 | # In this instance it would be used for hyperparameter optimisation.
To run an experiment, from the 40 | # Project window click Experiments > Run Experiment with the following settings. 41 | # * **Script** : 4_train_models.py 42 | # * **Arguments** : 5 lbfgs 100 _(these are the cv, solver and max_iter parameters to be passed to 43 | # LogisticRegressionCV() function)_ 44 | # * **Kernel** : Python 3 45 | # * **Engine Profile** : 1 vCPU / 2 GiB 46 | 47 | # Click **Start Run** and the experiment will be scheduled to build and run. Once the Run is 48 | # completed you can view the outputs that are tracked with the experiment using the 49 | # `cdsw.track_metric` function. It's worth reading through the code to get a sense of what 50 | # all is going on. 51 | 52 | # More Details on Running Experiments 53 | # Requirements 54 | # Experiments have a few requirements: 55 | # - model training code in a `.py` script, not a notebook 56 | # - `requirements.txt` file listing package dependencies 57 | # - a `cdsw-build.sh` script containing code to install all dependencies 58 | # 59 | # These three components are provided for the churn model as `4_train_models.py`, `requirements.txt`, 60 | # and `cdsw-build.sh`, respectively. 61 | # You can see that `cdsw-build.sh` simply installs packages from `requirements.txt`. 62 | # The code in `4_train_models.py` is largely identical to the code in the last notebook, 63 | # with a few differences. 64 | # 65 | # The first difference from the last notebook is at the "Experiments options" section. 66 | # When you set up a new Experiment, you can enter 67 | # [**command line arguments**](https://docs.python.org/3/library/sys.html#sys.argv) 68 | # in standard Python fashion. 69 | # This will be where you enter the combination of model hyperparameters that you wish to test. 70 | # 71 | # The other difference is at the end of the script.
# Here, the `cdsw` package (available by default) provides
# [two methods](https://docs.cloudera.com/machine-learning/cloud/experiments/topics/ml-tracking-metrics.html)
# to let the user evaluate results.
#
# **`cdsw.track_metric`** stores a single value which can be viewed in the Experiments UI.
# Here we store two metrics and the filepath to the saved model.
#
# **`cdsw.track_file`** stores a file for later inspection.
# Here we store the saved model, but we could also have saved a report csv, plot, or any other
# output file.
#


from pyspark.sql.types import *
from pyspark.sql import SparkSession
import sys
import os
import os
import datetime
import subprocess
import glob
import dill
import pandas as pd
import numpy as np
import cdsw

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

from lime.lime_tabular import LimeTabularExplainer

from churnexplainer import ExplainedModel, CategoricalEncoder

# Directory handed to ExplainedModel when the trained model is saved.
data_dir = '/home/cdsw'

idcol = 'customerID'
labelcol = 'Churn'
# (column name, is_categorical) pairs; the order here is the feature order
# used for training.
cols = (('gender', True),
        ('SeniorCitizen', True),
        ('Partner', True),
        ('Dependents', True),
        ('tenure', False),
        ('PhoneService', True),
        ('MultipleLines', True),
        ('InternetService', True),
        ('OnlineSecurity', True),
        ('OnlineBackup', True),
        ('DeviceProtection', True),
        ('TechSupport', True),
        ('StreamingTV', True),
        ('StreamingMovies', True),
        ('Contract', True),
        ('PaperlessBilling', True),
        ('PaymentMethod', True),
        ('MonthlyCharges', False),
        ('TotalCharges', False))


# This is a fail safe in case the hive table did not get created in the last step.
try:
    spark = SparkSession\
        .builder\
        .appName("PythonSQL")\
        .master("local[*]")\
        .getOrCreate()

    if spark.sql("SELECT count(*) FROM default.telco_churn").collect()[0][0] > 0:
        df = spark.sql("SELECT * FROM default.telco_churn").toPandas()
    else:
        # An existing but empty table used to leave `df` unbound (no exception
        # was raised, so the CSV fallback never ran) and the script failed
        # further down with a NameError. Route that case to the fallback too.
        raise ValueError("default.telco_churn is empty")
except Exception:
    # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit still
    # propagate instead of silently triggering the CSV fallback.
    print("Hive table has not been created")
    df = pd.read_csv(os.path.join(
        'raw', 'WA_Fn-UseC_-Telco-Customer-Churn-.csv'))

# Clean and shape the data for LR and LIME.
# Cells holding a single whitespace character become NaN; rows with any NaN
# are then dropped.
df = df.replace(r'^\s$', np.nan, regex=True).dropna().reset_index()
df.index.name = 'id'
data, labels = df.drop(labelcol, axis=1), df[labelcol]
data = data.replace({'SeniorCitizen': {1: 'Yes', 0: 'No'}})
# Keep only the feature columns declared in `cols`, in that order.
data = data[[c for c, _ in cols]]
catcols = (c for c, iscat in cols if iscat)
for col in catcols:
    data[col] = pd.Categorical(data[col])
# Boolean target: True where the Churn column equals 'Yes'.
labels = (labels == 'Yes')

# Prepare the pipeline and split the data for model training
ce = CategoricalEncoder()
X = ce.fit_transform(data)
y = labels.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# One-hot encode the columns listed in ce.cat_columns_ix_ (presumably the
# positional indices of the categorical features -- confirm in
# churnexplainer.py); all other columns pass through unchanged.
ct = ColumnTransformer(
    [('ohe', OneHotEncoder(), list(ce.cat_columns_ix_.values()))],
    remainder='passthrough'
)

# Experiments options
# If you are running this as an experiment, pass the cv, solver and max_iter values
# as arguments in that order. e.g. `5 lbfgs 100`.

# Hyperparameters come from the command line when exactly three arguments are
# supplied (cv, solver, max_iter); otherwise the defaults below are used.
if len(sys.argv) == 4:
    try:
        cv = int(sys.argv[1])
        solver = str(sys.argv[2])
        max_iter = int(sys.argv[3])
    except ValueError:
        # Only the int() conversions can fail here; catching ValueError
        # instead of everything keeps unrelated errors visible.
        sys.exit("Invalid Arguments passed to Experiment")
else:
    cv = 5
    solver = 'lbfgs'  # one of newton-cg, lbfgs, liblinear, sag, saga
    max_iter = 100

clf = LogisticRegressionCV(cv=cv, solver=solver, max_iter=max_iter)
pipe = Pipeline([('ct', ct),
                 ('scaler', StandardScaler()),
                 ('clf', clf)])

# The magical model.fit()
pipe.fit(X_train, y_train)
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
print("train", train_score)
print("test", test_score)
print(classification_report(y_test, pipe.predict(X_test)))
# Store the predicted probability of the positive class for every row,
# training and test alike.
data[labels.name + ' probability'] = pipe.predict_proba(X)[:, 1]


# Create LIME Explainer
feature_names = list(ce.columns_)
categorical_features = list(ce.cat_columns_ix_.values())
categorical_names = {i: ce.classes_[c]
                     for c, i in ce.cat_columns_ix_.items()}
class_names = ['No ' + labels.name, labels.name]
explainer = LimeTabularExplainer(ce.transform(data),
                                 feature_names=feature_names,
                                 class_names=class_names,
                                 categorical_features=categorical_features,
                                 categorical_names=categorical_names)


# Create and save the combined Logistic Regression and LIME Explained Model.
explainedmodel = ExplainedModel(data=data, labels=labels, model_name='telco_linear',
                                categoricalencoder=ce, pipeline=pipe,
                                explainer=explainer, data_dir=data_dir)
explainedmodel.save()


# If running as an experiment, this will track the metrics and add the model trained in this
# training run to the experiment history.
226 | cdsw.track_metric("train_score", round(train_score, 2)) 227 | cdsw.track_metric("test_score", round(test_score, 2)) 228 | cdsw.track_metric("model_path", explainedmodel.model_path) 229 | cdsw.track_file(explainedmodel.model_path) 230 | 231 | # Wrap up 232 | 233 | # We've now covered all the steps to **running Experiments**. 234 | # 235 | # Notice also that any script that will run as an Experiment can also be run as a Job or in a Session. 236 | # Our provided script can be run with the same settings as for Experiments. 237 | # A common use case is to **automate periodic model updates**. 238 | # Jobs can be scheduled to run the same model training script once a week using the latest data. 239 | # Another Job dependent on the first one can update the model parameters being used in production 240 | # if model metrics are favorable. 241 | -------------------------------------------------------------------------------- /5_model_serve_explainer.py: -------------------------------------------------------------------------------- 1 | ## Part 5: Model Serving 2 | # 3 | # This notebook explains how to create and deploy Models in CML which function as a 4 | # REST API to serve predictions. This feature makes it very easy for a data scientist 5 | # to make trained models available and usable to other developers and data scientists 6 | # in your organization. 7 | # 8 | # In the last part of the series, you learned: 9 | # - the requirements for running an Experiment 10 | # - how to set up a new Experiment 11 | # - how to monitor the results of an Experiment 12 | # - limitations of the feature 13 | # 14 | # In this part, you will learn: 15 | # - the requirements for creating and deploying a Model 16 | # - how to deploy a Model 17 | # - how to test and use a Model 18 | # - limitations of the feature 19 | # 20 | # If you haven't yet, run through the initialization steps in the README file and Part 1. 
21 | # In Part 1, the data is imported into the `default.telco_churn` table in Hive. 22 | # All data accesses fetch from Hive. 23 | # 24 | ### Requirements 25 | # Models have the same requirements as Experiments: 26 | # - model code in a `.py` script, not a notebook 27 | # - a `requirements.txt` file listing package dependencies 28 | # - a `cdsw-build.sh` script containing code to install all dependencies 29 | # 30 | # > In addition, Models *must* be designed with one main function that takes a dictionary as its sole argument 31 | # > and returns a single dictionary. 32 | # > CML handles the JSON serialization and deserialization. 33 | 34 | # In this file, there is minimal code since calculating predictions is much simpler 35 | # than training a machine learning model. 36 | # Once again, we use the `ExplainedModel` helper class in `churnexplainer.py`. 37 | # When a Model API is called, CML will translate the input and returned JSON blobs to and from python dictionaries. 38 | # Thus, the script simply loads the model we saved at the end of the last notebook, 39 | # passes the input dictionary into the model, and returns the results as a dictionary with the following format: 40 | # 41 | # { 42 | # 'data': dict(data), 43 | # 'probability': probability, 44 | # 'explanation': explanation 45 | # } 46 | # 47 | # The Model API will return this dictionary serialized as JSON. 
48 | # 49 | ### Model Operations 50 | # 51 | # This model is deployed using the model operations feature of CML which consists of 52 | # [Model Metrics](https://docs.cloudera.com/machine-learning/cloud/model-metrics/topics/ml-enabling-model-metrics.html) 53 | # and [Model Governance](https://docs.cloudera.com/machine-learning/cloud/model-governance/topics/ml-enabling-model-governance.html) 54 | # 55 | # The first requirement is to make the model use the model metrics feature by adding the 56 | # `@cdsw.model_metrics` [Python Decorator](https://wiki.python.org/moin/PythonDecorators) 57 | # before the function. 58 | # 59 | # Then you can use the *`cdsw.track_metric`* function to add additional 60 | # data to the underlying database for each call made to the model. 61 | # **Note:** `cdsw.track_metric` has different functionality depending on whether it is being 62 | # used in an *Experiment* or a *Model*. 63 | # 64 | # More detail is available 65 | # using the `help(cdsw.track_metric)` function 66 | #``` 67 | # help(cdsw.track_metric) 68 | # Help on function track_metric in module cdsw: 69 | # 70 | # track_metric(key, value) 71 | # Description 72 | # ----------- 73 | # 74 | # Tracks a metric for an experiment or model deployment 75 | # Example: 76 | # model deployment usage: 77 | # >>>@cdsw.model_metrics 78 | # >>>predict_func(args): 79 | # >>> cdsw.track_metric("input_args", args) 80 | # >>> return {"result": "prediction"} 81 | # 82 | # experiment usage: 83 | # >>>cdsw.track_metric("input_args", args) 84 | # 85 | # Parameters 86 | # ---------- 87 | # key: string 88 | # The metric key to track 89 | # value: string, boolean, numeric 90 | # The metric value to track 91 | #``` 92 | # 93 | # 94 | ### Creating and deploying a Model 95 | # To create a Model using our `5_model_serve_explainer.py` script, use the following settings: 96 | # * **Name**: Explainer 97 | # * **Description**: Explain customer churn prediction 98 | # * **File**: `5_model_serve_explainer.py` 99 | # *
**Function**: explain 100 | # * **Input**: 101 | # ``` 102 | # { 103 | # "StreamingTV": "No", 104 | # "MonthlyCharges": 70.35, 105 | # "PhoneService": "No", 106 | # "PaperlessBilling": "No", 107 | # "Partner": "No", 108 | # "OnlineBackup": "No", 109 | # "gender": "Female", 110 | # "Contract": "Month-to-month", 111 | # "TotalCharges": 1397.475, 112 | # "StreamingMovies": "No", 113 | # "DeviceProtection": "No", 114 | # "PaymentMethod": "Bank transfer (automatic)", 115 | # "tenure": 29, 116 | # "Dependents": "No", 117 | # "OnlineSecurity": "No", 118 | # "MultipleLines": "No", 119 | # "InternetService": "DSL", 120 | # "SeniorCitizen": "No", 121 | # "TechSupport": "No" 122 | # } 123 | # ``` 124 | #* **Kernel**: Python 3 125 | #* **Engine Profile**: 1 vCPU / 2 GiB Memory 126 | # 127 | # The rest can be left as is. 128 | # 129 | # After accepting the dialog, CML will *build* a new Docker image using `cdsw-build.sh`, 130 | # then *assign an endpoint* for sending requests to the new Model. 131 | 132 | ## Testing the Model 133 | # > To verify it's returning the right results in the format you expect, you can 134 | # > test any Model from its *Overview* page. 135 | # 136 | # If you entered an *Example Input* before, it will be the default input here, 137 | # though you can enter your own. 138 | 139 | ## Using the Model 140 | # 141 | # > The *Overview* page also provides sample `curl` or Python commands for calling your Model API. 142 | # > You can adapt these samples for other code that will call this API. 143 | # 144 | # This is also where you can find the full endpoint to share with other developers 145 | # and data scientists. 146 | # 147 | # **Note:** for security, you can specify 148 | # [Model API Keys](https://docs.cloudera.com/machine-learning/cloud/models/topics/ml-model-api-key-for-models.html) 149 | # to add authentication.
@cdsw.model_metrics
def explain(args):
    """Serve one churn prediction together with its LIME explanation.

    Args:
        args: JSON-style dict of customer features. Any feature that is
            missing falls back to the value in ``em.default_data``.

    Returns:
        A dict with the keys ``data`` (the cast feature values that were
        scored), ``probability`` and ``explanation``.
    """
    # ChainMap looks in ``args`` first, so caller-supplied values win over defaults.
    record = em.cast_dct(dict(ChainMap(args, em.default_data)))
    probability, explanation = em.explain_dct(record)

    # Track the inputs, the prediction and the explanation for this call,
    # in that order.
    tracked = (
        ('input_data', record),
        ('probability', probability),
        ('explanation', explanation),
    )
    for metric_name, metric_value in tracked:
        cdsw.track_metric(metric_name, metric_value)

    return {
        'data': dict(record),
        'probability': probability,
        'explanation': explanation
    }
198 | #x={"StreamingTV":"No","MonthlyCharges":70.35,"PhoneService":"No","PaperlessBilling":"No","Partner":"No","OnlineBackup":"No","gender":"Female","Contract":"Month-to-month","TotalCharges":1397.475,"StreamingMovies":"No","DeviceProtection":"No","PaymentMethod":"Bank transfer (automatic)","tenure":29,"Dependents":"No","OnlineSecurity":"No","MultipleLines":"No","InternetService":"DSL","SeniorCitizen":"No","TechSupport":"No"} 199 | #explain(x) 200 | 201 | ## Wrap up 202 | # 203 | # We've now covered all the steps to **deploying and serving Models**, including the 204 | # requirements, limitations, and how to set up, test, and use them. 205 | # This is a powerful way to get data scientists' work in use by other people quickly. 206 | # 207 | # In the next part of the project we will explore how to launch a **web application** 208 | # served through CML. 209 | # Your team is busy building models to solve problems. 210 | # CML-hosted Applications are a simple way to get these solutions in front of 211 | # stakeholders quickly. -------------------------------------------------------------------------------- /6_application.py: -------------------------------------------------------------------------------- 1 | # Part 6: Application 2 | 3 | # This script explains how to create and deploy Applications in CML. 4 | # This feature allows data scientists to **get ML solutions in front of stakeholders quickly**, 5 | # including business users who need results fast. 6 | # This may be good for sharing a **highly customized dashboard**, a **monitoring tool**, or a **product mockup**. 7 | 8 | # CML is agnostic regarding frameworks. 9 | # [Flask](https://flask.palletsprojects.com/en/1.1.x/), 10 | # [Dash](https://plotly.com/dash/), 11 | # or even [Tornado](https://www.tornadoweb.org/en/stable/) apps will all work. 12 | # R users will find it easy to deploy Shiny apps. 13 | 14 | # If you haven't yet, run through the initialization steps in the README file. 
Do that 15 | # now 16 | 17 | # This file provides a sample Flask app script, ready for deployment, 18 | # which displays churn predictions and explanations using the Model API deployed in 19 | # Part 5 20 | 21 | # Deploying the Application 22 | # 23 | # > Once you have written an app that is working as desired, including in a test Session, 24 | # > it can be deployed using the *New Application* dialog in the *Applications* tab in CML. 25 | 26 | # After accepting the dialog, CML will deploy the application then *assign a URL* to 27 | # the Application using the subdomain you chose. 28 | # 29 | # *Note:* This does not require the `cdsw-build.sh` file, as it follows a 30 | # separate build process to deploy an application. 31 | # 32 | 33 | # To create an Application using our sample Flask app, perform the following. 34 | # This is a special step for this particular app: 35 | # 36 | # In the deployed Model from step 5, go to *Model* > *Settings* in CML and make a note of (i.e. copy) the 37 | # "**Access Key**", e.g. `mqc8ypo...pmj056y` 38 | # 39 | # While you're there, **disable** the additional Model authentication feature by unticking **Enable Authentication**. 40 | # 41 | # **Note**: Disabling authentication is only necessary for this Application to work. 42 | # Ordinarily, you may want to keep Authentication in place. 43 | # 44 | # Next, from the Project level, click on *Open Workbench* (note you don't actually have to Launch a 45 | # Session) in order to edit a file. Select the `flask/single_view.html` file and paste the Access 46 | # Key in at line 19. 47 | # 48 | # ` const accessKey = "mp3ebluylxh4yn5h9xurh1r0430y76ca";` 49 | # 50 | # Save the file (if it has not auto saved already) and go back to the Project. 
51 | # 52 | # Finally, go to the *Applications* section of the Project and select *New Application* with the following: 53 | # * **Name**: Churn Analysis App 54 | # * **Subdomain**: churn-app _(note: this needs to be unique, so if you've done this before, 55 | # pick a more random subdomain name)_ 56 | # * **Script**: 6_application.py 57 | # * **Kernel**: Python 3 58 | # * **Engine Profile**: 1 vCPU / 2 GiB Memory 59 | # 60 | # Accept the inputs, and in a few minutes the Application will be ready to use. 61 | 62 | # Using the Application 63 | 64 | # > A few minutes after deploying, the *Applications* page will show the app as Running. 65 | # You can then click on its name to access it. 66 | # CML Applications are accessible by any user with read-only (or higher) access to the project. 67 | # 68 | 69 | # This deploys a basic Flask application for serving the HTML and some specific data 70 | # used by the project Application. 71 | 72 | # At this point, you will be able to open the Churn Analysis App. 73 | # The initial view is a table of randomly selected customers from the dataset. 74 | # This provides a snapshot of the customer base as a whole. 75 | # The colors in the *Probability* column correspond to the prediction, with red customers being deemed more likely to churn. 76 | # The colors of the features show which are most important for each prediction. 77 | # Deeper red indicates increased importance for predicting that a customer **will churn** 78 | # while deeper blue indicates increased importance for predicting that a customer **will not**. 
79 | # 80 | from flask import Flask, send_from_directory, request 81 | from IPython.display import Javascript, HTML 82 | import random 83 | import os 84 | from churnexplainer import ExplainedModel 85 | from collections import ChainMap 86 | from flask import Flask 87 | from pandas.io.json import dumps as jsonify 88 | import logging 89 | import subprocess 90 | from IPython.display import Image 91 | Image("images/table_view.png") 92 | # 93 | # Clicking on any row will show a "local" interpreted model for that particular customer. 94 | # Here, you can see how adjusting any one of the features will change that customer's churn prediction. 95 | # 96 | Image("images/single_view_1.png") 97 | # 98 | # Changing the *InternetService* to *DSL* lowers the probablity of churn. 99 | # **Note**: this obviously does *not* mean that you should change that customer's internet service to DSL 100 | # and expect they will be less likely to churn. 101 | # Imagine if your ISP did that to you. 102 | # Rather, the model is more optimistic about an otherwise identical customer who has been using DSL. 103 | # This information simply gives you a clearer view of what to expect given specific factors 104 | # as a starting point for developing your business strategies. 105 | # Furthermore, as you start implementing changes based on the model, it may change customers' behavior 106 | # so that the predictions stop being reliable. 107 | # It's important to use Jobs to keep models up-to-date. 108 | # 109 | Image("images/single_view_2.png") 110 | # 111 | # There are many frameworks that ease the development of interactive, informative webapps. 112 | # Once written, it is straightforward to deploy them in CML. 
# The rule must declare the ``path`` URL variable: the view takes a required
# ``path`` argument, and a rule without it would call the view with no
# arguments, failing every request with a TypeError. The ``path`` converter
# also accepts slashes, so files in sub-directories can be requested.
@flask_app.route('/flask/<path:path>')
def send_file(path):
    """Serve a file from the local ``flask`` directory.

    Args:
        path: Path of the requested file, relative to the ``flask`` directory
            (taken from the part of the URL after ``/flask/``).

    Returns:
        The response built by ``send_from_directory`` for that file.
    """
    return send_from_directory('flask', path)
@flask_app.route('/sample_table')
def sample_table():
    """Return explained predictions for 10 randomly chosen customers.

    Ten distinct ids are drawn from ``range(1, len(em.data))``. Each one is
    passed to ``explainid`` as a string, and the resulting list is serialised
    with the module's ``jsonify`` alias.
    """
    chosen_ids = random.sample(range(1, len(em.data)), 10)
    explained_rows = [explainid(str(customer_id)) for customer_id in chosen_ids]
    return jsonify(explained_rows)
8 | 9 | ### Add Model Metrics 10 | # New metrics can be added to a model and existing ones updated using the `cdsw` 11 | # library and the [model metrics SDK](https://docs.cloudera.com/machine-learning/cloud/model-metrics/topics/ml-tracking-model-metrics-using-python.html). 12 | # If model metrics is enabled for a model, then every call to that model is recorded 13 | # in the model metric database. There are situations in which it's necessary to update or 14 | # add to those recorded metrics. This script shows you how this works. 15 | 16 | #### Update Existing Tracked Metrics 17 | # This is part of what is called "ground truth". Certain machine learning implementations 18 | # (like this very project) will use a supervised approach where a model is making a 19 | # prediction and the actual value (or label) is only available at a later stage. To check 20 | # how well a model is performing, these actual values need to be compared with the 21 | # prediction of the model. Each time a model end point is called, it provides the response 22 | # from the function, some other details and a unique uuid for that response. 23 | # This tracked model response entry can then be updated at a later date to add the 24 | # actual "ground truth" value, or any other data that you want to add. 25 | # 26 | # Data can be added to a tracked model response using the `cdsw.track_delayed_metrics` function. 27 | # 28 | # ```python 29 | # help(cdsw.track_delayed_metrics) 30 | # Help on function track_delayed_metrics in module cdsw: 31 | # 32 | # track_delayed_metrics(metrics, prediction_uuid) 33 | # Description 34 | # ----------- 35 | # 36 | # Track a metric for a model prediction that is only known after prediction time. 37 | # For example, for a model that makes a binary or categorical prediction, the actual 38 | # correctness of the prediction is not known at prediction time. 
This function can be 39 | # used to retroactively to track a prediction's correctness later, when ground truth 40 | # is available 41 | # Example: 42 | # >>>track_delayed_metrics({"ground_truth": "value"}, "prediction_uuid") 43 | # 44 | # Parameters 45 | # ---------- 46 | # metrics: object 47 | # metrics object 48 | # prediction_uuid: string, UUID 49 | # prediction UUID of model metrics 50 | # ``` 51 | 52 | #### Adding Additional Metrics 53 | # It is also possible to add additional data/metrics to the model database to track 54 | # things like aggregate metrics that aren't associated with one particular response. 55 | # This can be done using the `cdsw.track_aggregate_metrics` function. 56 | 57 | # ```python 58 | # help(cdsw.track_aggregate_metrics) 59 | # Help on function track_aggregate_metrics in module cdsw: 60 | # 61 | # track_aggregate_metrics(metrics, start_timestamp_ms, end_timestamp_ms, model_deployment_crn=None) 62 | # Description 63 | # ----------- 64 | # 65 | # Track aggregate metric data for model deployment or model build or model 66 | # Example: 67 | # >>>track_aggregate_metrics({"val_count": 125}, 1585685142786, 68 | # ... 1585685153602, model_deployment_crn="/db401b6a-4b26-4c8f-8ea6-a1b09b93db88")) 69 | # 70 | # Parameters 71 | # ---------- 72 | # metrics: object 73 | # metrics data object 74 | # start_timestamp_ms: int 75 | # aggregated metrics start timestamp in milliseconds 76 | # end_timestamp_ms: int 77 | # aggregated metrics end timestamp in milliseconds 78 | # model_deployment_crn: string 79 | # model deployment Crn 80 | # ``` 81 | # 82 | 83 | ### Model Drift Simulation 84 | # This script simulates making calls to the model using sample data, and slowly 85 | # introducing an increasing amount of random variation to the churn value so that 86 | # the model will be less accurate over time. 87 | 88 | # The script will grab 1000 random samples from the data set and simulate 1000 89 | # predictions. 
## Set the model ID
# Use the id of the model deployed in step 5; ids are unique to each model on CML.
# Keep this assignment on a single line: 9_build_project.py rewrites it with sed.

model_id = "63"

# Grab the data from Hive.
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Local Spark session used to run the Hive query below.
spark = (
    SparkSession.builder
    .appName("PythonSQL")
    .master("local[*]")
    .getOrCreate()
)

# Pull the whole telco_churn table into a pandas DataFrame.
df = spark.sql("SELECT * FROM default.telco_churn").toPandas()

# Get the various Model CRN details
# HOST reuses the scheme of CDSW_API_URL together with the CDSW_DOMAIN host name.
api_scheme = os.getenv("CDSW_API_URL").split(":")[0]
HOST = api_scheme + "://" + os.getenv("CDSW_DOMAIN")
cml = CMLBootstrap()

model_query = {"id": model_id, "latestModelDeployment": True, "latestModelBuild": True}
latest_model = cml.get_model(model_query)

Model_CRN = latest_model["crn"]
Deployment_CRN = latest_model["latestModelDeployment"]["crn"]
def churn_error(item, percent):
    """Return the churn label for ``item``, randomly forced to True.

    With probability ``percent`` the result is True regardless of ``item``.
    Otherwise the result is True only when ``item`` equals ``'Yes'``.

    Args:
        item: The recorded churn value, compared against ``'Yes'``.
        percent: Chance of overriding the label with True. Values at or below
            0 never override; values of 1 or more always do.

    Returns:
        bool: The (possibly corrupted) churn label.
    """
    forced_churn = random.random() < percent
    return forced_churn or bool(item == 'Yes')
# "Ground truth" pass: attach the final label to every recorded prediction and
# report one accuracy figure for each complete batch of 100 predictions.
for index, vals in enumerate(response_labels_sample):
    if index % 50 == 0:
        print("Update {} records".format(index))

    # Add the actual label to the tracked prediction identified by its uuid.
    cdsw.track_delayed_metrics({"final_label": vals['final_label']}, vals['uuid'])

    batch_position = index % 100
    if batch_position == 0:
        # First record of a batch: remember when it started and reset the labels.
        start_timestamp_ms = vals['timestamp_ms']
        final_labels, response_labels = [], []

    final_labels.append(vals['final_label'])
    response_labels.append(vals['response_label'])

    if batch_position == 99:
        # Last record of a batch: store its accuracy as an aggregate metric.
        print("Adding accuracy metrc")
        end_timestamp_ms = vals['timestamp_ms']
        report = classification_report(final_labels, response_labels, output_dict=True)
        accuracy = report["accuracy"]
        cdsw.track_aggregate_metrics(
            {"accuracy": accuracy},
            start_timestamp_ms,
            end_timestamp_ms,
            model_deployment_crn=Deployment_CRN,
        )
## Set the model ID
# Get the model id from the model you deployed in step 5. These are unique to each
# model on CML.
# NOTE: 9_build_project.py rewrites the assignment below with sed, so keep it
# on a single line in this exact form.

model_id = "63"

# Get the various Model CRN details
cml = CMLBootstrap()

# Ask for the model record together with its latest deployment and build.
latest_model = cml.get_model({"id": model_id, "latestModelDeployment": True, "latestModelBuild": True})

# CRNs of the model itself and of its latest deployment.
Model_CRN = latest_model ["crn"]
Deployment_CRN = latest_model["latestModelDeployment"]["crn"]

# Read in the model metrics dict.
# No start/end timestamps are passed, so the defaults described in the
# help text above apply (from 0 up to the current time).
model_metrics = cdsw.read_metrics(model_crn=Model_CRN,model_deployment_crn=Deployment_CRN)
# Flatten the nested metrics records into one wide pandas DataFrame
# (nested keys become dotted column names such as "metrics.probability").
metrics_df = pd.io.json.json_normalize(model_metrics["metrics"])
metrics_df.tail().T

# Write the data to SQLite for Viz Apps (only if the database file does not exist yet).
if not os.path.exists("model_metrics.db"):
    conn = sqlite3.connect('model_metrics.db')
    try:
        metrics_df.to_sql(name='model_metrics', con=conn)
    finally:
        # Always release the database handle, even if the write fails.
        conn.close()

# Do some conversions & calculations
metrics_df['startTimeStampMs'] = pd.to_datetime(metrics_df['startTimeStampMs'], unit='ms')
metrics_df['endTimeStampMs'] = pd.to_datetime(metrics_df['endTimeStampMs'], unit='ms')
# Full elapsed time per record, in milliseconds. `.dt.microseconds` is only the
# sub-second component of a timedelta, so it silently dropped whole seconds.
metrics_df["processing_time"] = (
    metrics_df["endTimeStampMs"] - metrics_df["startTimeStampMs"]
).dt.total_seconds() * 1000

# This shows how to plot specific metrics.
sns.set_style("whitegrid")
sns.despine(left=True, bottom=True)

# Predicted churn probability for each tracked call, in time order.
prob_metrics = metrics_df.dropna(subset=['metrics.probability']).sort_values('startTimeStampMs')
sns.lineplot(x=range(len(prob_metrics)), y="metrics.probability", data=prob_metrics, color='grey')

# Processing time for each record, in time order. This plot must use
# time_metrics; the previous version plotted prob_metrics again.
time_metrics = metrics_df.dropna(subset=['processing_time']).sort_values('startTimeStampMs')
sns.lineplot(x=range(len(time_metrics)), y="processing_time", data=time_metrics, color='grey')

# This shows how the model accuracy drops over time.
agg_metrics = metrics_df.dropna(subset=["metrics.accuracy"]).sort_values('startTimeStampMs')
sns.barplot(x=list(range(1, len(agg_metrics) + 1)), y="metrics.accuracy", color="grey", data=agg_metrics)
# Replace these with the relevant values from your project.
# NOTE: 9_build_project.py rewrites the model id assignment with sed, so keep
# it on a single line in this exact form.
model_id = "63"
job_id = "107"

# The retraining job is started when the latest accuracy falls below this value.
accuracy_threshold = 0.6

# Get the various Model CRN details
cml = CMLBootstrap()

latest_model = cml.get_model({"id": model_id, "latestModelDeployment": True, "latestModelBuild": True})

Model_CRN = latest_model["crn"]
Deployment_CRN = latest_model["latestModelDeployment"]["crn"]

# Read in the model metrics dict.
model_metrics = cdsw.read_metrics(model_crn=Model_CRN, model_deployment_crn=Deployment_CRN)

# Flatten the nested metrics records into one wide pandas DataFrame.
metrics_df = pd.io.json.json_normalize(model_metrics["metrics"])

# Most recent aggregate accuracy, as a Series of length 0 or 1. The column is
# missing entirely when no aggregate accuracy has been tracked yet, in which
# case dropna(subset=...) would raise a KeyError.
if "metrics.accuracy" in metrics_df.columns:
    latest_aggregate_metric = (
        metrics_df.dropna(subset=["metrics.accuracy"])
        .sort_values('startTimeStampMs')[-1:]["metrics.accuracy"]
    )
else:
    latest_aggregate_metric = pd.Series(dtype=float)

if latest_aggregate_metric.empty:
    # Nothing to compare against yet; do not trigger a retrain on missing data.
    print("no accuracy metrics recorded yet, skipping retrain check")
elif latest_aggregate_metric.to_list()[0] < accuracy_threshold:
    print("model is below threshold, retraining")
    cml.start_job(job_id, {})
    # TODO redeploy new model
else:
    print("model does not need to be retrained")
# Set the STORAGE environment variable
try:
    storage = os.environ["STORAGE"]
except KeyError:
    # STORAGE is not defined yet: look up the cloud storage location through
    # the CML API wrapper, register it with create_environment_variable, and
    # export it for the shell commands that follow. Only the missing-variable
    # case is handled here; a bare `except:` would also hide unrelated
    # failures such as KeyboardInterrupt.
    storage = cml.get_cloud_storage()
    storage_environment_params = {"STORAGE": storage}
    storage_environment = cml.create_environment_variable(storage_environment_params)
    os.environ["STORAGE"] = storage
# Create Job
# Notification entry for the current user with every notification type switched off.
job_notification = {
    "user_id": user_obj["id"],
    "user": user_obj,
    "success": False, "failure": False, "timeout": False, "stopped": False
}

# Definition of the manually triggered training job ("include_logs" is listed once).
create_jobs_params = {
    "name": "Train Model",
    "type": "manual",
    "script": "4_train_models.py",
    "timezone": "America/Los_Angeles",
    "environment": {},
    "kernel": "python3",
    "cpu": 1,
    "memory": 2,
    "nvidia_gpu": 0,
    "include_logs": True,
    "notifications": [job_notification],
    "recipients": {},
    "attachments": [],
    "report_attachments": [],
    "success_recipients": [],
    "failure_recipients": [],
    "timeout_recipients": [],
    "stopped_recipients": []
}

# When ML_RUNTIME_EDITION is set, select the runtime and its addons and
# blank out the kernel.
if os.getenv("ML_RUNTIME_EDITION") is not None:
    create_jobs_params.update(
        runtime_id=runtime_id,
        addons=[addon_val - 1, addon_val],
        kernel="",
    )

new_job = cml.create_job(create_jobs_params)
new_job_id = new_job["id"]
print("Created new job with jobid", new_job_id)
# Create Model
# Example request shown on the model's page and used as its default test input.
example_model_input = {
    "StreamingTV": "No",
    "MonthlyCharges": 70.35,
    "PhoneService": "No",
    "PaperlessBilling": "No",
    "Partner": "No",
    "OnlineBackup": "No",
    "gender": "Female",
    "Contract": "Month-to-month",
    "TotalCharges": 1397.475,
    "StreamingMovies": "No",
    "DeviceProtection": "No",
    "PaymentMethod": "Bank transfer (automatic)",
    "tenure": 29,
    "Dependents": "No",
    "OnlineSecurity": "No",
    "MultipleLines": "No",
    "InternetService": "DSL",
    "SeniorCitizen": "No",
    "TechSupport": "No",
}

# NOTE(review): lineage.yml above is keyed by "Model Explainer <run_time_suffix>"
# while the model is created with a fixed name - confirm the two are meant to differ.
create_model_params = {
    "projectId": project_id,
    "name": "Model Explainer 2",
    "description": "Explain a given model prediction",
    "visibility": "private",
    "enableAuth": False,
    "targetFilePath": "5_model_serve_explainer.py",
    "targetFunctionName": "explain",
    "engineImageId": default_engine_image_id,
    "kernel": "python3",
    "examples": [
        {"request": example_model_input, "response": {}},
    ],
    "cpuMillicores": 1000,
    "memoryMb": 2048,
    "nvidiaGPUs": 0,
    "replicationPolicy": {"type": "fixed", "numReplicas": 1},
    "environment": {},
}

# When ML_RUNTIME_EDITION is set, also pass the selected runtime.
if os.getenv("ML_RUNTIME_EDITION") is not None:
    create_model_params["runtimeId"] = runtime_id

new_model_details = cml.create_model(create_model_params)
access_key = new_model_details["accessKey"]  # todo check for bad response
model_id = new_model_details["id"]

print("New model created with access key", access_key)

# Disable model_authentication
cml.set_model_auth({"id": model_id, "enableAuth": False})

# Wait for the model to deploy: poll every 10 seconds until the latest
# deployment reports the 'deployed' status.
is_deployed = False
while not is_deployed:
    model = cml.get_model({
        "id": str(new_model_details["id"]),
        "latestModelDeployment": True,
        "latestModelBuild": True,
    })
    if model["latestModelDeployment"]["status"] == 'deployed':
        print("Model is deployed")
        break
    print("Deploying Model.....")
    time.sleep(10)
# ---------------------------------------------------------------------------
# Part 2 of the project build: patch the generated ids into the project files,
# create the application and wait for it, run the model-ops simulation, then
# create and start the "Check Model" job.
# Relies on names set up earlier in this script: `cml`, `os`, `time`,
# `subprocess`, `HTML`, `run_time_suffix`, `access_key`, `model_id`,
# `runtime_id`, `addon_val`, `user_obj` and `new_job_id`.
# ---------------------------------------------------------------------------

# Rewrite the `const accessKey ...` line of the single-view page with the new
# model's access key. The sed expression is a raw string so that "\s" is
# passed to sed unchanged rather than being read as a Python escape sequence.
subprocess.call([
    "sed", "-i",
    r's/const\saccessKey.*/const accessKey = "' + access_key + '";/',
    "/home/cdsw/flask/single_view.html",
])

# Change the model_id value in the 7a_ml_ops_simulation.py,
# 7b_ml_ops_visual.py and 8_check_model.py files. str() keeps the
# concatenation valid whether the API returned the id as a string or a number.
model_id_sed_expr = 's/model_id =.*/model_id = "' + str(model_id) + '"/'
for script_path in (
    "/home/cdsw/7a_ml_ops_simulation.py",
    "/home/cdsw/7b_ml_ops_visual.py",
    "/home/cdsw/8_check_model.py",
):
    subprocess.call(["sed", "-i", model_id_sed_expr, script_path])


# Create Application
create_application_params = {
    "name": "Explainer App",
    "subdomain": run_time_suffix,
    "description": "Explainer web application",
    "type": "manual",
    "script": "6_application.py",
    "environment": {},
    "kernel": "python3",
    "cpu": 1,
    "memory": 2,
    "nvidia_gpu": 0,
}

# Only switch to runtime id + addons when ML_RUNTIME_EDITION is actually set.
# os.getenv returns None for an unset variable, so the previous `!= ""` test
# was true in that case too and always applied the runtime settings.
if os.getenv("ML_RUNTIME_EDITION") is not None:
    create_application_params["runtime_id"] = runtime_id
    create_application_params["addons"] = [addon_val - 1, addon_val]
    create_application_params["kernel"] = ""

new_application_details = cml.create_application(create_application_params)
application_url = new_application_details["url"]
application_id = new_application_details["id"]

# print("Application may need a few minutes to finish deploying. Open link below in about a minute ..")
print("Application created, deploying at ", application_url)

# Wait for the application to deploy: poll every 10 seconds until its status
# is 'running'. There is no timeout and no handling of other statuses.
while True:
    app = cml.get_application(str(application_id), {})
    if app["status"] == 'running':
        print("Application is deployed")
        break
    print("Deploying Application.....")
    time.sleep(10)

# Link to the running application. The format string previously had no
# placeholder, so the URL passed to .format() was silently dropped.
HTML("<a href='{}'>Open Application UI</a>".format(application_url))

# This will run the model operations section that makes calls to the model to track
# metrics and track metric aggregations
with open("7a_ml_ops_simulation.py") as ops_script:
    exec(ops_script.read())

# Change the job_id value in the 8_check_model.py file (this is still the id of
# the job created earlier in the script; new_job_id is reassigned below).
subprocess.call([
    "sed", "-i",
    's/job_id =.*/job_id = "' + str(new_job_id) + '"/',
    "/home/cdsw/8_check_model.py",
])

# Create the check model Job
# ("include_logs" used to appear twice in this literal; the duplicate key was
# redundant and has been removed.)
create_jobs_params = {
    "name": "Check Model",
    "type": "manual",
    "script": "8_check_model.py",
    "timezone": "America/Los_Angeles",
    "environment": {},
    "kernel": "python3",
    "cpu": 1,
    "memory": 2,
    "nvidia_gpu": 0,
    "include_logs": True,
    "notifications": [
        {
            "user_id": user_obj["id"],
            "user": user_obj,
            "success": False,
            "failure": False,
            "timeout": False,
            "stopped": False,
        }
    ],
    "recipients": {},
    "attachments": [],
    "report_attachments": [],
    "success_recipients": [],
    "failure_recipients": [],
    "timeout_recipients": [],
    "stopped_recipients": [],
}


if os.getenv("ML_RUNTIME_EDITION") is not None:
    create_jobs_params["runtime_id"] = runtime_id
    create_jobs_params["addons"] = [addon_val - 1, addon_val]
    create_jobs_params["kernel"] = ""

new_job = cml.create_job(create_jobs_params)
new_job_id = new_job["id"]
print("Created new job with jobid", new_job_id)

# Start a job
job_env_params = {}
start_job_params = {"environment": job_env_params}
job_id = new_job_id
job_status = cml.start_job(job_id, start_job_params)
293 | print("Job started") 294 | 295 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Churn Prediction Prototype 2 | This project is a Cloudera Machine Learning 3 | ([CML](https://www.cloudera.com/products/machine-learning.html)) **Applied Machine Learning 4 | Project Prototype**. It has all the code and data needed to deploy an end-to-end machine 5 | learning project in a running CML instance. 6 | 7 | ## Project Overview 8 | This project builds the telco churn with model interpretability project discussed in more 9 | detail [this blog post](https://blog.cloudera.com/visual-model-interpretability-for-telco-churn-in-cloudera-data-science-workbench/). 10 | The initial idea and code comes from the FFL Interpretability report which is now freely 11 | available and you can read the full report [here](https://ff06-2020.fastforwardlabs.com/) 12 | 13 | ![table_view](images/table_view.png) 14 | 15 | The goal is to build a classifier model using Logistic Regression to predict the churn 16 | probability for a group of customers from a telecoms company. On top that, the model 17 | can then be interpreted using [LIME](https://github.com/marcotcr/lime). Both the Logistic 18 | Regression and LIME models are then deployed using CML's real-time model deployment 19 | capability and finally a basic flask based web application is deployed that will let 20 | you interact with the real-time model to see which factors in the data have the most 21 | influence on the churn probability. 22 | 23 | By following the notebooks in this project, you will understand how to perform similar 24 | classification tasks on CML as well as how to use the platform's major features to your 25 | advantage. These features include **streamlined model experimentation**, 26 | **point-and-click model deployment**, and **ML app hosting**. 
27 | 28 | We will focus our attention on working within CML, using all it has to offer, while 29 | glossing over the details that are simply standard data science. 30 | We trust that you are familiar with typical data science workflows 31 | and do not need detailed explanations of the code. 32 | Notes that are *specific to CML* will be emphasized in **block quotes**. 33 | 34 | ### Initialize the Project 35 | There are a couple of steps needed at the start to configure the Project and Workspace 36 | settings so each step will run successfully. You **must** run the project bootstrap 37 | before running other steps. If you just want to launch the model interpretability 38 | application without going through each step manually, then you can also deploy the 39 | complete project. 40 | 41 | ***Project bootstrap*** 42 | 43 | Open the file `0_bootstrap.py` in a normal workbench python3 session. You only need a 44 | 1 vCPU / 2 GiB instance. Once the session is loaded, click **Run > Run All Lines**. 45 | This file will create an Environment Variable for the project called **STORAGE**, 46 | which is the root of the default file storage location for the Hive Metastore in the 47 | DataLake (e.g. `s3a://my-default-bucket`). It will also upload the data used in the 48 | project to `$STORAGE/datalake/data/churn/`. The original file comes as part of this 49 | git repo in the `raw` folder. 50 | 51 | ***Deploy the Complete Project*** 52 | 53 | If you just wish to build the project artifacts without going through each step manually, 54 | run the `9_build_project.py` file in a python3 session. Again a 1 vCPU / 2 GiB instance 55 | will be sufficient. 
This script will: 56 | * run the bootstrap 57 | * then create the Hive Table and import the data 58 | * deploy the model 59 | * update the application files to use this new model 60 | * deploy the application 61 | * run the model drift simulation 62 | Once the script has completed you will see the new model and application are now available 63 | in the project. 64 | 65 | ## Project Build 66 | If you want to go through each of the steps manually to build and understand how the project 67 | works, follow the steps below. There is a lot more detail and explanation/comments in each 68 | of the files/notebooks so it's worth looking into those. Follow the steps below and you 69 | will end up with a running application. 70 | 71 | ### 0 Bootstrap 72 | Just to reiterate: you must run the bootstrap for this project before anything else. 73 | So make sure you run step 0 first. 74 | 75 | Open the file `0_bootstrap.py` in a normal workbench python3 session. You only need a 76 | 1 CPU / 2 GB instance. Then **Run > Run All Lines** 77 | 78 | ### 1 Ingest Data 79 | This script will read in the data csv from the file uploaded to the object store (s3/adls) set up 80 | during the bootstrap and create a managed table in Hive. This is all done using Spark. 81 | 82 | Open `1_data_ingest.py` in a Workbench session: python3, 1 CPU, 2 GB. Run the file. 83 | 84 | ### 2 Explore Data 85 | This is a Jupyter Notebook that does some basic data exploration and visualisation. It 86 | is to show how this would be part of the data science workflow. 87 | 88 | ![data](images/data.png) 89 | 90 | Open a Jupyter Notebook session (rather than a work bench): python3, 1 CPU, 2 GB and 91 | open the `2_data_exploration.ipynb` file. 92 | 93 | At the top of the page click **Cells > Run All**. 94 | 95 | ### 3 Model Building 96 | This is also a Jupyter Notebook to show the process of selecting and building the model 97 | to predict churn. 
It also shows more details on how the LIME model is created and a bit 98 | more on what LIME is actually doing. 99 | 100 | Open a Jupyter Notebook session (rather than a work bench): python3, 1 CPU, 2 GB and 101 | open the `3_model_building.ipynb` file. 102 | 103 | At the top of the page click **Cells > Run All**. 104 | 105 | ### 4 Model Training 106 | A pre-trained model is saved with the repo and has been placed in the `models` directory. 107 | If you want to retrain the model, open the `4_train_models.py` file in a workbench session: 108 | python3 1 vCPU, 2 GiB and run the file. The newly trained model will be saved in the models directory 109 | named `telco_linear`. 110 | 111 | There are 2 other ways of running the model training process: 112 | 113 | ***1. Jobs*** 114 | 115 | The **[Jobs](https://docs.cloudera.com/machine-learning/cloud/jobs-pipelines/topics/ml-creating-a-job.html)** 116 | feature allows for adhoc, recurring and dependent jobs to run specific scripts. To run this model 117 | training process as a job, create a new job by going to the Project window and clicking _Jobs > 118 | New Job_ and entering the following settings: 119 | * **Name** : Train Model 120 | * **Script** : 4_train_models.py 121 | * **Arguments** : _Leave blank_ 122 | * **Kernel** : Python 3 123 | * **Schedule** : Manual 124 | * **Engine Profile** : 1 vCPU / 2 GiB 125 | The rest can be left as is. Once the job has been created, click **Run** to start a manual 126 | run for that job. 127 | 128 | ***2. Experiments*** 129 | 130 | The other option is running an **[Experiment](https://docs.cloudera.com/machine-learning/cloud/experiments/topics/ml-running-an-experiment.html)**. Experiments run immediately and are used for testing different parameters in a model training process. In this instance it would be used for hyperparameter optimisation. To run an experiment, from the Project window click Experiments > Run Experiment with the following settings. 
131 | * **Script** : 4_train_models.py 132 | * **Arguments** : 5 lbfgs 100 _(these are the cv, solver and max_iter parameters to be passed to the 133 | LogisticRegressionCV() function)_ 134 | * **Kernel** : Python 3 135 | * **Engine Profile** : 1 vCPU / 2 GiB 136 | 137 | Click **Start Run** and the experiment will be scheduled to build and run. Once the Run is 138 | completed you can view the outputs that are tracked with the experiment using the 139 | `cdsw.track_metrics` function. It's worth reading through the code to get a sense of what 140 | all is going on. 141 | 142 | 143 | ### 5 Serve Model 144 | The **[Models](https://docs.cloudera.com/machine-learning/cloud/models/topics/ml-creating-and-deploying-a-model.html)** 145 | feature is used to deploy a machine learning model into production for real-time prediction. To 146 | deploy the model trained in the previous step, from the Project page, click **Models > New 147 | Model** and create a new model with the following details: 148 | 149 | * **Name**: Explainer 150 | * **Description**: Explain customer churn prediction 151 | * **File**: 5_model_serve_explainer.py 152 | * **Function**: explain 153 | * **Input**: 154 | ``` 155 | { 156 | "StreamingTV": "No", 157 | "MonthlyCharges": 70.35, 158 | "PhoneService": "No", 159 | "PaperlessBilling": "No", 160 | "Partner": "No", 161 | "OnlineBackup": "No", 162 | "gender": "Female", 163 | "Contract": "Month-to-month", 164 | "TotalCharges": 1397.475, 165 | "StreamingMovies": "No", 166 | "DeviceProtection": "No", 167 | "PaymentMethod": "Bank transfer (automatic)", 168 | "tenure": 29, 169 | "Dependents": "No", 170 | "OnlineSecurity": "No", 171 | "MultipleLines": "No", 172 | "InternetService": "DSL", 173 | "SeniorCitizen": "No", 174 | "TechSupport": "No" 175 | } 176 | ``` 177 | * **Kernel**: Python 3 178 | * **Engine Profile**: 1vCPU / 2 GiB Memory 179 | 180 | Leave the rest unchanged. Click **Deploy Model** and the model will go through the build 181 | process and deploy a REST endpoint. 
Once the model is deployed, you can test it is working 182 | from the Model Overview page. 183 | 184 | _**Note: This is important**_ 185 | 186 | Once the model is deployed, you must disable the additional model authentication feature. In the model settings page, untick **Enable Authentication**. 187 | 188 | ![disable_auth](images/disable_auth.png) 189 | 190 | ### 6 Deploy Application 191 | The next step is to deploy the Flask application. The **[Applications](https://docs.cloudera.com/machine-learning/cloud/applications/topics/ml-applications.html)** feature is still quite new for CML. For this project it is used to deploy a web based application that interacts with the underlying model created in the previous step. 192 | 193 | _**Note: This next step is important**_ 194 | 195 | _In the deployed model from step 5, go to **Model > Settings** and make a note of (i.e. copy) the 196 | "Access Key". It will look something like this (e.g. mukd9sit7tacnfq2phhn3whc4unq1f38)_ 197 | 198 | _From the Project level click on "Open Workbench" (note you don't actually have to Launch a 199 | session) in order to edit a file. Select the flask/single_view.html file and paste the Access 200 | Key in at line 19._ 201 | 202 | ` const accessKey = "mp3ebluylxh4yn5h9xurh1r0430y76ca";` 203 | 204 | _Save the file (if it has not auto saved already) and go back to the Project._ 205 | 206 | Go to the **Applications** section and select "New Application" with the following: 207 | * **Name**: Churn Analysis App 208 | * **Subdomain**: churn-app _(note: this needs to be unique, so if you've done this before, 209 | pick a more random subdomain name)_ 210 | * **Script**: 6_application.py 211 | * **Kernel**: Python 3 212 | * **Engine Profile**: 1vCPU / 2 GiB Memory 213 | 214 | 215 | After the Application deploys, click on the blue-arrow next to the name. The initial view is a 216 | table of rows randomly selected from the dataset. 
This shows a global view of which features are 217 | most important for the predictor model. The reds show increased importance for predicting a 218 | customer that will churn and the blues for customers that will not. 219 | 220 | ![table_view](images/table_view.png) 221 | 222 | Clicking on any single row will show a "local" interpreted model for that particular data point 223 | instance. Here you can see how adjusting any one of the features will change the instance's 224 | churn prediction. 225 | 226 | 227 | ![single_view_1](images/single_view_1.png) 228 | 229 | Changing the InternetService to DSL lowers the probability of churn. *Note: this does not mean 230 | that changing the Internet Service to DSL causes the probability to go down, this is just what 231 | the model would predict for a customer with those data points* 232 | 233 | 234 | ![single_view_2](images/single_view_2.png) 235 | 236 | ### 7 Model Operations 237 | The final step is the model operations which consists of [Model Metrics](https://docs.cloudera.com/machine-learning/cloud/model-metrics/topics/ml-enabling-model-metrics.html) 238 | and [Model Governance](https://docs.cloudera.com/machine-learning/cloud/model-governance/topics/ml-enabling-model-governance.html) 239 | 240 | **Model Governance** is set up in the `0_bootstrap.py` script, which writes out the lineage.yml file at 241 | the start of the project. For the **Model Metrics** open a workbench session (1 vCPU / 2 GiB) and open the 242 | `7a_ml_ops_simulation.py` file. You need to set the `model_id` number from the model created in step 5 on line 243 | 113. The model number is on the model's main page. 244 | 245 | ![model_id](images/model_id.png) 246 | 247 | `model_id = "95"` 248 | 249 | From there, run the file. This goes through a process of simulating a model that drifts 250 | over 1000 calls to the model. The file contains comments with details of how this is done. 
251 | 252 | In the next step you can interact and display the model metrics. Open a workbench 253 | session (1 vCPU / 2 GiB) and open and run the `7b_ml_ops_visual.py` file. Again you 254 | need to set the `model_id` number from the model created in step 5 on line 53. 255 | The model number is on the model's main page. 256 | 257 | ![model_accuracy](images/model_accuracy.png) 258 | 259 | -------------------------------------------------------------------------------- /cdsw-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pip3 install -r requirements.txt -------------------------------------------------------------------------------- /churnexplainer.py: -------------------------------------------------------------------------------- 1 | import datetime, dill, os 2 | import pandas as pd 3 | 4 | from sklearn.pipeline import TransformerMixin 5 | from sklearn.preprocessing import LabelEncoder 6 | 7 | 8 | """ 9 | Explained model is a class that has attributes: 10 | 11 | - data, i.e. the features you get for a given dataset from load_dataset. This 12 | is a pandas dataframe that may include categorical variables. 13 | - labels, i.e. the boolean labels you get for a given dataset from 14 | load_dataset. 15 | - categoricalencoder, a fitted sklearn Transformer object that transforms 16 | the categorical columns in `data` to deterministic integer codes, yielding a 17 | plain numpy array often called `X` (leaves non-categorical columns 18 | untouched) 19 | - pipeline, a trained sklearn pipeline that takes `X` as input and predicts. 
class ExplainedModel():
    """A fitted churn pipeline bundled with its encoder, data and explainer.

    Attributes (passed to the constructor or restored from disk by ``load``):
      - data, labels, categoricalencoder, pipeline
      - explainer, an instantiated LIME explainer that yields an explanation
        when its ``explain_instance`` method is run on an example ``X``

    properties:
      - default_data
      - categorical_features
      - non_categorical_features
      - dtypes

    and methods for API (which works in terms of dictionaries):
      - cast_dct, converts values of dictionary to dtype corresponding to key
      - explain_dct, returns prediction and explanation for example dictionary

    and methods for users (who usually have dataframes):
      - predict_df, returns predictions for a df, i.e. runs it through
        categorical encoder and pipeline
      - explain_df, returns predictions and explanation for example dataframe
    """

    def __init__(self, model_name=None, labels=None, data=None,
                 categoricalencoder=None, pipeline=None, explainer=None,
                 data_dir=None, load=True):
        """Create a model from in-memory components or from a saved file.

        Args:
            model_name: name of the model; also names the directory and the
                ``.pkl`` file under ``<data_dir>/models``.
            labels, data, categoricalencoder, pipeline, explainer: the model
                components. If any of them is given the instance is treated
                as already loaded and nothing is read from disk.
            data_dir: base directory that contains the ``models`` folder.
            load: when true and no components were given, load the model
                from ``model_path`` immediately.

        Raises:
            ValueError: if a load is required but ``model_name`` or
                ``data_dir`` is missing (raised by ``load``).
        """
        # Always record the name: previously it was only set when loading
        # from disk, so building from components crashed a few lines later.
        self.model_name = model_name

        components = (data, labels, categoricalencoder, pipeline, explainer)
        if any(part is not None for part in components):
            # In-memory components win over whatever is stored on disk, so a
            # freshly trained model is not silently replaced by an old pickle.
            self.data = data
            self.labels = labels
            self.categoricalencoder = categoricalencoder
            self.pipeline = pipeline
            self.explainer = explainer
            self.is_loaded = True
        else:
            self.is_loaded = False

        if model_name is not None and data_dir is not None:
            self.model_dir = os.path.join(data_dir, 'models', model_name)
            self.model_path = os.path.join(self.model_dir,
                                           model_name + '.pkl')
        else:
            # No usable location; load()/save() report this explicitly.
            self.model_dir = None
            self.model_path = None

        # if asked to load and not yet loaded, load model!
        if load and not self.is_loaded:
            self.load()

    def load(self):
        """Restore the saved attributes from ``model_path`` (no-op if loaded).

        Raises:
            ValueError: if no ``model_path`` could be derived.
        """
        if self.is_loaded:
            return
        if self.model_path is None:
            raise ValueError(
                'Cannot load model: both model_name and data_dir are required')
        # NOTE(security): dill, like pickle, can execute arbitrary code while
        # deserialising. Only load model files from a trusted location.
        with open(self.model_path, 'rb') as f:
            self.__dict__.update(dill.load(f))
        self.is_loaded = True

    def save(self):
        """Serialise the model components to ``model_path`` with dill.

        Raises:
            ValueError: if no ``model_path`` could be derived.
        """
        if self.model_path is None:
            raise ValueError(
                'Cannot save model: both model_name and data_dir are required')
        dilldict = {
            'data': self.data,
            'labels': self.labels,
            'categoricalencoder': self.categoricalencoder,
            'pipeline': self.pipeline,
            'explainer': self.explainer
        }
        # The target directory may not exist yet on a first save.
        os.makedirs(self.model_dir, exist_ok=True)
        with open(self.model_path, 'wb') as f:
            dill.dump(dilldict, f)

    def predict_df(self, df):
        """Return the positive-class probability for every row of ``df``."""
        X = self.categoricalencoder.transform(df)
        return self.pipeline.predict_proba(X)[:, 1]

    def explain_df(self, df):
        """Return ``(probability, explanations)`` for the first row of ``df``.

        ``explanations`` maps feature name to the weight the explainer
        assigned to it for class 1.
        """
        X = self.categoricalencoder.transform(df)
        probability = self.pipeline.predict_proba(X)[0, 1]
        e = self.explainer.explain_instance(
            X[0], self.pipeline.predict_proba
        ).as_map()[1]
        explanations = {self.explainer.feature_names[c]: weight
                        for c, weight in e}
        return probability, explanations

    def explain_dct(self, dct):
        """Same as ``explain_df`` for a single example given as a dict."""
        return self.explain_df(pd.DataFrame([dct]))

    def cast_dct(self, dct):
        """Convert each value of ``dct`` to the dtype recorded for its key."""
        return {k: self.dtypes[k].type(v) for k, v in dct.items()}

    @property
    def dtypes(self):
        """Mapping of feature name to numpy dtype (computed once, cached).

        Categorical features report the dtype of their categories.
        """
        if not hasattr(self, '_dtypes'):
            d = self.data[self.non_categorical_features].dtypes.to_dict()
            d.update({c: self.data[c].cat.categories.dtype
                      for c in self.categorical_features})
            self._dtypes = d
        return self._dtypes

    @property
    def non_categorical_features(self):
        """Non-category columns of ``data``, minus the probability column."""
        return list(self.data.select_dtypes(exclude=['category']).columns
                    .drop(self.label_name))

    @property
    def categorical_features(self):
        """Columns of ``data`` that have the ``category`` dtype."""
        return list(self.data.select_dtypes(include=['category']).columns)

    @property
    def stats(self):
        """Summary statistics per non-categorical feature (cached)."""
        def describe(s):
            return {'median': s.median(),
                    'mean': s.mean(),
                    'min': s.min(),
                    'max': s.max(),
                    'std': s.std()}
        if not hasattr(self, '_stats'):
            self._stats = {c: describe(self.data[c])
                           for c in self.non_categorical_features}
        return self._stats

    @property
    def label_name(self):
        """Name of the predicted-probability column: ``'<label> probability'``."""
        return self.labels.name + ' probability'

    @property
    def categories(self):
        """Mapping of categorical feature name to its list of classes."""
        return {feature: list(self.categoricalencoder.classes_[feature])
                for feature in self.categorical_features}

    @property
    def default_data(self):
        """A default example as a dict (computed once, cached)."""
        # 0th class for categorical variables and median for continuous
        if not hasattr(self, '_default_data'):
            d = {}
            d.update({feature: self.categoricalencoder.classes_[feature][0]
                      for feature in self.categorical_features})
            d.update({feature: self.data[feature].median()
                      for feature in self.non_categorical_features})
            self._default_data = d
        return self._default_data


class CategoricalEncoder(TransformerMixin):
    """Label-encode the categorical columns of a DataFrame.

    ``fit`` records the column order and fits one ``LabelEncoder`` per
    category-dtype column; ``transform`` returns a float array with those
    columns replaced by their integer codes.
    """

    def fit(self, X, y=None, *args, **kwargs):
        """Learn the column layout and the classes of each categorical column."""
        self.columns_ = X.columns
        # isinstance check instead of the deprecated is_categorical_dtype.
        self.cat_columns_ix_ = {
            c: i for i, c in enumerate(X.columns)
            if isinstance(X[c].dtype, pd.api.types.CategoricalDtype)}
        self.cat_columns_ = pd.Index(list(self.cat_columns_ix_.keys()))
        self.non_cat_columns_ = X.columns.drop(self.cat_columns_)
        self.les_ = {c: LabelEncoder().fit(X[c])
                     for c in self.cat_columns_}
        self.classes_ = {c: list(self.les_[c].classes_)
                         for c in self.cat_columns_}
        return self

    def transform(self, X, y=None, *args, **kwargs):
        """Return ``X`` (columns in fitted order) as a float array."""
        data = X[self.columns_].values
        # Overwrite each categorical column with its label codes.
        for c, i in self.cat_columns_ix_.items():
            data[:, i] = self.les_[c].transform(data[:, i])
        return data.astype(float)

    def __repr__(self):
        return('{}()'.format(self.__class__.__name__))
float:right; 70 | padding: 0 10px 0 10px; 71 | } 72 | 73 | .input_div { 74 | float:left; 75 | padding: 5px 10px 0 10px; 76 | } 77 | 78 | .inner_div { 79 | float:left; 80 | padding: 5px 5px 5px 5px; 81 | margin: 0 2px 0 2px; 82 | } 83 | 84 | 85 | div { 86 | /* float:left;*/ 87 | padding:10px 10px 0 10px; 88 | } 89 | 90 | .main_div { 91 | clear:both; 92 | } 93 | 94 | #pred_value { 95 | float:left; 96 | } 97 | 98 | #loader { 99 | /*background-color: #fff;*/ 100 | /*opacity: 0.9;*/ 101 | position: absolute; 102 | padding: 100px 10px 10px 300px; 103 | width: 400px; 104 | height: 500px; 105 | } 106 | 107 | 108 | /* I got the button CSS from http://www.lab.tommasoraspo.com/simple-web-buttoms/ */ 109 | 110 | .button { 111 | float: left; 112 | cursor: pointer; 113 | margin: 0 5px; 114 | text-align: center; 115 | /*display: inline-block;*/ 116 | text-decoration: none; 117 | font: bold 12px/12px HelveticaNeue, Arial; 118 | padding: 8px 11px; 119 | color: #555; 120 | border: 1px solid #dedede; 121 | -webkit-border-radius: 3px; 122 | -moz-border-radius: 3px; 123 | border-radius: 3px; 124 | } 125 | .button.white { 126 | background: #f5f5f5; 127 | filter: progid: DXImageTransform.Microsoft.gradient(startColorstr='#f9f9f9', endColorstr='#f0f0f0'); 128 | /* IE */ 129 | background: -webkit-gradient(linear, left top, left bottom, from(#f9f9f9), to(#f0f0f0)); 130 | /* WebKit */ 131 | background: -moz-linear-gradient(top, #f9f9f9, #f0f0f0); 132 | border-color: #dedede #d8d8d8 #d3d3d3; 133 | color: #555; 134 | text-shadow: 0 1px 0 #fff; 135 | -webkit-box-shadow: 0 1px 1px #eaeaea, inset 0 1px 0 #fbfbfb; 136 | -moz-box-shadow: 0 1px 1px #eaeaea, inset 0 1px 0 #fbfbfb; 137 | box-shadow: 0 1px 1px #eaeaea, inset 0 1px 0 #fbfbfb; 138 | } 139 | .button.white:hover { 140 | background: #f4f4f4; 141 | filter: progid: DXImageTransform.Microsoft.gradient(startColorstr='#efefef', endColorstr='#f8f8f8'); 142 | /* IE */ 143 | background: -webkit-gradient(linear, left top, left bottom, from(#efefef), 
to(#f8f8f8)); 144 | /* WebKit */ 145 | background: -moz-linear-gradient(top, #efefef, #f8f8f8); 146 | border-color: #c7c7c7 #c3c3c3 #bebebe; 147 | text-shadow: 0 1px 0 #fdfdfd; 148 | -webkit-box-shadow: 0 1px 1px #ebebeb, inset 0 1px 0 #f3f3f3; 149 | -moz-box-shadow: 0 1px 1px #ebebeb, inset 0 1px 0 #f3f3f3; 150 | box-shadow: 0 1px 1px #ebebeb, inset 0 1px 0 #f3f3f3; 151 | -------------------------------------------------------------------------------- /flask/churn_vis.js: -------------------------------------------------------------------------------- 1 | //This is the javascript code that builds and updates the bar graph 2 | 3 | window.updater = function(data) { 4 | //d3.select("#svg_container").text(data); 5 | my_data = data; 6 | console.log(data); 7 | 8 | // var svg_margin = { top: 20, right: 20, bottom: 20, left: 40 }; 9 | // var svg_width = d3.select("body").node().getBoundingClientRect().width - svg_margin.left - svg_margin.right; 10 | // var svg_height = 300 - svg_margin.top - svg_margin.bottom; 11 | // 12 | // var y = d3.scaleLinear() 13 | // .domain([0, d3.max(data, function(d) { return d.petal_length; })]) 14 | // .range([svg_height, 0]); 15 | // 16 | // var x = d3.scaleBand() 17 | // .domain(d3.range(data.length)) 18 | // .range([0, svg_width]) 19 | // .padding(0.1); 20 | // 21 | // var species_list = d3.map(data, function (d) { return d.species;}).keys(); 22 | // 23 | // if (d3.select("#svg_container").select("svg").empty()) { 24 | // 25 | // 26 | // svg = d3.select("#svg_container").append("svg") 27 | // .attr("width", svg_width + svg_margin.left + svg_margin.right) 28 | // .attr("height", svg_height + svg_margin.top + svg_margin.bottom) 29 | // .append("g") 30 | // .attr("transform", 31 | // "translate(" + svg_margin.left + "," + svg_margin.top + ")"); 32 | // 33 | // svg.append("g") 34 | // .attr("transform", "translate(0," + svg_height + ")") 35 | // .attr("class", "x axis") 36 | // .call(d3.axisBottom(x)); 37 | // 38 | // // add the y Axis 39 | // 
svg.append("g") 40 | // .attr("class", "y axis") 41 | // .call(d3.axisLeft(y)); 42 | // } else { 43 | // svg.attr("width", svg_width + svg_margin.left + svg_margin.right) 44 | // svg.selectAll("g.y.axis") 45 | // .call(d3.axisLeft(y)); 46 | // 47 | // svg.selectAll("g.x.axis") 48 | // .call(d3.axisBottom(x)); 49 | // } 50 | // 51 | // // DATA JOIN 52 | // // Join new data with old elements, if any. 53 | // 54 | // var bars = svg.selectAll(".bar") 55 | // .data(data); 56 | // 57 | // // UPDATE 58 | // // Update old elements as needed. 59 | // 60 | // bars 61 | // .attr("style",function(d) { return "fill:" + d3.schemeCategory10[species_list.indexOf(d.species)];}) 62 | // .attr("x", function(d, i) { return x(i); }) 63 | // .attr("width", x.bandwidth()) 64 | // .transition() 65 | // .duration(100) 66 | // .attr("y", function(d) { return y(d.petal_length); }) 67 | // .attr("height", function(d) { return svg_height - y(d.petal_length); }); 68 | // 69 | // // ENTER + UPDATE 70 | // // After merging the entered elements with the update selection, 71 | // // apply operations to both. 72 | // 73 | // bars.enter().append("rect") 74 | // .attr("class", "bar") 75 | // .attr("style",function(d) { return "fill:" + d3.schemeCategory10[species_list.indexOf(d.species)];}) 76 | // .attr("x", function(d, i) { return x(i); }) 77 | // .attr("width", x.bandwidth()) 78 | // .attr("y", function(d) { return y(d.petal_length); }) 79 | // .attr("height", function(d) { return svg_height - y(d.petal_length); }) 80 | // .merge(bars); 81 | // 82 | // // EXIT 83 | // // Remove old elements as needed. 
84 | // 85 | // bars.exit().remove(); 86 | 87 | }; -------------------------------------------------------------------------------- /flask/env_vars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/flask/env_vars.png -------------------------------------------------------------------------------- /flask/single_view.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |

Single Prediction View

12 |
13 |
Churn Probability
14 |
15 | 16 |
17 |
18 | 185 | 186 | -------------------------------------------------------------------------------- /flask/table_view.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |

Refractor

12 |
13 | Loading Sample Data... 14 |
15 | 16 |
17 | 122 | 123 | -------------------------------------------------------------------------------- /images/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/images/data.png -------------------------------------------------------------------------------- /images/disable_auth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/images/disable_auth.png -------------------------------------------------------------------------------- /images/model_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/images/model_accuracy.png -------------------------------------------------------------------------------- /images/model_id.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/images/model_id.png -------------------------------------------------------------------------------- /images/single_view_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/images/single_view_1.png -------------------------------------------------------------------------------- /images/single_view_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/images/single_view_2.png 
-------------------------------------------------------------------------------- /images/table_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/images/table_view.png -------------------------------------------------------------------------------- /lineage.yml: -------------------------------------------------------------------------------- 1 | "Model Explainer 29072021101927": 2 | hive_table_qualified_names: # this is a predefined key to link to training data 3 | - "default.telco_churn@cm" # the qualifiedName of the hive_table object representing 4 | metadata: # this is a predefined key for additional metadata 5 | query: "select * from historical_data" # suggested use case: query used to extract training data 6 | training_file: "4_train_models.py" # suggested use case: training file used 7 | -------------------------------------------------------------------------------- /models/telco_linear/telco_linear.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/models/telco_linear/telco_linear.pkl -------------------------------------------------------------------------------- /raw/telco-data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastforwardlabs/cml_churn_demo_mlops/0a189a7b250f682d8db14205878510591bcad529/raw/telco-data/_SUCCESS -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/fastforwardlabs/cmlbootstrap#egg=cmlbootstrap 2 | seaborn==0.9.0 3 | dill==0.3.1.1 4 | lime==0.1.1.36 5 | scikit-learn==0.21.3 6 | xlrd==1.2.0 7 | 
pandas==0.25.1 8 | numpy==1.17.2 9 | flask==1.1.2 10 | --------------------------------------------------------------------------------