├── 01 - Data Preparation: Feature Engineering and AutoML.py ├── 02 - Data Exploration: Generated by AutoML.py ├── 03 - Best Trial Run: XGBoost training improvements.py ├── End-to-End ML Workshop DB SQL Dashboard.pdf ├── End-to-End ML Workshop.dbdash └── README.md /01 - Data Preparation: Feature Engineering and AutoML.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## End to End ML on Databricks 4 | # MAGIC 5 | # MAGIC 6 | 7 | # COMMAND ---------- 8 | 9 | from pyspark.sql.functions import * 10 | from pyspark.sql.types import * 11 | 12 | import pyspark.pandas as pd 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC For this workshop we will use a publicly available adult dataset example found in `/databricks-datasets/`. We could also use Python or Spark to read data from databases or cloud storage. 19 | 20 | # COMMAND ---------- 21 | 22 | # We can ls the directory and see what files we have available 23 | dbutils.fs.ls("/databricks-datasets/adult") 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md 28 | # MAGIC #### Path configs 29 | 30 | # COMMAND ---------- 31 | 32 | # Set config for database name, file paths, and table names 33 | database_name = 'ml_income_workshop' 34 | user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user') 35 | 36 | # Paths for various Delta tables 37 | raw_tbl_path = '/home/{}/ml_income_workshop/raw/'.format(user) 38 | clean_tbl_path = '/home/{}/ml_income_workshop/clean/'.format(user) 39 | inference_tbl_path = '/home/{}/ml_income_workshop/inference/'.format(user) 40 | 41 | raw_tbl_name = 'raw_income' 42 | clean_tbl_name = 'clean_income' 43 | inference_tbl_name = 'inference_income' 44 | 45 | # Delete the old database and tables if needed 46 | spark.sql('DROP DATABASE IF EXISTS {} CASCADE'.format(database_name)) 47 | 48 | # Create database to house tables 49 | spark.sql('CREATE DATABASE {}'.format(database_name)) 50 | spark.sql('USE {}'.format(database_name)) 51 | 52 | # Drop any old delta lake files if needed (e.g. 
re-running this notebook with the same path variables) 53 | dbutils.fs.rm(raw_tbl_path, recurse = True) 54 | dbutils.fs.rm(clean_tbl_path, recurse = True) 55 | dbutils.fs.rm(inference_tbl_path, recurse = True) 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %md 60 | # MAGIC ### Exploratory Data Analysis 61 | 62 | # COMMAND ---------- 63 | 64 | # MAGIC %md #### Reading in Data 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %sh cat /dbfs/databricks-datasets/adult/README.md 69 | 70 | # COMMAND ---------- 71 | 72 | census_income_path = "/databricks-datasets/adult/adult.data" 73 | 74 | # defining the schema for the census income dataset 75 | census_income_schema = StructType([ \ 76 | StructField("age", IntegerType(), True), \ 77 | StructField("workclass", StringType(), True), \ 78 | StructField("fnlwgt", DoubleType(), True), \ 79 | StructField("education", StringType(), True), \ 80 | StructField("education_num", DoubleType(), True), \ 81 | StructField("marital_status", StringType(), True), \ 82 | StructField("occupation", StringType(), True), \ 83 | StructField("relationship", StringType(), True), \ 84 | StructField("race", StringType(), True), \ 85 | StructField("sex", StringType(), True), \ 86 | StructField("capital_gain", DoubleType(), True), \ 87 | StructField("capital_loss", DoubleType(), True), \ 88 | StructField("hours_per_week", DoubleType(), True), \ 89 | StructField("native_country", StringType(), True), \ 90 | StructField("income", StringType(), True), 91 | ]) 92 | raw_df = spark.read.schema(census_income_schema).options(header='false', delimiter=',').csv(census_income_path) 93 | 94 | display(raw_df) 95 | 96 | # COMMAND ---------- 97 | 98 | raw_df.write.format('delta').mode('overwrite').save(raw_tbl_path) 99 | 100 | # COMMAND ---------- 101 | 102 | display(raw_df) 103 | 104 | # COMMAND ---------- 105 | 106 | # MAGIC %md By clicking on the `Data Profile` tab above we can easily generate descriptive statistics on our dataset. We can also produce the same summary programmatically using `dbutils`. 107 | 108 | # COMMAND ---------- 109 | 110 | dbutils.data.summarize(raw_df) 111 | 112 | # COMMAND ---------- 113 | 114 | # Create table to query with SQL 115 | spark.sql(''' 116 | CREATE TABLE {0} 117 | USING DELTA 118 | LOCATION '{1}' 119 | '''.format(raw_tbl_name, raw_tbl_path) 120 | ) 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md 125 | # MAGIC Now let's query our table with SQL!
126 | 127 | # COMMAND ---------- 128 | 129 | # MAGIC %sql 130 | # MAGIC -- Occupations with the highest average age 131 | # MAGIC SELECT occupation, ROUND(AVG(age)) AS avg_age 132 | # MAGIC FROM raw_income 133 | # MAGIC GROUP BY occupation 134 | # MAGIC ORDER BY avg_age DESC 135 | 136 | # COMMAND ---------- 137 | 138 | # MAGIC %md ### Data Visualization 139 | 140 | # COMMAND ---------- 141 | 142 | # MAGIC %md We can also display the results as a table using the built-in visualizations 143 | 144 | # COMMAND ---------- 145 | 146 | # MAGIC %sql 147 | # MAGIC -- Occupations with the highest average age 148 | # MAGIC SELECT occupation, ROUND(AVG(age)) AS avg_age 149 | # MAGIC FROM raw_income 150 | # MAGIC GROUP BY occupation 151 | # MAGIC ORDER BY avg_age DESC 152 | 153 | # COMMAND ---------- 154 | 155 | # MAGIC %md 156 | # MAGIC You can also leverage [`Pandas on Spark`](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) to use your favorite pandas and matplotlib functions for data wrangling and visualization but with the scale and optimizations of Spark ([announcement blog](https://databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html), [docs](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html), [quickstart](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html)) 157 | # MAGIC 158 | # MAGIC 159 | 160 | # COMMAND ---------- 161 | 162 | # convert our raw spark distributed dataframe into a distributed pandas dataframe 163 | raw_df_pdf = raw_df.to_pandas_on_spark() 164 | 165 | # perform the same aggregation we did in SQL using familiar Pandas syntax 166 | avg_age_by_occupation = raw_df_pdf.groupby("occupation").mean().round().reset_index()[["occupation", "age"]].sort_values("age", ascending = False) 167 | 168 | # re-create the same plot using familiar pandas and matplotlib syntax distributed with Spark 169 | avg_age_by_occupation.plot(kind = "bar", x = "occupation", y = "age") 170 | 171 | # COMMAND ---------- 172 | 173 | # MAGIC %md 174 | # MAGIC #### Data Wrangling 175 | # MAGIC We can leverage [`Pandas on Spark`](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) to clean and wrangle our data at scale. We are going to drop missing values and clean up category values. 
176 | 177 | # COMMAND ---------- 178 | 179 | # Drop missing values 180 | clean_pdf = raw_df_pdf.dropna(axis = 0, how = 'any') 181 | 182 | def category_cleaner(value): 183 | return value.strip().lower().replace('.', '').replace(',', '').replace(' ', '-') 184 | 185 | categorical_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income'] 186 | 187 | for column in categorical_cols: 188 | clean_pdf[column] = clean_pdf[column].apply(lambda value: category_cleaner(value)) 189 | 190 | # COMMAND ---------- 191 | 192 | # MAGIC %md 193 | # MAGIC #### Feature Engineering 194 | # MAGIC 195 | # MAGIC * Bin the age into decades (min age 17 to max age 90) 196 | # MAGIC * take the log of features with skewed distributions (capital_gain, capital_loss) 197 | 198 | # COMMAND ---------- 199 | 200 | import numpy as np 201 | 202 | # bin age into decades 203 | def bin_age(value): 204 | if value <= 19: 205 | return "teens" 206 | elif value in range(20,30): 207 | return "20s" 208 | elif value in range(30,40): 209 | return "30s" 210 | elif value in range(40,50): 211 | return "40s" 212 | elif value in range(50,60): 213 | return "50s" 214 | elif value in range(60,100): 215 | return "60+" 216 | else: 217 | return "other" 218 | 219 | clean_pdf['age_by_decade'] = clean_pdf['age'].apply(bin_age) 220 | 221 | # Take the log of features with skewed distributions 222 | def log_transform(value): 223 | return float(np.log(value + 1)) # for 0 values 224 | 225 | clean_pdf['log_capital_gain'] = clean_pdf['capital_gain'].apply(log_transform) 226 | clean_pdf['log_capital_loss'] = clean_pdf['capital_loss'].apply(log_transform) 227 | 228 | # Drop columns 229 | clean_pdf = clean_pdf.drop(['age', 'capital_gain', 'capital_loss'], axis = 1) 230 | 231 | display(clean_pdf.head(3)) 232 | 233 | # COMMAND ---------- 234 | 235 | # we are now going to save this cleaned table as a delta file in our cloud storage and create a metadata table on top of it 236 | clean_pdf.to_delta(clean_tbl_path) 237 | 238 | spark.sql(''' 239 | CREATE TABLE {0} 240 | USING DELTA 241 | LOCATION '{1}' 242 | '''.format(clean_tbl_name, clean_tbl_path) 243 | ) 244 | 245 | # COMMAND ---------- 246 | 247 | # MAGIC %sql 248 | # MAGIC SELECT * 249 | # MAGIC FROM clean_income 250 | # MAGIC LIMIT 3 251 | 252 | # COMMAND ---------- 253 | 254 | # MAGIC %md 255 | # MAGIC Let's package the all the data processing into a function for later use 256 | 257 | # COMMAND ---------- 258 | 259 | def process_census_data(dataframe): 260 | """ 261 | Function to wrap specific processing for census data tables 262 | Input and output is a pyspark.pandas dataframe 263 | """ 264 | categorical_cols = ['workclass', 'education', 'marital_status', 265 | 'occupation', 'relationship', 'race', 'sex', 266 | 'native_country', 'income'] 267 | 268 | # categorical column cleansing 269 | for column in categorical_cols: 270 | dataframe[column] = dataframe[column].apply(lambda value: category_cleaner(value)) 271 | 272 | # bin age 273 | dataframe['age_by_decade'] = dataframe['age'].apply(bin_age) 274 | 275 | # log transform 276 | dataframe['log_capital_gain'] = dataframe['capital_gain'].apply(log_transform) 277 | dataframe['log_capital_loss'] = dataframe['capital_loss'].apply(log_transform) 278 | 279 | # Drop columns 280 | dataframe = dataframe.drop(['age', 'capital_gain', 'capital_loss'], axis = 1) 281 | 282 | return dataframe 283 | 284 | # COMMAND ---------- 285 | 286 | # MAGIC %md 287 | # MAGIC Last but not least, let's create the same 
transformations to our inference dataset for testing later 288 | 289 | # COMMAND ---------- 290 | 291 | census_income_test_path = "/databricks-datasets/adult/adult.test" 292 | 293 | inference_pdf = (spark.read.schema(census_income_schema) 294 | .options(header='false', delimiter=',') 295 | .csv(census_income_test_path) 296 | .to_pandas_on_spark() 297 | ) 298 | 299 | inference_pdf = process_census_data(inference_pdf) 300 | inference_pdf.to_delta(inference_tbl_path) 301 | 302 | spark.sql(''' 303 | CREATE TABLE {0} 304 | USING DELTA 305 | LOCATION '{1}' 306 | '''.format(inference_tbl_name, inference_tbl_path) 307 | ) 308 | 309 | # COMMAND ---------- 310 | 311 | # MAGIC %md 312 | # MAGIC Great! Our dataset is ready for us to use with AutoML to train a benchmark model. 313 | 314 | # COMMAND ---------- 315 | 316 | # MAGIC %md 317 | # MAGIC ### AutoML 318 | # MAGIC 319 | # MAGIC 320 | 321 | # COMMAND ---------- 322 | 323 | # MAGIC %md 324 | # MAGIC Now let's create a glassbox AutoML model to help us automatically test different models and parameters and reduce time manually testing and tweaking ML models. We can run AutoML via the `databricks.automl` library or via the UI by creating a new [mlflow automl experiment](#mlflow/experiments). 325 | # MAGIC 326 | # MAGIC Here, we'll run AutoML in the next cell. 327 | 328 | # COMMAND ---------- 329 | 330 | import databricks.automl 331 | 332 | summary = databricks.automl.classify(clean_pdf, target_col='income', primary_metric="f1", data_dir='dbfs:/automl/ml_income_workshop', timeout_minutes=5) 333 | 334 | # COMMAND ---------- 335 | 336 | # MAGIC %md 337 | # MAGIC Check out the screenshots below that walk through this process via the UI. 338 | # MAGIC 339 | # MAGIC 340 | -------------------------------------------------------------------------------- /02 - Data Exploration: Generated by AutoML.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Data Exploration 4 | # MAGIC This notebook performs exploratory data analysis on the dataset. 5 | # MAGIC To expand on the analysis, attach this notebook to the **ml-workshop-2.0** cluster, 6 | # MAGIC edit [the options of pandas-profiling](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/advanced_usage.html), and rerun it. 7 | # MAGIC - Explore completed trials in the [MLflow experiment](#mlflow/experiments/2984496825650690/s?orderByKey=metrics.%60val_f1_score%60&orderByAsc=false) 8 | # MAGIC - Navigate to the parent notebook [here](#notebook/2984496825649216) (If you launched the AutoML experiment using the Experiments UI, this link isn't very useful.) 
9 | # MAGIC 10 | # MAGIC Runtime Version: _10.2.x-cpu-ml-scala2.12_ 11 | 12 | # COMMAND ---------- 13 | 14 | # Load the data into a pandas DataFrame 15 | import pandas as pd 16 | import databricks.automl_runtime 17 | 18 | df = pd.read_parquet("file:///dbfs/automl/ml_income_workshop/22-02-18-07:07/62666dbf") 19 | 20 | target_col = "income" 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC ## Profiling Results 26 | 27 | # COMMAND ---------- 28 | 29 | from pandas_profiling import ProfileReport 30 | df_profile = ProfileReport(df, title="Profiling Report", progress_bar=False, infer_dtypes=False) 31 | profile_html = df_profile.to_html() 32 | 33 | displayHTML(profile_html) 34 | -------------------------------------------------------------------------------- /03 - Best Trial Run: XGBoost training improvements.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # XGBoost training 4 | # MAGIC This is an auto-generated notebook. To reproduce these results, attach this notebook to the **ml-workshop-2.0** cluster and rerun it. 5 | # MAGIC - Compare trials in the [MLflow experiment](#mlflow/experiments/4212189882465177/s?orderByKey=metrics.%60val_f1_score%60&orderByAsc=false) 6 | # MAGIC - Navigate to the parent notebook [here](#notebook/4212189882465089) (If you launched the AutoML experiment using the Experiments UI, this link isn't very useful.) 7 | # MAGIC - Clone this notebook into your project folder by selecting **File > Clone** in the notebook toolbar. 8 | # MAGIC 9 | # MAGIC Runtime Version: _10.2.x-cpu-ml-scala2.12_ 10 | 11 | # COMMAND ---------- 12 | 13 | # MAGIC %md 14 | # MAGIC **Be sure to update the mlflow experiment path appropriately!** 15 | 16 | # COMMAND ---------- 17 | 18 | import mlflow 19 | import databricks.automl_runtime 20 | 21 | from pyspark.sql.functions import * 22 | 23 | # Use MLflow to track experiments 24 | mlflow.set_experiment("/Shared/ML-Workshop-2.0/End-to-End-ML/ml-workshop-income-classifier") 25 | 26 | target_col = "income" 27 | database_name = 'ml_income_workshop' 28 | user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user') 29 | 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC ## Load Data 35 | 36 | # COMMAND ---------- 37 | 38 | # Load input data into a pandas DataFrame. 39 | import pandas as pd 40 | df_loaded = spark.table("ml_income_workshop.clean_income").toPandas() 41 | 42 | ## Data to be Scored 43 | inference_data = spark.read.table('ml_income_workshop.inference_income').toPandas() 44 | 45 | # Preview data 46 | df_loaded.head(5) 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %md 51 | # MAGIC ### Select supported columns 52 | # MAGIC Select only the columns that are supported. This allows us to train a model that can predict on a dataset that has extra columns that are not used in training. 53 | # MAGIC `[]` are dropped in the pipelines. See the Alerts tab of the AutoML Experiment page for details on why these columns are dropped. 
54 | 55 | # COMMAND ---------- 56 | 57 | from databricks.automl_runtime.sklearn.column_selector import ColumnSelector 58 | supported_cols = ["age_by_decade", "fnlwgt", "education", "occupation", "hours_per_week", "relationship", "workclass", "log_capital_gain", "log_capital_loss", "native_country"] 59 | col_selector = ColumnSelector(supported_cols) 60 | 61 | # COMMAND ---------- 62 | 63 | # MAGIC %md 64 | # MAGIC ## Preprocessors 65 | 66 | # COMMAND ---------- 67 | 68 | transformers = [] 69 | 70 | # COMMAND ---------- 71 | 72 | # MAGIC %md 73 | # MAGIC ### Incorporating insights from Data Exploration Notebook 74 | # MAGIC 75 | # MAGIC According to the data exploration notebook, we have a few features with high correlation. We can try dropping some of these features to reduce redundant information. We'll also drop features with little correlation to the income column. 76 | # MAGIC 77 | # MAGIC In this case, we'll drop the `workclass`, `sex`, `race`, `education_num`, and `marital_status` columns 78 | 79 | # COMMAND ---------- 80 | 81 | df_loaded.drop(["workclass", "sex", "race", "education_num", "marital_status"], axis=1) 82 | 83 | # COMMAND ---------- 84 | 85 | # MAGIC %md 86 | # MAGIC ### Numerical columns 87 | # MAGIC 88 | # MAGIC Missing values for numerical columns are imputed with mean for consistency 89 | 90 | # COMMAND ---------- 91 | 92 | from sklearn.impute import SimpleImputer 93 | from sklearn.pipeline import Pipeline 94 | from sklearn.preprocessing import FunctionTransformer 95 | 96 | numerical_pipeline = Pipeline(steps=[ 97 | ("converter", FunctionTransformer(lambda df: df.apply(pd.to_numeric, errors="coerce"))), 98 | ("imputer", SimpleImputer(strategy="mean")) 99 | ]) 100 | 101 | transformers.append(("numerical", numerical_pipeline, ["fnlwgt", "hours_per_week", "log_capital_gain", "log_capital_loss"])) 102 | 103 | # COMMAND ---------- 104 | 105 | # MAGIC %md 106 | # MAGIC ### Categorical columns 107 | 108 | # COMMAND ---------- 109 | 110 | # MAGIC %md 111 | # MAGIC #### Low-cardinality categoricals 112 | # MAGIC Convert each low-cardinality categorical column into multiple binary columns through one-hot encoding. 113 | # MAGIC For each input categorical column (string or numeric), the number of output columns is equal to the number of unique values in the input column. 114 | 115 | # COMMAND ---------- 116 | 117 | from sklearn.pipeline import Pipeline 118 | from sklearn.preprocessing import OneHotEncoder 119 | 120 | one_hot_encoder = OneHotEncoder(handle_unknown="ignore") 121 | 122 | transformers.append(("onehot", one_hot_encoder, ["age_by_decade", "education", "occupation", "relationship", "workclass", "native_country"])) 123 | 124 | # COMMAND ---------- 125 | 126 | from sklearn.compose import ColumnTransformer 127 | 128 | preprocessor = ColumnTransformer(transformers, remainder="passthrough", sparse_threshold=0) 129 | 130 | # COMMAND ---------- 131 | 132 | # MAGIC %md 133 | # MAGIC ### Feature standardization 134 | # MAGIC Scale all feature columns to be centered around zero with unit variance. 
135 | 136 | # COMMAND ---------- 137 | 138 | from sklearn.preprocessing import StandardScaler 139 | 140 | standardizer = StandardScaler() 141 | 142 | # COMMAND ---------- 143 | 144 | # MAGIC %md 145 | # MAGIC ## Train - Validation - Split 146 | # MAGIC Split the input data into 2 sets: 147 | # MAGIC - Train (80% of the dataset used to train the model) 148 | # MAGIC - Validation (20% of the dataset used to tune the hyperparameters of the model) 149 | 150 | # COMMAND ---------- 151 | 152 | from sklearn.model_selection import train_test_split 153 | 154 | split_X = df_loaded.drop([target_col], axis=1) 155 | split_y = df_loaded[target_col] 156 | 157 | # Split out train data 158 | X_train, X_val, y_train, y_val = train_test_split(split_X, split_y, train_size=0.8, random_state=149849802, stratify=split_y) 159 | 160 | 161 | # COMMAND ---------- 162 | 163 | X_test = inference_data.drop([target_col], axis=1) 164 | y_test = inference_data[target_col] 165 | 166 | # COMMAND ---------- 167 | 168 | # MAGIC %md 169 | # MAGIC ## Train classification model 170 | # MAGIC - Log relevant metrics to MLflow to track runs 171 | # MAGIC - All the runs are logged under [this MLflow experiment](#mlflow/experiments/4212189882465177/s?orderByKey=metrics.%60val_f1_score%60&orderByAsc=false) 172 | # MAGIC - Change the model parameters and re-run the training cell to log a different trial to the MLflow experiment 173 | # MAGIC - To view the full list of tunable hyperparameters, check the output of the cell below 174 | 175 | # COMMAND ---------- 176 | 177 | from xgboost import XGBClassifier 178 | 179 | help(XGBClassifier) 180 | 181 | # COMMAND ---------- 182 | 183 | # MAGIC %md 184 | # MAGIC ### Incorporating insights from Data Exploration: Downsampling 185 | 186 | # COMMAND ---------- 187 | 188 | # RandomUnderSampler for class imbalance (decrease <=50K label count) 189 | from imblearn.under_sampling import RandomUnderSampler 190 | 191 | # From our data exploration notebook, class ratio looks like 75/25 (<=50k/>=50k) 192 | undersampler = RandomUnderSampler(random_state=42) 193 | 194 | # COMMAND ---------- 195 | 196 | import mlflow 197 | import sklearn 198 | from sklearn import set_config 199 | from imblearn.pipeline import make_pipeline 200 | 201 | set_config(display="diagram") 202 | 203 | xgbc_classifier = XGBClassifier( 204 | colsample_bytree=0.5562503325532802, 205 | learning_rate=0.26571572922086373, 206 | max_depth=5, 207 | min_child_weight=5, 208 | n_estimators=30, 209 | n_jobs=100, 210 | subsample=0.6859242756647854, 211 | verbosity=0, 212 | random_state=149849802, 213 | ) 214 | 215 | model = make_pipeline(col_selector, preprocessor, standardizer, undersampler, xgbc_classifier) 216 | 217 | model 218 | 219 | # COMMAND ---------- 220 | 221 | # Enable automatic logging of input samples, metrics, parameters, and models 222 | mlflow.sklearn.autolog(log_input_examples=True, silent=True) 223 | 224 | with mlflow.start_run(run_name="xgboost") as mlflow_run: 225 | model.fit(X_train, y_train) 226 | 227 | # Training metrics are logged by MLflow autologging 228 | # Log metrics for the validation set 229 | xgbc_val_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_val, y_val, prefix="val_") 230 | 231 | # Log metrics for the test set 232 | xgbc_test_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_test, y_test, prefix="test_") 233 | 234 | # Display the logged metrics 235 | xgbc_val_metrics = {k.replace("val_", ""): v for k, v in xgbc_val_metrics.items()} 236 | xgbc_test_metrics = {k.replace("test_", ""): v for k, v in 
xgbc_test_metrics.items()} 237 | 238 | metrics_pdf = pd.DataFrame([xgbc_val_metrics, xgbc_test_metrics], index=["validation", "test"]) 239 | metrics_pdf["dataset"] = ["ml_income_workshop.clean_income", "ml_income_workshop.inference_income"] 240 | metrics_df = spark.createDataFrame(metrics_pdf) 241 | display(metrics_df) 242 | 243 | # COMMAND ---------- 244 | 245 | # Save metrics to a delta table 246 | metrics_df.write.mode("overwrite").saveAsTable(f"{database_name}.metric_data_bronze") 247 | 248 | # COMMAND ---------- 249 | 250 | # Patch requisite packages to the model environment YAML for model serving 251 | import os 252 | import shutil 253 | import uuid 254 | import yaml 255 | 256 | None 257 | 258 | import xgboost 259 | from mlflow.tracking import MlflowClient 260 | 261 | xgbc_temp_dir = os.path.join(os.environ["SPARK_LOCAL_DIRS"], str(uuid.uuid4())[:8]) 262 | os.makedirs(xgbc_temp_dir) 263 | xgbc_client = MlflowClient() 264 | xgbc_model_env_path = xgbc_client.download_artifacts(mlflow_run.info.run_id, "model/conda.yaml", xgbc_temp_dir) 265 | xgbc_model_env_str = open(xgbc_model_env_path) 266 | xgbc_parsed_model_env_str = yaml.load(xgbc_model_env_str, Loader=yaml.FullLoader) 267 | 268 | xgbc_parsed_model_env_str["dependencies"][-1]["pip"].append(f"xgboost=={xgboost.__version__}") 269 | 270 | with open(xgbc_model_env_path, "w") as f: 271 | f.write(yaml.dump(xgbc_parsed_model_env_str)) 272 | xgbc_client.log_artifact(run_id=mlflow_run.info.run_id, local_path=xgbc_model_env_path, artifact_path="model") 273 | shutil.rmtree(xgbc_temp_dir) 274 | 275 | # COMMAND ---------- 276 | 277 | # MAGIC %md 278 | # MAGIC ## Feature importance 279 | # MAGIC 280 | # MAGIC SHAP is a game-theoretic approach to explain machine learning models, providing a summary plot 281 | # MAGIC of the relationship between features and model output. Features are ranked in descending order of 282 | # MAGIC importance, and impact/color describe the correlation between the feature and the target variable. 283 | # MAGIC - Generating SHAP feature importance is a very memory intensive operation, so to ensure that AutoML can run trials without 284 | # MAGIC running out of memory, we disable SHAP by default.
285 | # MAGIC You can set the flag defined below to `shap_enabled = True` and re-run this notebook to see the SHAP plots. 286 | # MAGIC - To reduce the computational overhead of each trial, a single example is sampled from the validation set to explain.
287 | # MAGIC For more thorough results, increase the sample size of explanations, or provide your own examples to explain. 288 | # MAGIC - SHAP cannot explain models using data with nulls; if your dataset has any, both the background data and 289 | # MAGIC examples to explain will be imputed using the mode (most frequent values). This affects the computed 290 | # MAGIC SHAP values, as the imputed samples may not match the actual data distribution. 291 | # MAGIC 292 | # MAGIC For more information on how to read Shapley values, see the [SHAP documentation](https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html). 293 | 294 | # COMMAND ---------- 295 | 296 | # Set this flag to True and re-run the notebook to see the SHAP plots 297 | shap_enabled = True 298 | 299 | # COMMAND ---------- 300 | 301 | if shap_enabled: 302 | from shap import KernelExplainer, summary_plot 303 | # Sample background data for SHAP Explainer. Increase the sample size to reduce variance. 304 | sample_size = 500 305 | train_sample = X_train.sample(n=sample_size) 306 | 307 | # Sample a single example from the validation set to explain. Increase the sample size and rerun for more thorough results. 308 | example = X_val.sample(n=1) 309 | 310 | # Use Kernel SHAP to explain feature importance on the example from the validation set. 311 | predict = lambda x: model.predict_proba(pd.DataFrame(x, columns=X_train.columns)) 312 | explainer = KernelExplainer(predict, train_sample, link="logit") 313 | shap_values = explainer.shap_values(example, l1_reg=False) 314 | summary_plot(shap_values, example, class_names=model.classes_) 315 | 316 | # COMMAND ---------- 317 | 318 | # MAGIC %md 319 | # MAGIC ## Inference 320 | # MAGIC [The MLflow Model Registry](https://docs.databricks.com/applications/mlflow/model-registry.html) is a collaborative hub where teams can share ML models, work together from experimentation to online testing and production, integrate with approval and governance workflows, and monitor ML deployments and their performance. The snippets below show how to add the model trained in this notebook to the model registry and to retrieve it later for inference. 
321 | # MAGIC 322 | # MAGIC > **NOTE:** The `model_uri` for the model already trained in this notebook can be found in the cell below 323 | # MAGIC 324 | # MAGIC ### Register to Model Registry 325 | # MAGIC ``` 326 | # MAGIC model_name = "Example" 327 | # MAGIC 328 | # MAGIC model_uri = f"runs:/{ mlflow_run.info.run_id }/model" 329 | # MAGIC registered_model_version = mlflow.register_model(model_uri, model_name) 330 | # MAGIC ``` 331 | # MAGIC 332 | # MAGIC ### Load from Model Registry 333 | # MAGIC ``` 334 | # MAGIC model_name = "Example" 335 | # MAGIC model_version = registered_model_version.version 336 | # MAGIC 337 | # MAGIC model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}") 338 | # MAGIC model.predict(input_X) 339 | # MAGIC ``` 340 | # MAGIC 341 | # MAGIC ### Load model without registering 342 | # MAGIC ``` 343 | # MAGIC model_uri = f"runs:/{ mlflow_run.info.run_id }/model" 344 | # MAGIC 345 | # MAGIC model = mlflow.pyfunc.load_model(model_uri) 346 | # MAGIC model.predict(input_X) 347 | # MAGIC ``` 348 | 349 | # COMMAND ---------- 350 | 351 | # model_uri for the generated model 352 | print(f"runs:/{ mlflow_run.info.run_id }/model") 353 | 354 | # COMMAND ---------- 355 | 356 | # MAGIC %md 357 | # MAGIC ## MLflow stats to Delta Lake Table 358 | 359 | # COMMAND ---------- 360 | 361 | expId = mlflow.get_experiment_by_name("/Shared/ML-Workshop-2.0/End-to-End-ML/ml-workshop-income-classifier").experiment_id 362 | 363 | mlflow_df = spark.read.format("mlflow-experiment").load(expId) 364 | 365 | refined_mlflow_df = mlflow_df.select(col('run_id'), col("experiment_id"), explode(map_concat(col("metrics"), col("params"))), col('start_time'), col("end_time")) \ 366 | .filter("key != 'model'") \ 367 | .select("run_id", "experiment_id", "key", col("value").cast("float"), col('start_time'), col("end_time")) \ 368 | .groupBy("run_id", "experiment_id", "start_time", "end_time") \ 369 | .pivot("key") \ 370 | .sum("value") \ 371 | .withColumn("trainingDuration", col("end_time").cast("integer")-col("start_time").cast("integer")) # example of added column 372 | 373 | # COMMAND ---------- 374 | 375 | refined_mlflow_df.write.mode("overwrite").saveAsTable(f"{database_name}.experiment_data_bronze") 376 | 377 | # COMMAND ---------- 378 | 379 | # MAGIC %md 380 | # MAGIC We can also save our AutoML experiment results to a Delta Table 381 | 382 | # COMMAND ---------- 383 | 384 | automl_mlflow = "/Users/salma.mayorquin@databricks.com/databricks_automl/22-02-20-03:37-01 - Data Preparation: Feature Engineering and AutoML-7b753624/01 - Data Preparation: Feature Engineering and AutoML-Experiment-7b753624" 385 | 386 | automl_expId = mlflow.get_experiment_by_name(automl_mlflow).experiment_id 387 | 388 | automl_mlflow_df = spark.read.format("mlflow-experiment").load(automl_expId) 389 | 390 | refined_automl_mlflow_df = automl_mlflow_df.select(col('run_id'), col("experiment_id"), explode(map_concat(col("metrics"), col("params"))), col('start_time'), col("end_time")) \ 391 | .filter("key != 'model'") \ 392 | .select("run_id", "experiment_id", "key", col("value").cast("float"), col('start_time'), col("end_time")) \ 393 | .groupBy("run_id", "experiment_id", "start_time", "end_time") \ 394 | .pivot("key") \ 395 | .sum("value") \ 396 | .withColumn("trainingDuration", col("end_time").cast("integer")-col("start_time").cast("integer")) # example of added column 397 | 398 | # COMMAND ---------- 399 | 400 | refined_automl_mlflow_df.write.mode("overwrite").option("mergeSchema", 
"true").saveAsTable(f"{database_name}.automl_data_bronze") 401 | 402 | # COMMAND ---------- 403 | 404 | # MAGIC %md 405 | # MAGIC ### Calculate Data Drift 406 | # MAGIC 407 | # MAGIC Understanding data drift is key to understanding when it is time to retrain your model. When you train a model, you are training it on a sample of data. While these training datasets are usually quite large, they don't represent changes that may happend to the data in the future. For instance, as new US census data gets collected, new societal factors could appear in the data coming into the model to be scored that the model does not know how to properly score. 408 | # MAGIC 409 | # MAGIC Monitoring for this drift is important so that you can retrain and refresh the model to allow for the model to adapt. 410 | # MAGIC 411 | # MAGIC The short example of this that we are showing today uses the [Kolmogorov-Smirnov test](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html) to compare the distribution of the training dataset with the incoming data that is being scored by the model. 412 | 413 | # COMMAND ---------- 414 | 415 | # running Kolmogorov-Smirnov test for numerical columns 416 | from scipy import stats 417 | from pyspark.sql.types import * 418 | 419 | from datetime import datetime 420 | 421 | def calculate_numerical_drift(training_dataset, comparison_dataset, comparison_dataset_name, cols, p_value, date): 422 | drift_data = [] 423 | for col in cols: 424 | passed = 1 425 | test = stats.ks_2samp(training_dataset[col], comparison_dataset[col]) 426 | if test[1] < p_value: 427 | passed = 0 428 | drift_data.append((date, comparison_dataset_name, col, float(test[0]), float(test[1]), passed)) 429 | return drift_data 430 | 431 | # COMMAND ---------- 432 | 433 | p_value = 0.05 434 | numerical_cols = ["fnlwgt", "hours_per_week", "log_capital_gain", "log_capital_loss"] 435 | 436 | dataset_name = "ml_income_workshop.inference_income" 437 | date = datetime.strptime("2000-01-01", '%Y-%m-%d').date() # simulated date for demo purpose 438 | 439 | numerical_cols = ["fnlwgt", "hours_per_week", "log_capital_gain", "log_capital_loss"] 440 | 441 | drift_data = calculate_numerical_drift(df_loaded, inference_data, dataset_name, numerical_cols, p_value, date) 442 | 443 | # COMMAND ---------- 444 | 445 | driftSchema = StructType([StructField("date", DateType(), True), \ 446 | StructField("dataset", StringType(), True), \ 447 | StructField("column", StringType(), True), \ 448 | StructField("statistic", FloatType(), True), \ 449 | StructField("pvalue", FloatType(), True), \ 450 | StructField("passed", IntegerType(), True)\ 451 | ]) 452 | 453 | numerical_data_drift_df = spark.createDataFrame(data=drift_data, schema=driftSchema) 454 | display(numerical_data_drift_df) 455 | 456 | # COMMAND ---------- 457 | 458 | # MAGIC %sql 459 | # MAGIC DROP TABLE IF EXISTS ml_income_workshop.numerical_drift_income 460 | 461 | # COMMAND ---------- 462 | 463 | # Write results to a delta table for future analysis 464 | numerical_data_drift_df.write.mode("overwrite").saveAsTable(f"{database_name}.numerical_drift_income") 465 | 466 | # COMMAND ---------- 467 | 468 | # MAGIC %md 469 | # MAGIC We can perturbe our inference dataset to simulate how data can change over time. 
470 | 471 | # COMMAND ---------- 472 | 473 | # MAGIC %sql 474 | # MAGIC DROP TABLE IF EXISTS ml_income_workshop.modified_inference_data 475 | 476 | # COMMAND ---------- 477 | 478 | import random 479 | 480 | def add_noise(value, max_noise=20): 481 | """ 482 | Simulate change in distribution by adding random noise 483 | """ 484 | noise = random.randint(0, max_noise) 485 | return value + noise 486 | 487 | modified_inference_data = inference_data.copy() 488 | modified_inference_data[numerical_cols] = modified_inference_data[numerical_cols].apply(add_noise, axis = 1) 489 | 490 | modified_inference_data_df = spark.createDataFrame(modified_inference_data) 491 | 492 | # Write for future reference 493 | modified_inference_data_df.write.mode("overwrite").saveAsTable(f"{database_name}.modified_inference_data") 494 | display(modified_inference_data_df) 495 | 496 | # COMMAND ---------- 497 | 498 | date = datetime.strptime("2010-01-01", '%Y-%m-%d').date() # simulated date for demo purpose 499 | dataset_name = "ml_income_workshop.modified_inference_income" 500 | 501 | modified_drift_data = calculate_numerical_drift(df_loaded, modified_inference_data, dataset_name, numerical_cols, p_value, date) 502 | 503 | modified_numerical_drift_data = spark.createDataFrame(data=modified_drift_data, schema=driftSchema) 504 | display(modified_numerical_drift_data) 505 | 506 | # COMMAND ---------- 507 | 508 | # append this new data to our drift table 509 | modified_numerical_drift_data.write.format("delta").mode("append").saveAsTable("ml_income_workshop.numerical_drift_income") 510 | 511 | # COMMAND ---------- 512 | 513 | display(spark.table("ml_income_workshop.numerical_drift_income")) 514 | 515 | # COMMAND ---------- 516 | 517 | # MAGIC %md 518 | # MAGIC We can also see how our model scores on this modified data 519 | 520 | # COMMAND ---------- 521 | 522 | X_modified = modified_inference_data.drop([target_col], axis=1) 523 | y_modified = modified_inference_data[target_col] 524 | 525 | # Log metrics for the modified set 526 | xgbc_mod_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_modified, y_modified, prefix="mod_") 527 | 528 | xgbc_mod_metrics = {k.replace("mod_", ""): v for k, v in xgbc_mod_metrics.items()} 529 | 530 | mod_metrics_pdf = pd.DataFrame([xgbc_mod_metrics]) 531 | mod_metrics_pdf["dataset"] = ["ml_income_workshop.modified_inference_income"] 532 | mod_metrics_df = spark.createDataFrame(mod_metrics_pdf) 533 | display(mod_metrics_df) 534 | 535 | # COMMAND ---------- 536 | 537 | # append this new data to our metrics table 538 | mod_metrics_df.write.format("delta").mode("append").saveAsTable("ml_income_workshop.metric_data_bronze") 539 | 540 | # COMMAND ---------- 541 | 542 | display(spark.table("ml_income_workshop.metric_data_bronze")) 543 | 544 | # COMMAND ---------- 545 | 546 | # MAGIC %md 547 | # MAGIC ## Drift Monitoring 548 | # MAGIC 549 | # MAGIC From here, you can visualize and query the various tables we created from training and data metadata using [Databricks SQL](https://databricks.com/product/databricks-sql). You can trigger alerts on custom queries to notify you when you should consider retraining your model. 
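# MAGIC As a minimal sketch (assuming only the `database_name` variable and the `ml_income_workshop.numerical_drift_income` table created above), the query behind such an alert could simply count the numerical features that failed the Kolmogorov-Smirnov test on each scoring date:

# COMMAND ----------

# Hypothetical drift-alert query: count how many numerical features failed the KS test per date.
# A Databricks SQL alert could then notify you whenever failed_features is greater than zero.
failed_drift_checks = spark.sql(f"""
    SELECT date, COUNT(*) AS failed_features
    FROM {database_name}.numerical_drift_income
    WHERE passed = 0
    GROUP BY date
    ORDER BY date DESC
""")

display(failed_drift_checks)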
550 | 551 | # COMMAND ---------- 552 | 553 | # MAGIC %md 554 | # MAGIC [Here](https://e2-demo-field-eng.cloud.databricks.com/sql/dashboards/64cbc6a0-bbd8-4612-9275-67327099a6dd-end-to-end-ml-workshop?o=1444828305810485) is our simple dashboard example 555 | -------------------------------------------------------------------------------- /End-to-End ML Workshop DB SQL Dashboard.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smellslikeml/end-to-end-ml-workshop/9f28da31cda17a2ace3f0817eedffd3bb2756cf5/End-to-End ML Workshop DB SQL Dashboard.pdf -------------------------------------------------------------------------------- /End-to-End ML Workshop.dbdash: -------------------------------------------------------------------------------- 1 | { 2 | "queries": [ 3 | { 4 | "id": "db8e820b-229a-4900-aa64-8b7f85445ebd", 5 | "name": "AutoML MLflow Statistics", 6 | "description": null, 7 | "query": "SELECT\n run_id, experiment_id, val_accuracy_score, val_f1_score, val_log_loss, val_precision_score, val_recall_score, val_roc_auc_score\nFROM\n ml_income_workshop.automl_data_bronze", 8 | "options": { 9 | "run_as_role": "viewer", 10 | "apply_auto_limit": true, 11 | "visualization_control_order": [], 12 | "parameters": [] 13 | }, 14 | "visualizations": [ 15 | { 16 | "id": "5d0b079b-9d4f-4f63-8c65-1826ba1baaed", 17 | "type": "TABLE", 18 | "name": "Table", 19 | "description": "", 20 | "options": { 21 | "version": 2 22 | }, 23 | "query_plan": null 24 | }, 25 | { 26 | "id": "62d4b1b3-3f06-4d8e-9742-636b282e3c95", 27 | "type": "CHART", 28 | "name": "AutoML Result Scores", 29 | "description": "", 30 | "options": { 31 | "version": 2, 32 | "globalSeriesType": "line", 33 | "sortX": true, 34 | "sortY": true, 35 | "legend": { 36 | "traceorder": "normal" 37 | }, 38 | "xAxis": { 39 | "type": "-", 40 | "labels": { 41 | "enabled": false 42 | } 43 | }, 44 | "yAxis": [ 45 | { 46 | "type": "-", 47 | "title": { 48 | "text": "score" 49 | } 50 | }, 51 | { 52 | "type": "-", 53 | "opposite": true 54 | } 55 | ], 56 | "alignYAxesAtZero": true, 57 | "error_y": { 58 | "type": "data", 59 | "visible": true 60 | }, 61 | "series": { 62 | "stacking": null, 63 | "error_y": { 64 | "type": "data", 65 | "visible": true 66 | } 67 | }, 68 | "seriesOptions": { 69 | "column_967a7e074945": { 70 | "yAxis": 0, 71 | "type": "line", 72 | "color": "#FB8D3D" 73 | }, 74 | "column_967a7e074947": { 75 | "yAxis": 0, 76 | "type": "line", 77 | "color": "#799CFF" 78 | } 79 | }, 80 | "valuesOptions": {}, 81 | "direction": { 82 | "type": "counterclockwise" 83 | }, 84 | "sizemode": "diameter", 85 | "coefficient": 1, 86 | "numberFormat": "0,0[.]00000", 87 | "percentFormat": "0[.]00%", 88 | "textFormat": "", 89 | "missingValuesAsZero": true, 90 | "useAggregationsUi": true, 91 | "swappedAxes": false, 92 | "dateTimeFormat": "YYYY-MM-DD HH:mm", 93 | "showDataLabels": false, 94 | "columnConfigurationMap": { 95 | "x": { 96 | "column": "run_id", 97 | "id": "column_967a7e074943" 98 | }, 99 | "y": [ 100 | { 101 | "id": "column_967a7e074945", 102 | "column": "val_accuracy_score", 103 | "transform": "SUM" 104 | }, 105 | { 106 | "id": "column_967a7e074947", 107 | "column": "val_roc_auc_score", 108 | "transform": "SUM" 109 | } 110 | ] 111 | }, 112 | "isAggregationOn": true, 113 | "condensed": true, 114 | "withRowNumber": true 115 | }, 116 | "query_plan": { 117 | "selects": [ 118 | { 119 | "column": "run_id" 120 | }, 121 | { 122 | "function": "SUM", 123 | "args": [ 124 | { 125 | "column": 
"val_accuracy_score" 126 | } 127 | ], 128 | "alias": "column_967a7e074945" 129 | }, 130 | { 131 | "function": "SUM", 132 | "args": [ 133 | { 134 | "column": "val_roc_auc_score" 135 | } 136 | ], 137 | "alias": "column_967a7e074947" 138 | } 139 | ], 140 | "groups": [ 141 | { 142 | "column": "run_id" 143 | } 144 | ] 145 | } 146 | } 147 | ] 148 | }, 149 | { 150 | "id": "73855693-9186-4c2c-b0b8-eb19b9b65301", 151 | "name": "ml_workshop_jja", 152 | "description": null, 153 | "query": "SELECT\n CASE\n WHEN dataset == \"ml_income_workshop.modified_inference_income\" THEN \"2010\"\n ELSE \"2000\"\n END year,\n roc_auc_score\nFROM\n ml_income_workshop.metric_data_bronze\nWHERE\n (\n dataset == \"ml_income_workshop.modified_inference_income\"\n )\n OR (dataset == \"ml_income_workshop.inference_income\")", 154 | "options": { 155 | "run_as_role": "viewer", 156 | "apply_auto_limit": true, 157 | "visualization_control_order": [], 158 | "parameters": [], 159 | "schema": "ml_income_workshop" 160 | }, 161 | "visualizations": [ 162 | { 163 | "id": "32cc71d3-bdd4-44e7-8024-a889c1a590bd", 164 | "type": "CHART", 165 | "name": "Metric Drift", 166 | "description": "", 167 | "options": { 168 | "version": 2, 169 | "globalSeriesType": "column", 170 | "sortX": true, 171 | "sortY": true, 172 | "legend": { 173 | "traceorder": "normal" 174 | }, 175 | "xAxis": { 176 | "type": "-", 177 | "labels": { 178 | "enabled": true 179 | }, 180 | "title": { 181 | "text": "Year" 182 | } 183 | }, 184 | "yAxis": [ 185 | { 186 | "type": "-", 187 | "title": { 188 | "text": "ROC AUC Score" 189 | } 190 | }, 191 | { 192 | "type": "-", 193 | "opposite": true, 194 | "title": { 195 | "text": null 196 | } 197 | } 198 | ], 199 | "alignYAxesAtZero": false, 200 | "error_y": { 201 | "type": "data", 202 | "visible": true 203 | }, 204 | "series": { 205 | "stacking": null, 206 | "error_y": { 207 | "type": "data", 208 | "visible": true 209 | } 210 | }, 211 | "seriesOptions": { 212 | "2000": { 213 | "zIndex": 0 214 | }, 215 | "2010": { 216 | "zIndex": 1 217 | }, 218 | "column_98f9dd6d178165": { 219 | "yAxis": 0 220 | } 221 | }, 222 | "valuesOptions": {}, 223 | "direction": { 224 | "type": "counterclockwise" 225 | }, 226 | "sizemode": "diameter", 227 | "coefficient": 1, 228 | "numberFormat": "0,0[.]00", 229 | "percentFormat": "0[.]00%", 230 | "textFormat": "", 231 | "missingValuesAsZero": true, 232 | "useAggregationsUi": true, 233 | "swappedAxes": false, 234 | "dateTimeFormat": "YYYY-MM-DD HH:mm", 235 | "showDataLabels": true, 236 | "columnConfigurationMap": { 237 | "y": [ 238 | { 239 | "id": "column_98f9dd6d178165", 240 | "column": "roc_auc_score", 241 | "transform": "SUM" 242 | } 243 | ], 244 | "series": { 245 | "column": "year", 246 | "id": "column_98f9dd6d178166" 247 | } 248 | }, 249 | "reverseX": false 250 | }, 251 | "query_plan": { 252 | "selects": [ 253 | { 254 | "function": "SUM", 255 | "args": [ 256 | { 257 | "column": "roc_auc_score" 258 | } 259 | ], 260 | "alias": "column_98f9dd6d178165" 261 | }, 262 | { 263 | "column": "year" 264 | } 265 | ], 266 | "groups": [ 267 | { 268 | "column": "year" 269 | } 270 | ] 271 | } 272 | }, 273 | { 274 | "id": "9a26ccf2-ab2c-4f24-997c-52496889656e", 275 | "type": "TABLE", 276 | "name": "Table", 277 | "description": "", 278 | "options": { 279 | "version": 2 280 | }, 281 | "query_plan": null 282 | } 283 | ] 284 | }, 285 | { 286 | "id": "7c24cc9b-1208-4ac8-a5c8-48d881370300", 287 | "name": "Numerical Data Drift", 288 | "description": null, 289 | "query": "-- SELECT\n-- SUM(\n-- CASE\n-- WHEN passed = 0 THEN 1\n-- 
ELSE 0\n-- END\n-- ) AS TOTAL_FAILURES\n-- FROM\n-- ml_income_workshop.numerical_drift_income\nSELECT\n column feature,\n statistic,\n pvalue,\n passed\nFROM\n ml_income_workshop.numerical_drift_income\nWHERE\n date = \"2010-01-01\"", 290 | "options": { 291 | "run_as_role": "viewer", 292 | "apply_auto_limit": true, 293 | "parameters": [] 294 | }, 295 | "visualizations": [ 296 | { 297 | "id": "ce1b8be7-5199-47cd-85f0-2c534c157dd6", 298 | "type": "COUNTER", 299 | "name": "Numerical Data Drift", 300 | "description": "", 301 | "options": { 302 | "counterLabel": "Numerical Data Drift", 303 | "counterColName": "TOTAL_FAILURES", 304 | "rowNumber": 1, 305 | "targetRowNumber": 1, 306 | "stringDecimal": 0, 307 | "stringDecChar": ".", 308 | "stringThouSep": ",", 309 | "tooltipFormat": "0,0.000", 310 | "showPlotlyControls": true 311 | }, 312 | "query_plan": null 313 | }, 314 | { 315 | "id": "cec38337-543e-44c3-86cf-32f6c849bf30", 316 | "type": "TABLE", 317 | "name": "Table", 318 | "description": "", 319 | "options": { 320 | "itemsPerPage": 25, 321 | "condensed": false, 322 | "columns": [ 323 | { 324 | "dateTimeFormat": "YYYY-MM-DD", 325 | "booleanValues": [ 326 | "false", 327 | "true" 328 | ], 329 | "imageUrlTemplate": "{{ @ }}", 330 | "imageTitleTemplate": "{{ @ }}", 331 | "imageWidth": "", 332 | "imageHeight": "", 333 | "linkUrlTemplate": "{{ @ }}", 334 | "linkTextTemplate": "{{ @ }}", 335 | "linkTitleTemplate": "{{ @ }}", 336 | "linkOpenInNewTab": true, 337 | "name": "date", 338 | "type": "date", 339 | "displayAs": "datetime", 340 | "visible": true, 341 | "order": 100000, 342 | "title": "date", 343 | "allowSearch": false, 344 | "alignContent": "right", 345 | "allowHTML": false, 346 | "highlightLinks": false, 347 | "useMonospaceFont": false, 348 | "preserveWhitespace": false 349 | }, 350 | { 351 | "booleanValues": [ 352 | "false", 353 | "true" 354 | ], 355 | "imageUrlTemplate": "{{ @ }}", 356 | "imageTitleTemplate": "{{ @ }}", 357 | "imageWidth": "", 358 | "imageHeight": "", 359 | "linkUrlTemplate": "{{ @ }}", 360 | "linkTextTemplate": "{{ @ }}", 361 | "linkTitleTemplate": "{{ @ }}", 362 | "linkOpenInNewTab": true, 363 | "name": "dataset", 364 | "type": "string", 365 | "displayAs": "string", 366 | "visible": true, 367 | "order": 100001, 368 | "title": "dataset", 369 | "allowSearch": false, 370 | "alignContent": "left", 371 | "allowHTML": false, 372 | "highlightLinks": false, 373 | "useMonospaceFont": false, 374 | "preserveWhitespace": false 375 | }, 376 | { 377 | "booleanValues": [ 378 | "false", 379 | "true" 380 | ], 381 | "imageUrlTemplate": "{{ @ }}", 382 | "imageTitleTemplate": "{{ @ }}", 383 | "imageWidth": "", 384 | "imageHeight": "", 385 | "linkUrlTemplate": "{{ @ }}", 386 | "linkTextTemplate": "{{ @ }}", 387 | "linkTitleTemplate": "{{ @ }}", 388 | "linkOpenInNewTab": true, 389 | "name": "column", 390 | "type": "string", 391 | "displayAs": "string", 392 | "visible": true, 393 | "order": 100002, 394 | "title": "column", 395 | "allowSearch": false, 396 | "alignContent": "left", 397 | "allowHTML": false, 398 | "highlightLinks": false, 399 | "useMonospaceFont": false, 400 | "preserveWhitespace": false 401 | }, 402 | { 403 | "numberFormat": "0.0000", 404 | "booleanValues": [ 405 | "false", 406 | "true" 407 | ], 408 | "imageUrlTemplate": "{{ @ }}", 409 | "imageTitleTemplate": "{{ @ }}", 410 | "imageWidth": "", 411 | "imageHeight": "", 412 | "linkUrlTemplate": "{{ @ }}", 413 | "linkTextTemplate": "{{ @ }}", 414 | "linkTitleTemplate": "{{ @ }}", 415 | "linkOpenInNewTab": true, 416 | "name": 
"statistic", 417 | "type": "float", 418 | "displayAs": "number", 419 | "visible": true, 420 | "order": 100003, 421 | "title": "statistic", 422 | "allowSearch": false, 423 | "alignContent": "right", 424 | "allowHTML": false, 425 | "highlightLinks": false, 426 | "useMonospaceFont": false, 427 | "preserveWhitespace": false 428 | }, 429 | { 430 | "numberFormat": "0.0000", 431 | "booleanValues": [ 432 | "false", 433 | "true" 434 | ], 435 | "imageUrlTemplate": "{{ @ }}", 436 | "imageTitleTemplate": "{{ @ }}", 437 | "imageWidth": "", 438 | "imageHeight": "", 439 | "linkUrlTemplate": "{{ @ }}", 440 | "linkTextTemplate": "{{ @ }}", 441 | "linkTitleTemplate": "{{ @ }}", 442 | "linkOpenInNewTab": true, 443 | "name": "pvalue", 444 | "type": "float", 445 | "displayAs": "number", 446 | "visible": true, 447 | "order": 100004, 448 | "title": "pvalue", 449 | "allowSearch": false, 450 | "alignContent": "right", 451 | "allowHTML": false, 452 | "highlightLinks": false, 453 | "useMonospaceFont": false, 454 | "preserveWhitespace": false 455 | }, 456 | { 457 | "numberFormat": "0", 458 | "booleanValues": [ 459 | "false", 460 | "true" 461 | ], 462 | "imageUrlTemplate": "{{ @ }}", 463 | "imageTitleTemplate": "{{ @ }}", 464 | "imageWidth": "", 465 | "imageHeight": "", 466 | "linkUrlTemplate": "{{ @ }}", 467 | "linkTextTemplate": "{{ @ }}", 468 | "linkTitleTemplate": "{{ @ }}", 469 | "linkOpenInNewTab": true, 470 | "name": "passed", 471 | "type": "integer", 472 | "displayAs": "number", 473 | "visible": true, 474 | "order": 100005, 475 | "title": "passed", 476 | "allowSearch": false, 477 | "alignContent": "right", 478 | "allowHTML": false, 479 | "highlightLinks": false, 480 | "useMonospaceFont": false, 481 | "preserveWhitespace": false 482 | } 483 | ], 484 | "version": 2, 485 | "showPlotlyControls": true 486 | }, 487 | "query_plan": null 488 | } 489 | ] 490 | }, 491 | { 492 | "id": "9e922cf6-b6ba-4833-8049-621944546a94", 493 | "name": "Data Drift", 494 | "description": null, 495 | "query": "SELECT\n avg(fnlwgt),\n avg(hours_per_week),\n avg(log_capital_gain),\n avg(log_capital_loss),\n '2010' year\nFROM\n ml_income_workshop.modified_inference_data\nGROUP BY\n year\nUNION\n (\n SELECT\n avg(fnlwgt),\n avg(hours_per_week),\n avg(log_capital_gain),\n avg(log_capital_loss),\n '2000' year\n FROM\n ml_income_workshop.inference_income\n GROUP BY\n year\n )", 496 | "options": { 497 | "run_as_role": "viewer", 498 | "apply_auto_limit": false, 499 | "parameters": [] 500 | }, 501 | "visualizations": [ 502 | { 503 | "id": "120fbbef-e3a0-4bcf-979b-d54ce9e8d7d1", 504 | "type": "CHART", 505 | "name": "Avg Log of Capital Gain by Census Year", 506 | "description": "", 507 | "options": { 508 | "version": 2, 509 | "globalSeriesType": "column", 510 | "sortX": true, 511 | "sortY": true, 512 | "legend": { 513 | "traceorder": "normal" 514 | }, 515 | "xAxis": { 516 | "type": "-", 517 | "labels": { 518 | "enabled": true 519 | }, 520 | "title": { 521 | "text": "Census Year" 522 | } 523 | }, 524 | "yAxis": [ 525 | { 526 | "type": "-", 527 | "title": { 528 | "text": "Average Log of Capital Gain" 529 | } 530 | }, 531 | { 532 | "type": "-", 533 | "opposite": true 534 | } 535 | ], 536 | "alignYAxesAtZero": false, 537 | "error_y": { 538 | "type": "data", 539 | "visible": true 540 | }, 541 | "series": { 542 | "stacking": "", 543 | "error_y": { 544 | "type": "data", 545 | "visible": true 546 | } 547 | }, 548 | "seriesOptions": { 549 | "column_98f9dd6d93526": { 550 | "yAxis": 0 551 | } 552 | }, 553 | "valuesOptions": {}, 554 | "direction": { 555 | 
"type": "counterclockwise" 556 | }, 557 | "sizemode": "diameter", 558 | "coefficient": 1, 559 | "numberFormat": "0,0[.]00", 560 | "percentFormat": "0[.]00%", 561 | "textFormat": "", 562 | "missingValuesAsZero": true, 563 | "useAggregationsUi": true, 564 | "swappedAxes": true, 565 | "dateTimeFormat": "YYYY-MM-DD HH:mm", 566 | "showDataLabels": true, 567 | "columnConfigurationMap": { 568 | "y": [ 569 | { 570 | "id": "column_98f9dd6d93526", 571 | "column": "avg(log_capital_gain)", 572 | "transform": "AVG" 573 | } 574 | ], 575 | "series": { 576 | "column": "year", 577 | "id": "column_98f9dd6d93527" 578 | } 579 | } 580 | }, 581 | "query_plan": { 582 | "selects": [ 583 | { 584 | "function": "AVG", 585 | "args": [ 586 | { 587 | "column": "avg(log_capital_gain)" 588 | } 589 | ], 590 | "alias": "column_98f9dd6d93526" 591 | }, 592 | { 593 | "column": "year" 594 | } 595 | ], 596 | "groups": [ 597 | { 598 | "column": "year" 599 | } 600 | ] 601 | } 602 | }, 603 | { 604 | "id": "1e9f8ad3-325e-420b-9b3e-de9a119b1073", 605 | "type": "TABLE", 606 | "name": "Table", 607 | "description": "", 608 | "options": { 609 | "version": 2 610 | }, 611 | "query_plan": null 612 | }, 613 | { 614 | "id": "7a94f3ab-f46c-4320-9098-eb521f27f7a5", 615 | "type": "CHART", 616 | "name": "Avg Final Weight by Census Year", 617 | "description": "", 618 | "options": { 619 | "version": 2, 620 | "globalSeriesType": "column", 621 | "sortX": true, 622 | "sortY": true, 623 | "legend": { 624 | "traceorder": "normal" 625 | }, 626 | "xAxis": { 627 | "type": "-", 628 | "labels": { 629 | "enabled": true 630 | }, 631 | "title": { 632 | "text": "Census Year" 633 | } 634 | }, 635 | "yAxis": [ 636 | { 637 | "type": "-", 638 | "title": { 639 | "text": "Final Weight" 640 | } 641 | }, 642 | { 643 | "type": "-", 644 | "opposite": true 645 | } 646 | ], 647 | "alignYAxesAtZero": false, 648 | "error_y": { 649 | "type": "data", 650 | "visible": true 651 | }, 652 | "series": { 653 | "stacking": "", 654 | "error_y": { 655 | "type": "data", 656 | "visible": true 657 | } 658 | }, 659 | "seriesOptions": { 660 | "column_98f9dd6d93526": { 661 | "yAxis": 0 662 | } 663 | }, 664 | "valuesOptions": {}, 665 | "direction": { 666 | "type": "counterclockwise" 667 | }, 668 | "sizemode": "diameter", 669 | "coefficient": 1, 670 | "numberFormat": "0,0[.]", 671 | "percentFormat": "0[.]00%", 672 | "textFormat": "", 673 | "missingValuesAsZero": true, 674 | "useAggregationsUi": true, 675 | "swappedAxes": true, 676 | "dateTimeFormat": "YYYY-MM-DD HH:mm", 677 | "showDataLabels": true, 678 | "columnConfigurationMap": { 679 | "y": [ 680 | { 681 | "id": "column_98f9dd6d93526", 682 | "column": "avg(fnlwgt)", 683 | "transform": "AVG" 684 | } 685 | ], 686 | "series": { 687 | "column": "year", 688 | "id": "column_98f9dd6d93527" 689 | } 690 | } 691 | }, 692 | "query_plan": { 693 | "selects": [ 694 | { 695 | "function": "AVG", 696 | "args": [ 697 | { 698 | "column": "avg(fnlwgt)" 699 | } 700 | ], 701 | "alias": "column_98f9dd6d93526" 702 | }, 703 | { 704 | "column": "year" 705 | } 706 | ], 707 | "groups": [ 708 | { 709 | "column": "year" 710 | } 711 | ] 712 | } 713 | }, 714 | { 715 | "id": "81bb793f-c7ee-44ab-a62b-83ddd1ff7c71", 716 | "type": "CHART", 717 | "name": "Avg Hours Per Week by Census Year", 718 | "description": "", 719 | "options": { 720 | "version": 2, 721 | "globalSeriesType": "column", 722 | "sortX": true, 723 | "sortY": true, 724 | "legend": { 725 | "traceorder": "normal" 726 | }, 727 | "xAxis": { 728 | "type": "-", 729 | "labels": { 730 | "enabled": true 731 | }, 732 | 
"title": { 733 | "text": "Census Year" 734 | } 735 | }, 736 | "yAxis": [ 737 | { 738 | "type": "-", 739 | "title": { 740 | "text": "Average Hours Per Week" 741 | } 742 | }, 743 | { 744 | "type": "-", 745 | "opposite": true 746 | } 747 | ], 748 | "alignYAxesAtZero": false, 749 | "error_y": { 750 | "type": "data", 751 | "visible": true 752 | }, 753 | "series": { 754 | "stacking": "", 755 | "error_y": { 756 | "type": "data", 757 | "visible": true 758 | } 759 | }, 760 | "seriesOptions": { 761 | "column_98f9dd6d93526": { 762 | "yAxis": 0 763 | } 764 | }, 765 | "valuesOptions": {}, 766 | "direction": { 767 | "type": "counterclockwise" 768 | }, 769 | "sizemode": "diameter", 770 | "coefficient": 1, 771 | "numberFormat": "0,0[.]00", 772 | "percentFormat": "0[.]00%", 773 | "textFormat": "", 774 | "missingValuesAsZero": true, 775 | "useAggregationsUi": true, 776 | "swappedAxes": true, 777 | "dateTimeFormat": "YYYY-MM-DD HH:mm", 778 | "showDataLabels": true, 779 | "columnConfigurationMap": { 780 | "y": [ 781 | { 782 | "id": "column_98f9dd6d93526", 783 | "column": "avg(hours_per_week)", 784 | "transform": "SUM" 785 | } 786 | ], 787 | "series": { 788 | "column": "year", 789 | "id": "column_98f9dd6d93527" 790 | } 791 | } 792 | }, 793 | "query_plan": { 794 | "selects": [ 795 | { 796 | "function": "SUM", 797 | "args": [ 798 | { 799 | "column": "avg(hours_per_week)" 800 | } 801 | ], 802 | "alias": "column_98f9dd6d93526" 803 | }, 804 | { 805 | "column": "year" 806 | } 807 | ], 808 | "groups": [ 809 | { 810 | "column": "year" 811 | } 812 | ] 813 | } 814 | }, 815 | { 816 | "id": "f222a83d-c9cd-4d7b-b016-49e3dbfe4653", 817 | "type": "CHART", 818 | "name": "Avg Log of Capital Loss by Census Year", 819 | "description": "", 820 | "options": { 821 | "version": 2, 822 | "globalSeriesType": "column", 823 | "sortX": true, 824 | "sortY": true, 825 | "legend": { 826 | "traceorder": "normal" 827 | }, 828 | "xAxis": { 829 | "type": "-", 830 | "labels": { 831 | "enabled": true 832 | }, 833 | "title": { 834 | "text": "Census Year" 835 | } 836 | }, 837 | "yAxis": [ 838 | { 839 | "type": "-", 840 | "title": { 841 | "text": "Average Log of Capital Loss" 842 | } 843 | }, 844 | { 845 | "type": "-", 846 | "opposite": true 847 | } 848 | ], 849 | "alignYAxesAtZero": false, 850 | "error_y": { 851 | "type": "data", 852 | "visible": true 853 | }, 854 | "series": { 855 | "stacking": "", 856 | "error_y": { 857 | "type": "data", 858 | "visible": true 859 | } 860 | }, 861 | "seriesOptions": { 862 | "column_98f9dd6d93526": { 863 | "yAxis": 0 864 | } 865 | }, 866 | "valuesOptions": {}, 867 | "direction": { 868 | "type": "counterclockwise" 869 | }, 870 | "sizemode": "diameter", 871 | "coefficient": 1, 872 | "numberFormat": "0,0[.]00", 873 | "percentFormat": "0[.]00%", 874 | "textFormat": "", 875 | "missingValuesAsZero": true, 876 | "useAggregationsUi": true, 877 | "swappedAxes": true, 878 | "dateTimeFormat": "YYYY-MM-DD HH:mm", 879 | "showDataLabels": true, 880 | "columnConfigurationMap": { 881 | "y": [ 882 | { 883 | "id": "column_98f9dd6d93526", 884 | "column": "avg(log_capital_loss)", 885 | "transform": "AVG" 886 | } 887 | ], 888 | "series": { 889 | "column": "year", 890 | "id": "column_98f9dd6d93527" 891 | } 892 | } 893 | }, 894 | "query_plan": { 895 | "selects": [ 896 | { 897 | "function": "AVG", 898 | "args": [ 899 | { 900 | "column": "avg(log_capital_loss)" 901 | } 902 | ], 903 | "alias": "column_98f9dd6d93526" 904 | }, 905 | { 906 | "column": "year" 907 | } 908 | ], 909 | "groups": [ 910 | { 911 | "column": "year" 912 | } 913 | ] 914 
| } 915 | } 916 | ] 917 | } 918 | ], 919 | "dashboard": { 920 | "id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 921 | "name": "End-to-End ML Workshop", 922 | "widgets": [ 923 | { 924 | "id": "6b658518-d04d-4fe4-b240-dd073193eef2", 925 | "width": 1, 926 | "options": { 927 | "parameterMappings": {}, 928 | "isHidden": false, 929 | "position": { 930 | "autoHeight": false, 931 | "sizeX": 1, 932 | "sizeY": 6, 933 | "minSizeX": 1, 934 | "maxSizeX": 6, 935 | "minSizeY": 5, 936 | "maxSizeY": 1000, 937 | "col": 0, 938 | "row": 15 939 | }, 940 | "title": "Avg Hours Per Week by Census Year", 941 | "description": "", 942 | "overrideColors": false 943 | }, 944 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 945 | "text": "", 946 | "visualization_id": "81bb793f-c7ee-44ab-a62b-83ddd1ff7c71" 947 | }, 948 | { 949 | "id": "b8698523-1924-41f1-8ca5-9177fd8f5503", 950 | "width": 1, 951 | "options": { 952 | "parameterMappings": {}, 953 | "isHidden": false, 954 | "position": { 955 | "autoHeight": false, 956 | "sizeX": 1, 957 | "sizeY": 6, 958 | "minSizeX": 1, 959 | "maxSizeX": 6, 960 | "minSizeY": 5, 961 | "maxSizeY": 1000, 962 | "col": 2, 963 | "row": 15 964 | }, 965 | "title": "Avg Log of Capital Loss by Census Year", 966 | "description": "", 967 | "overrideColors": false 968 | }, 969 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 970 | "text": "", 971 | "visualization_id": "f222a83d-c9cd-4d7b-b016-49e3dbfe4653" 972 | }, 973 | { 974 | "id": "5d14c446-8b85-42ef-9f69-8bdddfb39ac6", 975 | "width": 1, 976 | "options": { 977 | "parameterMappings": {}, 978 | "isHidden": false, 979 | "position": { 980 | "autoHeight": false, 981 | "sizeX": 1, 982 | "sizeY": 6, 983 | "minSizeX": 1, 984 | "maxSizeX": 6, 985 | "minSizeY": 5, 986 | "maxSizeY": 1000, 987 | "col": 1, 988 | "row": 15 989 | }, 990 | "title": "Avg Log of Capital Gain by Census Year", 991 | "description": "", 992 | "overrideColors": false 993 | }, 994 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 995 | "text": "", 996 | "visualization_id": "120fbbef-e3a0-4bcf-979b-d54ce9e8d7d1" 997 | }, 998 | { 999 | "id": "a5ba281b-d57e-4cfd-a79f-22cb6a3ff444", 1000 | "width": 1, 1001 | "options": { 1002 | "parameterMappings": {}, 1003 | "isHidden": false, 1004 | "position": { 1005 | "autoHeight": false, 1006 | "sizeX": 1, 1007 | "sizeY": 6, 1008 | "minSizeX": 1, 1009 | "maxSizeX": 6, 1010 | "minSizeY": 5, 1011 | "maxSizeY": 1000, 1012 | "col": 3, 1013 | "row": 15 1014 | }, 1015 | "title": "Avg Final Weight by Census Year", 1016 | "description": "", 1017 | "overrideColors": false 1018 | }, 1019 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 1020 | "text": "", 1021 | "visualization_id": "7a94f3ab-f46c-4320-9098-eb521f27f7a5" 1022 | }, 1023 | { 1024 | "id": "b6aa7926-4118-4ae3-badc-ee25be85cf8a", 1025 | "width": 1, 1026 | "options": { 1027 | "parameterMappings": {}, 1028 | "isHidden": false, 1029 | "position": { 1030 | "autoHeight": false, 1031 | "sizeX": 2, 1032 | "sizeY": 6, 1033 | "minSizeX": 2, 1034 | "maxSizeX": 6, 1035 | "minSizeY": 1, 1036 | "maxSizeY": 1000, 1037 | "col": 4, 1038 | "row": 15 1039 | } 1040 | }, 1041 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 1042 | "text": "", 1043 | "visualization_id": "cec38337-543e-44c3-86cf-32f6c849bf30" 1044 | }, 1045 | { 1046 | "id": "b12de187-7388-458a-ad56-88823d68f110", 1047 | "width": 1, 1048 | "options": { 1049 | "parameterMappings": {}, 1050 | "isHidden": false, 1051 | "position": { 1052 | "autoHeight": false, 1053 | "sizeX": 2, 1054 | "sizeY": 5, 1055 | "minSizeX": 1, 
1056 | "maxSizeX": 6, 1057 | "minSizeY": 5, 1058 | "maxSizeY": 1000, 1059 | "col": 0, 1060 | "row": 23 1061 | }, 1062 | "title": "Metric Drift", 1063 | "description": "", 1064 | "overrideColors": false 1065 | }, 1066 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 1067 | "text": "", 1068 | "visualization_id": "32cc71d3-bdd4-44e7-8024-a889c1a590bd" 1069 | }, 1070 | { 1071 | "id": "f6898369-3f86-4019-9000-5b6bd2e2df3e", 1072 | "width": 1, 1073 | "options": { 1074 | "parameterMappings": {}, 1075 | "isHidden": false, 1076 | "position": { 1077 | "autoHeight": false, 1078 | "sizeX": 6, 1079 | "sizeY": 6, 1080 | "minSizeX": 2, 1081 | "maxSizeX": 6, 1082 | "minSizeY": 1, 1083 | "maxSizeY": 1000, 1084 | "col": 0, 1085 | "row": 7 1086 | } 1087 | }, 1088 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 1089 | "text": "", 1090 | "visualization_id": "5d0b079b-9d4f-4f63-8c65-1826ba1baaed" 1091 | }, 1092 | { 1093 | "id": "caae5f13-2fd2-4d3f-90bd-28b630d10876", 1094 | "width": 1, 1095 | "options": { 1096 | "isHidden": false, 1097 | "position": { 1098 | "autoHeight": false, 1099 | "sizeX": 6, 1100 | "sizeY": 2, 1101 | "minSizeX": 1, 1102 | "maxSizeX": 6, 1103 | "minSizeY": 1, 1104 | "maxSizeY": 1000, 1105 | "col": 0, 1106 | "row": 0 1107 | } 1108 | }, 1109 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 1110 | "text": "## AutoML Experiments (from MLflow)" 1111 | }, 1112 | { 1113 | "id": "ed0c090f-9bee-48b7-b790-6502955cbea2", 1114 | "width": 1, 1115 | "options": { 1116 | "isHidden": false, 1117 | "position": { 1118 | "autoHeight": false, 1119 | "sizeX": 6, 1120 | "sizeY": 2, 1121 | "minSizeX": 1, 1122 | "maxSizeX": 6, 1123 | "minSizeY": 1, 1124 | "maxSizeY": 1000, 1125 | "col": 0, 1126 | "row": 13 1127 | }, 1128 | "parameterMappings": {} 1129 | }, 1130 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 1131 | "text": "## Data Drift Detection" 1132 | }, 1133 | { 1134 | "id": "8201b6eb-91ac-4e6d-abd2-7c8dc6c98090", 1135 | "width": 1, 1136 | "options": { 1137 | "isHidden": false, 1138 | "position": { 1139 | "autoHeight": false, 1140 | "sizeX": 6, 1141 | "sizeY": 2, 1142 | "minSizeX": 1, 1143 | "maxSizeX": 6, 1144 | "minSizeY": 1, 1145 | "maxSizeY": 1000, 1146 | "col": 0, 1147 | "row": 21 1148 | }, 1149 | "parameterMappings": {} 1150 | }, 1151 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 1152 | "text": "## Metric Drift Detection" 1153 | }, 1154 | { 1155 | "id": "d2320e3b-268c-4577-8c42-23fdb61c5d50", 1156 | "width": 1, 1157 | "options": { 1158 | "parameterMappings": {}, 1159 | "isHidden": false, 1160 | "position": { 1161 | "autoHeight": false, 1162 | "sizeX": 6, 1163 | "sizeY": 5, 1164 | "minSizeX": 1, 1165 | "maxSizeX": 6, 1166 | "minSizeY": 5, 1167 | "maxSizeY": 1000, 1168 | "col": 0, 1169 | "row": 2 1170 | }, 1171 | "description": "", 1172 | "title": "AutoML MLflow Statistics", 1173 | "overrideColors": false 1174 | }, 1175 | "dashboard_id": "64cbc6a0-bbd8-4612-9275-67327099a6dd", 1176 | "text": "", 1177 | "visualization_id": "62d4b1b3-3f06-4d8e-9742-636b282e3c95" 1178 | } 1179 | ], 1180 | "tags": [], 1181 | "color_palette": null, 1182 | "dashboard_filters_enabled": false, 1183 | "parent": "folders/3715387154823535", 1184 | "is_archived": false 1185 | }, 1186 | "version": "1.0" 1187 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # End-to-End Machine Learning on Databricks Workshop 2 | 3 | This 3 part workshop will walk you 
through a full model lifecycle, from processing input data to monitoring model performance over time. 4 | 5 | ## Requirements 6 | To access the latest ML features on Databricks, including AutoML and `pyspark.pandas`, please use [Databricks Runtime 10.0 ML](https://docs.databricks.com/release-notes/runtime/10.0ml.html) or above. 7 | 8 | ## Getting Started 9 | Please clone this repo into your Databricks workspace using the [Repos](https://docs.databricks.com/repos/index.html) feature. 10 | 11 | To create visualizations and a dashboard similar to the one attached, make sure Databricks SQL is available for your workspace. 12 | 13 | Thank you! Have a nice day. --------------------------------------------------------------------------------
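
*Editor's note: the following is not part of the original workshop materials.* As a rough illustration of the APIs the README's Requirements section refers to, the minimal sketch below shows how `pyspark.pandas` and AutoML are typically invoked on Databricks Runtime 10.0 ML or above. The table name, label column, and timeout are hypothetical placeholders, and the `spark` session is assumed to be the one a Databricks notebook provides; the workshop's own notebooks contain the authoritative code.

```python
# Minimal sketch, assuming a Databricks notebook where `spark` is predefined
# and Databricks Runtime 10.0 ML or above is attached to the cluster.
from databricks import automl      # AutoML Python API (ML runtimes only)
import pyspark.pandas as ps        # pandas-on-Spark API

# Hypothetical labeled table -- substitute your own.
train_df = spark.table("my_database.my_income_table")

# pandas-style exploration on Spark data (Spark 3.2+).
psdf = train_df.to_pandas_on_spark()
print(psdf["income"].value_counts())

# Launch an AutoML classification experiment against the label column.
summary = automl.classify(
    dataset=train_df,
    target_col="income",       # hypothetical label column
    timeout_minutes=30,
)

# The best trial is logged to MLflow; its model URI can be used for inference.
print(summary.best_trial.model_path)
```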