├── README.md
├── SECURITY.md
├── LICENSE.md
├── includes
│   └── CloudLakehouseLabsContext.py
└── Retail
    ├── includes
    │   ├── SetupLab.py
    │   └── CreateRawData.py
    ├── 00 - Introduction.py
    ├── 01 - Data Engineering with Delta.py
    ├── 01.2 - Delta Live Tables - Python.py
    ├── 01.2 - Delta Live Tables - SQL.sql
    ├── 02 - Machine Learning with MLflow.py
    ├── 02.1 - Machine Learning - Inference.py
    └── 03 - BI and Data Warehousing.py
/README.md:
--------------------------------------------------------------------------------
1 | # Cloud Lakehouse Labs
2 | This repository contains the content for running the Databricks Cloud Lakehouse labs in a virtual or an in-person session.
3 |
4 | The instructions for running the labs are documented in the notebooks.
5 | For each lab (a subfolder under the root directory), start with the notebook **00_Introduction**.
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Reporting a Vulnerability
4 |
5 | Please email bugbounty@databricks.com to report any security vulnerabilities. We will acknowledge receipt of your vulnerability and strive to send you regular updates about our progress. If you're curious about the status of your disclosure please feel free to email us again. If you want to encrypt your disclosure email, you can use [this PGP key](https://keybase.io/arikfr/key.asc).
6 |
7 |
--------------------------------------------------------------------------------
/Retail/includes/SetupLab.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %run ../../includes/CloudLakehouseLabsContext
3 |
4 | # COMMAND ----------
5 |
6 | class RetailCloudLakehouseLabsContext(CloudLakehouseLabsContext):
7 | def __init__(self):
8 | super().__init__('retail')
9 | self.__databaseForDLT = self.schema() + "_dlt"
10 | self.__rawDataDirectory = "/cloud_lakehouse_labs/retail/raw"
11 | self.__rawDataVolume = self.workingVolumeDirectory()
12 | self.__deltaTablesDirectory = self.workingDirectory() + "/delta_tables"
13 | self.__dltPipelinesOutputDataDirectory = self.__rawDataVolume + "/dlt_pipelines"
14 |
15 | def dropAllDataAndSchema(self):
16 | super().dropAllDataAndSchema()
17 | try:
18 | spark.sql('DROP DATABASE IF EXISTS hive_metastore.' + self.__databaseForDLT + ' CASCADE')
19 | except Exception as e:
20 | pass
21 |
22 |
23 | def databaseForDLT(self): return self.__databaseForDLT
24 | def databaseName(self): return self.schema()
25 | def userNameId(self): return self.userId()
26 | def rawDataDirectory(self): return self.__rawDataDirectory
27 | def rawDataVolume(self): return self.__rawDataVolume
28 | def deltaTablesDirectory(self): return self.__deltaTablesDirectory
29 | def dltPipelinesOutputDataDirectory(self): return self.__dltPipelinesOutputDataDirectory
30 | def modelNameForUser(self): return "retail_churn_" + self.userId()
31 |
32 | # COMMAND ----------
33 |
34 | labContext = RetailCloudLakehouseLabsContext()
35 | databaseName = labContext.databaseName()
36 | userName = labContext.userNameId()
37 | databaseForDLT = labContext.databaseForDLT()
38 | rawDataDirectory = labContext.rawDataDirectory()
39 | rawDataVolume = labContext.rawDataVolume()
40 | deltaTablesDirectory = labContext.deltaTablesDirectory()
41 | dltPipelinesOutputDataDirectory = labContext.dltPipelinesOutputDataDirectory()
42 | modelName = labContext.modelNameForUser()
43 |
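For reference, a lab notebook typically pulls these variables in with `%run` and then addresses tables through the fully qualified schema. A minimal sketch of that pattern (the `churn_users` table name is taken from the lab notebooks below; treat the cell as illustrative rather than part of the lab):

```python
# Databricks notebook source
# MAGIC %run ./includes/SetupLab

# COMMAND ----------

# Variables such as labContext, rawDataVolume and modelName are defined by SetupLab above
print("Catalog & schema : " + labContext.catalogAndSchema())
print("Raw data volume  : " + rawDataVolume)
print("Registered model : " + modelName)

# Query a lab table inside the working schema created by the context
display(spark.sql("SELECT count(*) AS users FROM " + labContext.catalogAndSchema() + ".churn_users"))
```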
--------------------------------------------------------------------------------
/includes/CloudLakehouseLabsContext.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # Helper class that captures the execution context
3 |
4 | import unicodedata
5 | import re
6 |
7 | class CloudLakehouseLabsContext:
8 | def __init__(self, useCase: str):
9 | self.__useCase = useCase
10 | self.__cloud = spark.conf.get("spark.databricks.clusterUsageTags.cloudProvider").lower()
11 | self.__user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')
12 | text = self.__user
13 | try: text = unicode(text, 'utf-8')
14 | except (TypeError, NameError): pass
15 | text = unicodedata.normalize('NFD', text)
16 | text = text.encode('ascii', 'ignore').decode("utf-8").lower()
17 | self.__user_id = re.sub("[^a-zA-Z0-9]", "_", text)
18 | self.__volumeName = useCase
19 |
20 | # Create the working schema
21 | catalogName = None
22 | databaseName = self.__user_id + '_' + self.__useCase
23 | volumeName = self.__volumeName
24 | for catalog in ['cloud_lakehouse_labs', 'main', 'dbdemos', 'hive_metastore']:
25 | try:
26 | catalogName = catalog
27 | if catalogName != 'hive_metastore':
28 | self.__catalog = catalogName
29 | spark.sql("create database if not exists " + catalog + "." + databaseName)
30 | spark.sql("CREATE VOLUME IF NOT EXISTS " + catalog + "." + databaseName + "." + volumeName)
31 | else:
32 | self.__catalog = catalogName
33 | spark.sql("create database if not exists " + databaseName)
34 | break
35 | except Exception as e:
36 | pass
37 | if catalogName is None: raise Exception("No catalog found with CREATE SCHEMA privileges for user '" + self.__user + "'")
38 | self.__schema = databaseName
39 | if catalogName != 'hive_metastore': spark.sql('use catalog ' + self.__catalog)
40 | spark.sql('use database ' + self.__schema)
41 |
42 | # Create the working directory under DBFS
43 | self.__workingDirectory = '/Users/' + self.__user_id + '/' + self.__useCase
44 | dbutils.fs.mkdirs(self.__workingDirectory)
45 |
46 | def cloud(self): return self.__cloud
47 |
48 | def user(self): return self.__user
49 |
50 | def schema(self): return self.__schema
51 |
52 | def volumeName(self): return self.__volumeName
53 |
54 | def catalog(self): return self.__catalog
55 |
56 | def catalogAndSchema(self): return self.__catalog + '.' + self.__schema
57 |
58 | def workingDirectory(self): return self.__workingDirectory
59 |
60 | def workingVolumeDirectory(self): return "/Volumes/main/"+self.__schema+"/"+self.__volumeName
61 |
62 | def useCase(self): return self.__useCase
63 |
64 | def userId(self): return self.__user_id
65 |
66 | def dropAllDataAndSchema(self):
67 | try:
68 | spark.sql('DROP DATABASE IF EXISTS ' + self.catalogAndSchema() + ' CASCADE')
69 | except Exception as e:
70 | print(str(e))
71 | try:
72 | dbutils.fs.rm(self.__workingDirectory, recurse=True)
73 | except Exception as e:
74 | print(str(e))
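For illustration only, this is roughly how the context resolves for a user; the printed values are made-up examples, and the actual catalog depends on which one grants CREATE SCHEMA privileges:

```python
# Illustrative: what a notebook sees once the context has been created
ctx = CloudLakehouseLabsContext('retail')
print(ctx.catalogAndSchema())         # e.g. main.john_doe_databricks_com_retail
print(ctx.workingDirectory())         # e.g. /Users/john_doe_databricks_com/retail
print(ctx.workingVolumeDirectory())   # e.g. /Volumes/main/john_doe_databricks_com_retail/retail
```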
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (2022) Databricks, Inc.
2 |
3 | This library (the "Software") may not be used except in connection with the Licensee's use of the Databricks Platform Services pursuant to an Agreement (defined below) between Licensee (defined below) and Databricks, Inc. ("Databricks"). The Object Code version of the Software shall be deemed part of the Downloadable Services under the Agreement, or if the Agreement does not define Downloadable Services, Subscription Services, or if neither are defined then the term in such Agreement that refers to the applicable Databricks Platform Services (as defined below) shall be substituted herein for “Downloadable Services.” Licensee's use of the Software must comply at all times with any restrictions applicable to the Downloadable Services and Subscription Services, generally, and must be used in accordance with any applicable documentation. For the avoidance of doubt, the Software constitutes Databricks Confidential Information under the Agreement.
4 |
5 | Additionally, and notwithstanding anything in the Agreement to the contrary:
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | you may view, make limited copies of, and may compile the Source Code version of the Software into an Object Code version of the Software. For the avoidance of doubt, you may not make derivative works of the Software (or make any changes to the Source Code version of the Software unless you have agreed to separate terms with Databricks permitting such modifications (e.g., a contribution license agreement)).
9 | If you have not agreed to an Agreement or otherwise do not agree to these terms, you may not use the Software or view, copy or compile the Source Code of the Software.
10 |
11 | This license terminates automatically upon the termination of the Agreement or Licensee's breach of these terms. Additionally, Databricks may terminate this license at any time on notice. Upon termination, you must permanently delete the Software and all copies thereof (including the Source Code).
12 |
13 | Agreement: the agreement between Databricks and Licensee governing the use of the Databricks Platform Services, which shall be, with respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with respect to Databricks Community Edition, the Community Edition Terms of Service located at www.databricks.com/ce-termsofuse, in each case unless Licensee has entered into a separate written agreement with Databricks governing the use of the applicable Databricks Platform Services.
14 |
15 | Databricks Platform Services: the Databricks services or the Databricks Community Edition services, according to where the Software is used.
16 |
17 | Licensee: the user of the Software, or, if the Software is being used on behalf of a company, the company.
18 |
19 | Object Code: the version of the Software produced when an interpreter or a compiler translates the Source Code into recognizable and executable machine code.
20 |
21 | Source Code: the human readable portion of the Software.
22 |
--------------------------------------------------------------------------------
/Retail/02.1 - Machine Learning - Inference.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC # Churn Prediction Inference - Batch or serverless real-time
4 | # MAGIC
5 | # MAGIC
6 | # MAGIC After running AutoML we saved our best model in our MLflow registry.
7 | # MAGIC
8 | # MAGIC All we need to do now is use this model to run inferences. A simple solution is to share the model name with our Data Engineering team so that they can call this model from within the pipelines they maintain.
9 | # MAGIC
10 | # MAGIC This can be done as part of a DLT pipeline or a Workflow in a separate job.
11 | # MAGIC Here is an example to show you how MLflow can be directly used to retrieve the model and run inferences.
12 |
13 | # COMMAND ----------
14 |
15 | # MAGIC %md-sandbox
16 | # MAGIC ## Deploying the model for batch inferences
17 | # MAGIC
18 | # MAGIC
19 | # MAGIC
20 | # MAGIC Now that our model is available in the Registry, we can load it to compute our inferences and save them in a table to start building dashboards.
21 | # MAGIC
22 | # MAGIC We will use an MLflow function to load the model as a PySpark UDF and distribute our inference across the entire cluster. If the data is small, we can also load the model with plain Python and use a pandas DataFrame.
23 | # MAGIC
24 | # MAGIC If you don't know how to start, Databricks can generate a batch inference notebook in just one click from the model registry: open the MLflow Model Registry and click the "Use model for inference" button!
25 |
26 | # COMMAND ----------
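The example cells referred to above are not reproduced in this extract, so the following is a minimal, illustrative sketch of such a batch scoring step. It assumes the AutoML model has already been registered under the lab's `modelName` (defined by SetupLab) and that the `churn_features` table exists; the `churn_prediction` output table mirrors the name used by the BI queries below:

```python
import mlflow
from pyspark.sql import functions as F

# Load the latest registered version of the churn model as a Spark UDF
predict_churn_udf = mlflow.pyfunc.spark_udf(spark, f"models:/{modelName}/latest", result_type="double")

# Score the feature table and persist the predictions for the BI dashboards
# (the feature columns are passed in the same order as they appear in the table)
features_df = spark.table("churn_features")
feature_cols = [c for c in features_df.columns if c != "churn"]

(features_df
  .withColumn("churn_prediction", predict_churn_udf(*[F.col(c) for c in feature_cols]))
  .write.mode("overwrite")
  .saveAsTable("churn_prediction"))
```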
27 |
28 | # MAGIC %md-sandbox
29 | # MAGIC ## 5/ Enriching the gold data with a ML model
30 | # MAGIC
32 | # MAGIC
22 | # MAGIC
23 | # MAGIC 1. Ingest and create our Customer 360 database, with tables easy to query in SQL
24 | # MAGIC 2. Secure data and grant read access to the Data Analyst and Data Science teams.
25 | # MAGIC 3. Run BI queries to analyse existing churn
26 | # MAGIC 4. Build ML model to predict which customer is going to churn and why
27 | # MAGIC
28 | # MAGIC As a result, we will have all the information required to trigger custom actions to increase retention (personalized emails, special offers, phone calls...)
29 | # MAGIC
30 | # MAGIC ### Our dataset
31 | # MAGIC
32 | # MAGIC For simplicity, we will assume that an external system is periodically sending data into our cloud blob storage:
33 | # MAGIC
34 | # MAGIC - Customer profile data *(name, age, address etc)*
35 | # MAGIC - Orders history *(what our customers have bought over time)*
36 | # MAGIC - Events from our application *(when was the last time a customer used the application, what clicks were recorded, typically collected through a stream)*
37 | # MAGIC
38 | # MAGIC *Note that our data could be arriving from any source. Databricks can ingest data from any system (Salesforce, Fivetran, message queues like Kafka, blob storage, SQL & NoSQL databases...).*
39 |
40 | # COMMAND ----------
41 |
42 | # MAGIC %md
43 | # MAGIC ### Raw data generation
44 | # MAGIC
45 | # MAGIC For this demonstration we will not be using real data or an existing dataset, but will rather generate it.
46 | # MAGIC
47 | # MAGIC The cell below executes a notebook that generates the data and stores it in an S3 bucket governed by a Unity Catalog volume.
48 | # MAGIC
49 |
50 | # COMMAND ----------
51 |
52 | # MAGIC %run ./includes/CreateRawData
53 |
54 | # COMMAND ----------
55 |
56 | # DBTITLE 1,The raw data on the volume
57 | ordersFolder = rawDataVolume + '/orders'
58 | usersFolder = rawDataVolume + '/users'
59 | eventsFolder = rawDataVolume + '/events'
60 | print('Order raw data stored under the folder "' + ordersFolder + '"')
61 | print('User raw data stored under the folder "' + usersFolder + '"')
62 | print('Website event raw data stored under the folder "' + eventsFolder + '"')
63 |
64 | # COMMAND ----------
65 |
66 | # MAGIC %md-sandbox
67 | # MAGIC ## What we are going to implement
68 | # MAGIC
69 | # MAGIC We will initially load the raw data with Auto Loader,
70 | # MAGIC perform some cleaning and enrichment operations,
71 | # MAGIC develop and load a model from MLflow to predict our customer churn,
72 | # MAGIC and finally use this information to build our DBSQL dashboard to track customer behavior and churn.
73 | # MAGIC
74 | # MAGIC 
18 | # MAGIC
66 | # MAGIC
117 | # MAGIC
10 | # MAGIC
24 | # MAGIC
25 | # MAGIC Our datasets are now properly ingested, secured, of high quality, and easily discoverable within our organization.
26 | # MAGIC
27 | # MAGIC Let's explore how Databricks SQL supports your Data Analyst team with interactive BI and start analyzing our customer churn.
28 | # MAGIC
29 | # MAGIC To start with Databricks SQL, open the SQL view from the top left menu.
30 | # MAGIC
31 | # MAGIC You'll be able to:
32 | # MAGIC
33 | # MAGIC - Create a SQL Warehouse to run your queries
34 | # MAGIC - Use DBSQL to build your own dashboards
35 | # MAGIC - Plug in any BI tool (Tableau/Power BI/...) to run your analysis
36 | # MAGIC
37 | # MAGIC
38 | # MAGIC
66 | # MAGIC
67 | # MAGIC Our users can now start running SQL queries using the SQL editor and add new visualizations.
68 | # MAGIC
69 | # MAGIC By leveraging auto-completion and the schema browser, we can start running ad-hoc queries on top of our data.
70 | # MAGIC
71 | # MAGIC While this is ideal for Data Analysts to start analysing our customer churn, other personas can also leverage DBSQL to track our data ingestion pipelines, data quality, model behavior, etc.
72 | # MAGIC
73 | # MAGIC Open the [Queries menu](/sql/queries) to start writing your first analysis.
74 |
75 | # COMMAND ----------
76 |
77 | # MAGIC %md
78 | # MAGIC ## Lab exercise
79 | # MAGIC **1. Total MRR**
80 | # MAGIC ```
81 | # MAGIC SELECT
82 | # MAGIC sum(amount)/1000 as MRR
83 | # MAGIC FROM churn_orders
84 | # MAGIC WHERE
85 | # MAGIC month(to_timestamp(creation_date, 'MM-dd-yyyy HH:mm:ss')) =
86 | # MAGIC (
87 | # MAGIC select max(month(to_timestamp(creation_date, 'MM-dd-yyyy HH:mm:ss')))
88 | # MAGIC from churn_orders
89 | # MAGIC );
90 | # MAGIC
91 | # MAGIC ```
92 | # MAGIC Create a *counter* visualisation
93 | # MAGIC
94 | # MAGIC **2. Customer Tenure - Historical**
95 | # MAGIC ```
96 | # MAGIC SELECT cast(days_since_creation/30 as int) as days_since_creation, churn, count(*) as customers
97 | # MAGIC FROM churn_features
98 | # MAGIC GROUP BY days_since_creation, churn
99 | # MAGIC HAVING days_since_creation < 1000
100 | # MAGIC ```
101 | # MAGIC **3. Subscriptions by Internet Service - Historical**
102 | # MAGIC ```
103 | # MAGIC select platform, churn, count(*) as event_count
104 | # MAGIC from churn_app_events
105 | # MAGIC inner join churn_users using (user_id)
106 | # MAGIC where platform is not null
107 | # MAGIC group by platform, churn
108 | # MAGIC ```
109 | # MAGIC Create a *horizontal bar* visualisation
110 | # MAGIC
111 | # MAGIC
112 | # MAGIC **4. MRR at Risk**
113 | # MAGIC ```
114 | # MAGIC SELECT
115 | # MAGIC sum(amount)/1000 as MRR_at_risk
116 | # MAGIC FROM churn_orders
117 | # MAGIC WHERE month(to_timestamp(churn_orders.creation_date, 'MM-dd-yyyy HH:mm:ss')) =
118 | # MAGIC (
119 | # MAGIC select max(month(to_timestamp(churn_orders.creation_date, 'MM-dd-yyyy HH:mm:ss')))
120 | # MAGIC from churn_orders
121 | # MAGIC )
122 | # MAGIC and user_id in
123 | # MAGIC (
124 | # MAGIC SELECT user_id FROM churn_prediction WHERE churn_prediction=1
125 | # MAGIC )
126 | # MAGIC
127 | # MAGIC ```
128 |
129 | # COMMAND ----------
130 |
131 | # MAGIC %md
132 | # MAGIC
133 | # MAGIC **5. Customers at risk**
134 | # MAGIC ```
135 | # MAGIC SELECT count(*) as Customers, cast(churn_prediction as boolean) as `At Risk`
136 | # MAGIC FROM churn_prediction GROUP BY churn_prediction;
137 | # MAGIC
138 | # MAGIC ```
139 | # MAGIC **6. Predicted to churn by channel**
140 | # MAGIC ```
141 | # MAGIC SELECT channel, count(*) as users
142 | # MAGIC FROM churn_prediction
143 | # MAGIC WHERE churn_prediction=1 and channel is not null
144 | # MAGIC GROUP BY channel
145 | # MAGIC ```
146 | # MAGIC Create a *pie chart* visualisation
147 | # MAGIC
148 | # MAGIC **7. Predicted to churn by country**
149 | # MAGIC ```
150 | # MAGIC SELECT country, churn_prediction, count(*) as customers
151 | # MAGIC FROM churn_prediction
152 | # MAGIC GROUP BY country, churn_prediction
153 | # MAGIC ```
154 | # MAGIC Create a *bar* visualisation
155 | # MAGIC
156 | # MAGIC
157 |
158 | # COMMAND ----------
159 |
160 | # MAGIC %md-sandbox
161 | # MAGIC
162 | # MAGIC ## Creating our Churn Dashboard
163 | # MAGIC
164 | # MAGIC
165 | # MAGIC
166 | # MAGIC The next step is now to assemble our queries and their visualisations into a comprehensive SQL dashboard that our business will be able to track.
167 | # MAGIC
168 | # MAGIC ### Lab exercise
169 | # MAGIC Assemble the visualisations defined with the above queries into a dashboard
170 |
171 | # COMMAND ----------
172 |
173 | # MAGIC %md-sandbox
174 | # MAGIC
175 | # MAGIC ## Using Third party BI tools
176 | # MAGIC
177 | # MAGIC
178 | # MAGIC
179 | # MAGIC A SQL warehouse can also be used with an external BI tool such as Tableau or Power BI.
180 | # MAGIC
181 | # MAGIC This will allow you to run direct queries on top of your tables, with a unified security model and Unity Catalog (e.g. through SSO). Analysts can then use their favorite tools to discover new business insights on the most complete and freshest data.
182 | # MAGIC
183 | # MAGIC To start using your warehouse with a third-party BI tool, click "Partner Connect" on the bottom left and choose your provider.
184 |
185 | # COMMAND ----------
186 |
187 | # MAGIC %md-sandbox
188 | # MAGIC ## Going further with DBSQL & Databricks Warehouse
189 | # MAGIC
190 | # MAGIC Databricks SQL offers much more and provides full data warehouse capabilities
191 | # MAGIC
192 | # MAGIC
193 | # MAGIC
194 | # MAGIC ### Data modeling
195 | # MAGIC
196 | # MAGIC Comprehensive data modeling: save your data based on your requirements (Data Vault, star schema, Inmon...).
197 | # MAGIC
198 | # MAGIC Databricks lets you create primary/foreign key constraints and identity columns (auto-increment).
199 | # MAGIC
200 | # MAGIC ### Data ingestion made easy with DBSQL & DBT
201 | # MAGIC
202 | # MAGIC Turnkey capabilities allow analysts and analytics engineers to easily ingest data from sources ranging from cloud storage to enterprise applications such as Salesforce, Google Analytics, or Marketo using Fivetran. It’s just one click away.
203 | # MAGIC
204 | # MAGIC Then, simply manage dependencies and transform data in-place with built-in ETL capabilities on the Lakehouse (Delta Live Tables), or use your favorite tools like dbt on Databricks SQL for best-in-class performance.
205 | # MAGIC
206 | # MAGIC ### Query federation
207 | # MAGIC
208 | # MAGIC Need to access cross-system data? Databricks SQL query federation lets you define data sources outside of Databricks (e.g. PostgreSQL)
209 | # MAGIC
210 | # MAGIC ### Materialized views
211 | # MAGIC
212 | # MAGIC Avoid expensive queries and materialize your tables. The engine will recompute only what's required when your data gets updated.
213 |
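To make the identity-column/constraint and materialized-view points above concrete, here is a hedged sketch. The table and view names are made up for illustration, and both statements assume a Unity Catalog enabled environment (materialized views additionally require a supporting compute such as a serverless SQL warehouse or DLT):

```python
# Identity column + informational primary key constraint (illustrative names)
spark.sql("""
  CREATE TABLE IF NOT EXISTS churn_customers_dim (
    customer_sk BIGINT GENERATED ALWAYS AS IDENTITY,   -- auto-increment surrogate key
    user_id     STRING NOT NULL,
    country     STRING,
    CONSTRAINT churn_customers_dim_pk PRIMARY KEY (user_id)
  )
""")

# Materialized view pre-aggregating MRR per month (illustrative)
spark.sql("""
  CREATE MATERIALIZED VIEW churn_mrr_per_month AS
  SELECT month(to_timestamp(creation_date, 'MM-dd-yyyy HH:mm:ss')) AS month,
         sum(amount) / 1000 AS mrr
  FROM churn_orders
  GROUP BY 1
""")
```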
--------------------------------------------------------------------------------
/Retail/01.2 - Delta Live Tables - SQL.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- MAGIC %md-sandbox
3 | -- MAGIC # Data engineering with Databricks - Building our C360 database
4 | -- MAGIC
5 | -- MAGIC Building a C360 database requires ingesting multiple data sources.
6 | -- MAGIC
7 | -- MAGIC It's a complex process requiring batch loads and streaming ingestion to support real-time insights, used for personalization and marketing targeting among others.
8 | -- MAGIC
9 | -- MAGIC Ingesting, transforming and cleaning data to create clean SQL tables for our downstream users (Data Analysts and Data Scientists) is complex.
10 | -- MAGIC
11 | -- MAGIC
12 | -- MAGIC
-- MAGIC John, as a Data Engineer, spends immense time….
25 | -- MAGIC
26 | -- MAGIC
27 | -- MAGIC * Hand-coding data ingestion & transformations and dealing with technical challenges:
43 | -- MAGIC
44 | -- MAGIC In this notebook, we'll work as a Data Engineer to build our C360 database.
63 | -- MAGIC
64 | -- MAGIC **Accelerate ETL development**
65 | -- MAGIC Enable analysts and data engineers to innovate rapidly with simple pipeline development and maintenance
66 | -- MAGIC
68 | -- MAGIC
69 | -- MAGIC **Remove operational complexity**
70 | -- MAGIC By automating complex administrative tasks and gaining broader visibility into pipeline operations
71 | -- MAGIC
75 | -- MAGIC
76 | -- MAGIC **Trust your data**
77 | -- MAGIC With built-in quality controls and quality monitoring to ensure accurate and useful BI, Data Science, and ML
78 | -- MAGIC
80 | -- MAGIC
81 | -- MAGIC **Simplify batch and streaming**
82 | -- MAGIC With self-optimization and auto-scaling data pipelines for batch or streaming processing
83 | -- MAGIC
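The DLT pipeline itself lives in the companion notebooks (`01.2 - Delta Live Tables - Python.py` and `01.2 - Delta Live Tables - SQL.sql`), which are not reproduced in full in this extract. As a hedged sketch only, a Python DLT definition for the bronze and silver user tables could look roughly like this (the volume path is a placeholder and the expectation is illustrative, not the lab's exact rules):

```python
import dlt
from pyspark.sql.functions import col, sha1, to_timestamp

# Bronze: incrementally ingest the raw JSON user files with Auto Loader
@dlt.table(comment="Raw user data, ingested incrementally from the volume")
def churn_users_bronze():
    return (spark.readStream.format("cloudFiles")
            .option("cloudFiles.format", "json")
            .option("cloudFiles.inferColumnTypes", "true")
            .load("/Volumes/<catalog>/<schema>/retail/users"))  # placeholder path

# Silver: cleaned users, dropping rows without an id
@dlt.table(comment="Cleaned user data")
@dlt.expect_or_drop("valid_id", "user_id IS NOT NULL")
def churn_users():
    return (dlt.read_stream("churn_users_bronze")
            .withColumnRenamed("id", "user_id")
            .withColumn("email", sha1(col("email")))
            .withColumn("creation_date", to_timestamp(col("creation_date"), "MM-dd-yyyy H:mm:ss")))
```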
90 | -- MAGIC
91 | -- MAGIC ## Delta Lake
92 | -- MAGIC
93 | -- MAGIC All the tables we'll create in the Lakehouse will be stored as Delta Lake tables. Delta Lake is an open storage framework for reliability and performance.
107 | -- MAGIC
146 | -- MAGIC
200 | -- MAGIC
37 |
38 | # COMMAND ----------
39 |
40 | # MAGIC %md
41 | # MAGIC ##  Exploring the dataset
42 | # MAGIC
43 | # MAGIC Let's first review the raw data landed on our blob storage
44 |
45 | # COMMAND ----------
46 |
47 | # MAGIC %run ./includes/SetupLab
48 |
49 | # COMMAND ----------
50 |
51 | userRawDataVolume = rawDataVolume + '/users'
52 | print('User raw data under folder: ' + userRawDataVolume)
53 |
54 | #Listing the files under the directory
55 | for fileInfo in dbutils.fs.ls(userRawDataVolume): print(fileInfo.name)
56 |
57 |
58 |
59 | # COMMAND ----------
60 |
61 | # MAGIC %md-sandbox
62 | # MAGIC ### Review the raw data received as JSON
63 |
64 | # COMMAND ----------
65 |
66 | display(spark.sql("SELECT * FROM json.`"+rawDataVolume+"/users`"))
67 |
68 |
69 | # COMMAND ----------
70 |
71 | # MAGIC %md-sandbox
72 | # MAGIC ### Review the raw data received as CSV
73 |
74 | # COMMAND ----------
75 |
76 | # Read the CSV file into a DataFrame
77 | df = spark.read.option("header", "true").csv(rawDataVolume + "/events")
78 |
79 | # Create a temporary view so you can use SQL to query the data
80 | df.createOrReplaceTempView("eventsView")
81 |
82 | # Now you can display the data using SQL
83 | display(spark.sql("SELECT * FROM eventsView"))
84 |
85 |
86 | # COMMAND ----------
87 |
88 | # MAGIC %md-sandbox
89 | # MAGIC ### 1/ Loading our data using Databricks Autoloader (cloud_files)
90 | # MAGIC
92 | # MAGIC
142 | # MAGIC
143 | # MAGIC We can chain these incremental transformations between tables, consuming only new data.
144 | # MAGIC
145 | # MAGIC This can be triggered in near real time, or in a batch fashion, for example as a job running every night to consume the daily data.
146 |
147 | # COMMAND ----------
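The bronze-loading cells for this section are not included in this extract; the sketch below illustrates, under the lab's naming conventions, how the raw JSON users could be loaded incrementally into a `churn_users_bronze` table with Auto Loader. The schema and checkpoint locations are assumptions built on the SetupLab variables:

```python
# Illustrative bronze ingestion with Auto Loader (cloudFiles)
(spark.readStream
  .format("cloudFiles")
  .option("cloudFiles.format", "json")
  .option("cloudFiles.inferColumnTypes", "true")
  .option("cloudFiles.schemaLocation", f"{deltaTablesDirectory}/schema/users")
  .load(rawDataVolume + "/users")
  .writeStream
  .option("checkpointLocation", f"{deltaTablesDirectory}/checkpoint/users_bronze")
  .trigger(once=True)
  .table("churn_users_bronze").awaitTermination())
```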
148 |
149 | # DBTITLE 1,Silver table for the users data
150 | from pyspark.sql.functions import sha1, col, initcap, to_timestamp
151 |
152 | (spark.readStream
153 | .table("churn_users_bronze")
154 | .withColumnRenamed("id", "user_id")
155 | .withColumn("email", sha1(col("email")))
156 | .withColumn("creation_date", to_timestamp(col("creation_date"), "MM-dd-yyyy H:mm:ss"))
157 | .withColumn("last_activity_date", to_timestamp(col("last_activity_date"), "MM-dd-yyyy HH:mm:ss"))
158 | .withColumn("firstname", initcap(col("firstname")))
159 | .withColumn("lastname", initcap(col("lastname")))
160 | .withColumn("age_group", col("age_group").cast('int'))
161 | .withColumn("gender", col("gender").cast('int'))
162 | .drop(col("churn"))
163 | .drop(col("_rescued_data"))
164 | .writeStream
165 | .option("checkpointLocation", f"{deltaTablesDirectory}/checkpoint/users")
166 | .trigger(once=True)
167 | .table("churn_users").awaitTermination())
168 |
169 | # COMMAND ----------
170 |
171 | # MAGIC %sql select * from churn_users;
172 |
173 | # COMMAND ----------
174 |
175 | # DBTITLE 1,Silver table for the orders data
176 | (spark.readStream
177 | .table("churn_orders_bronze")
178 | .withColumnRenamed("id", "order_id")
179 | .withColumn("amount", col("amount").cast('int'))
180 | .withColumn("item_count", col("item_count").cast('int'))
181 | .withColumn("creation_date", to_timestamp(col("transaction_date"), "MM-dd-yyyy H:mm:ss"))
182 | .drop(col("_rescued_data"))
183 | .writeStream
184 | .option("checkpointLocation", f"{deltaTablesDirectory}/checkpoint/orders")
185 | .trigger(once=True)
186 | .table("churn_orders").awaitTermination())
187 |
188 | # COMMAND ----------
189 |
190 | # MAGIC %sql select * from churn_orders;
191 |
192 | # COMMAND ----------
193 |
194 | # MAGIC %md-sandbox
195 | # MAGIC ### 3/ Aggregate and join data to create our ML features
196 | # MAGIC
197 | # MAGIC
198 | # MAGIC
199 | # MAGIC
200 | # MAGIC We are now ready to create the features required for our churn prediction.
201 | # MAGIC
202 | # MAGIC We need to enrich our user dataset with extra information which our model will use to help predict churn, such as:
203 | # MAGIC
204 | # MAGIC * last order date
205 | # MAGIC * number of items bought
206 | # MAGIC * number of actions on our website
207 | # MAGIC * device used (iOS/iPhone)
208 | # MAGIC * ...
209 |
210 | # COMMAND ----------
211 |
212 | # DBTITLE 1,Creating a "gold table" to be used by the Machine Learning practitioner
213 | spark.sql(
214 | """
215 | CREATE OR REPLACE TABLE churn_features AS
216 | WITH
217 | churn_orders_stats AS (
218 | SELECT
219 | user_id,
220 | count(*) as order_count,
221 | sum(amount) as total_amount,
222 | sum(item_count) as total_item,
223 | max(creation_date) as last_transaction
224 | FROM churn_orders
225 | GROUP BY user_id
226 | ),
227 | churn_app_events_stats AS (
228 | SELECT
229 | first(platform) as platform,
230 | user_id,
231 | count(*) as event_count,
232 | count(distinct session_id) as session_count,
233 | max(to_timestamp(date, "MM-dd-yyyy HH:mm:ss")) as last_event
234 | FROM churn_app_events GROUP BY user_id
235 | )
236 | SELECT
237 | *,
238 | datediff(now(), creation_date) as days_since_creation,
239 | datediff(now(), last_activity_date) as days_since_last_activity,
240 | datediff(now(), last_event) as days_last_event
241 | FROM churn_users
242 | INNER JOIN churn_orders_stats using (user_id)
243 | INNER JOIN churn_app_events_stats using (user_id)
244 | """
245 | )
246 |
247 | display(spark.table("churn_features"))
248 |
249 | # COMMAND ----------
250 |
251 | # MAGIC %md
252 | # MAGIC ## Exploiting the benefits of Delta
253 | # MAGIC
254 | # MAGIC ### (a) Simplifying operations with transactional DELETE/UPDATE/MERGE operations
255 | # MAGIC
256 | # MAGIC Traditional data lakes struggle to run even simple DML operations. Using Databricks and Delta Lake, your data is stored on your blob storage with transactional capabilities. You can issue DML operations on petabytes of data without having to worry about concurrent operations.
257 |
258 | # COMMAND ----------
259 |
260 | # DBTITLE 1,We just realised we have to delete users created before 2016-01-01 for compliance; let's fix that
261 | # MAGIC %sql DELETE FROM churn_users where creation_date < '2016-01-01T03:38:55.000+0000';
262 |
263 | # COMMAND ----------
264 |
265 | # DBTITLE 1,Delta Lake keeps the history of the table operations
266 | # MAGIC %sql describe history churn_users;
267 |
268 | # COMMAND ----------
269 |
270 | # DBTITLE 1,We can leverage the history to travel back in time, restore or clone a table, enable CDC, etc.
271 | # MAGIC %sql
272 | # MAGIC -- the following also works with AS OF TIMESTAMP "yyyy-MM-dd HH:mm:ss"
273 | # MAGIC select * from churn_users version as of 1 ;
274 |
275 | # COMMAND ----------
276 |
277 | # MAGIC %sql
278 | # MAGIC -- You made the DELETE by mistake ? You can easily restore the table at a given version / date:
279 | # MAGIC RESTORE TABLE churn_users TO VERSION AS OF 1
280 | # MAGIC
281 | # MAGIC -- Or clone it (SHALLOW provides zero copy clone):
282 | # MAGIC -- CREATE TABLE user_gold_clone SHALLOW|DEEP CLONE user_gold VERSION AS OF 1
283 | # MAGIC
284 | # MAGIC
285 |
286 | # COMMAND ----------
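DELETE is shown above; an UPDATE or MERGE is just as transactional. A hedged sketch of an upsert into the silver table (the `churn_users_updates` staging table is hypothetical):

```python
# Illustrative upsert: apply a batch of user updates into the silver table
spark.sql("""
  MERGE INTO churn_users AS target
  USING churn_users_updates AS source
    ON target.user_id = source.user_id
  WHEN MATCHED THEN UPDATE SET *
  WHEN NOT MATCHED THEN INSERT *
""")
```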
287 |
288 | # MAGIC %md
289 | # MAGIC ### (b) Optimizing for performance
290 |
291 | # COMMAND ----------
292 |
293 | # DBTITLE 1,Ensuring that all our tables are storage-optimized
294 | # MAGIC %sql
295 | # MAGIC ALTER TABLE churn_users SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = TRUE, delta.autoOptimize.autoCompact = TRUE);
296 | # MAGIC ALTER TABLE churn_orders SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = TRUE, delta.autoOptimize.autoCompact = TRUE);
297 | # MAGIC ALTER TABLE churn_features SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = TRUE, delta.autoOptimize.autoCompact = TRUE);
298 |
299 | # COMMAND ----------
300 |
301 | # DBTITLE 1,Our user table will be queried mostly by 3 fields; let's optimize the table for that!
302 | # MAGIC %sql
303 | # MAGIC OPTIMIZE churn_users ZORDER BY user_id, firstname, lastname
304 |
--------------------------------------------------------------------------------
/Retail/02 - Machine Learning with MLflow.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md-sandbox
3 | # MAGIC
4 | # MAGIC # Data Science with Databricks
5 | # MAGIC
6 | # MAGIC ## ML is key to disruption & personalization
7 | # MAGIC
8 | # MAGIC Being able to ingest and query our C360 database is a first step, but this isn't enough to thrive in a very competitive market.
9 | # MAGIC
10 | # MAGIC ## Machine learning is data + transforms.
11 | # MAGIC
12 | # MAGIC ML is hard because delivering value to business lines isn't only about building a Model.
18 | # MAGIC
19 | # MAGIC
20 | # MAGIC 

73 | # MAGIC
74 | # MAGIC *Note: Make sure you have switched to the "Machine Learning" persona on the top left menu.*
75 |
76 | # COMMAND ----------
77 |
78 | # MAGIC %run ./includes/SetupLab
79 |
80 | # COMMAND ----------
81 |
82 | # MAGIC %md
83 | # MAGIC ### Our training Data
84 | # MAGIC The tables generated with the DLT pipeline contain a **churn** flag which will be used as the label for training the model.
85 | # MAGIC The predictions will eventually be applied to the tables generated with the Spark pipeline.
86 |
87 | # COMMAND ----------
88 |
89 | spark.sql("use catalog main")
90 | spark.sql("use database "+databaseForDLT)
91 |
92 |
93 | # COMMAND ----------
94 |
95 | ## Use the tables within the DLT schema to create our model.
96 | print("We will be working with our DLT schema to build our final prediction table:\n" + databaseForDLT + "\n")
97 |
98 | # COMMAND ----------
99 |
100 | # MAGIC %md
101 | # MAGIC ## Data exploration and analysis
102 | # MAGIC
103 | # MAGIC Let's review our dataset and start analyzing the data we have to predict churn
104 |
105 | # COMMAND ----------
106 |
107 | # DBTITLE 1,Read our churn gold table
108 | # Read our churn_features table
109 | churn_dataset = spark.table("churn_features")
110 | display(churn_dataset)
111 |
112 | # COMMAND ----------
113 |
114 | # DBTITLE 1,Data Exploration and analysis
115 | import seaborn as sns
116 | g = sns.PairGrid(churn_dataset.sample(0.01).toPandas()[['age_group','gender','order_count']], diag_sharey=False)
117 | g.map_lower(sns.kdeplot)
118 | g.map_diag(sns.kdeplot, lw=3)
119 | g.map_upper(sns.regplot)
120 |
121 | # COMMAND ----------
122 |
123 | # MAGIC %md
124 | # MAGIC ### Further data analysis and preparation using pandas API
125 | # MAGIC
126 | # MAGIC Because our Data Science team is familiar with pandas, we'll use `pandas on spark` to scale `pandas` code. The pandas instructions will be converted to Spark instructions under the hood and distributed at scale.
127 | # MAGIC
128 | # MAGIC Typically a Data Science project would involve more advanced preparation and likely require extra data prep steps, including more complex feature preparation.
129 |
130 | # COMMAND ----------
131 |
132 | # DBTITLE 1,Custom pandas transformation / code on top of your entire dataset
133 | # Convert to pandas on spark
134 | dataset = churn_dataset.pandas_api()
135 | dataset.describe()
136 | # Drop columns we don't want to use in our model
137 | dataset = dataset.drop(columns=['address', 'email', 'firstname', 'lastname', 'creation_date', 'last_activity_date', 'last_event'])
138 | # Drop missing values
139 | dataset = dataset.dropna()
140 | # print the ten first rows
141 | dataset[:10]
142 |
143 | # COMMAND ----------
144 |
145 | # MAGIC %md-sandbox
146 | # MAGIC
147 | # MAGIC ## Write to Feature Store
148 | # MAGIC
149 | # MAGIC
150 | # MAGIC
151 | # MAGIC Once our features are ready, we can save them in the Databricks Feature Store. Under the hood, feature store tables are backed by Delta Lake tables.
152 | # MAGIC
153 | # MAGIC This will allow discoverability and reusability of our features across our organization, increasing team efficiency.
154 | # MAGIC
155 | # MAGIC The Feature Store brings traceability and governance to our deployments, tracking which model depends on which set of features. It also simplifies real-time serving.
156 | # MAGIC
157 | # MAGIC Make sure you're using the "Machine Learning" menu to have access to your feature store using the UI.
158 |
159 | # COMMAND ----------
160 |
161 | from databricks.feature_store import FeatureStoreClient
162 |
163 | fs = FeatureStoreClient()
164 |
165 | try:
166 | #drop table if exists
167 | fs.drop_table('churn_user_features')
168 | except: pass
169 |
170 | #Note: You might need to delete the FS table using the UI
171 | churn_feature_table = fs.create_table(
172 | name='churn_user_features',
173 | primary_keys='user_id',
174 | schema=dataset.spark.schema(),
175 | description='These features are derived from the churn_bronze_customers table in the lakehouse. We created dummy variables for the categorical columns, cleaned up their names, and added a boolean flag for whether the customer churned or not. No aggregations were performed.'
176 | )
177 |
178 | fs.write_table(df=dataset.to_spark(), name='churn_user_features', mode='overwrite')
179 | features = fs.read_table('churn_user_features')
180 | display(features)
181 |
182 | # COMMAND ----------
183 |
184 | # MAGIC %md
185 | # MAGIC ## Training a model from the table in the Feature Store
186 | # MAGIC
187 | # MAGIC As we will be using a scikit-learn algorithm, we will convert the feature table into a pandas DataFrame
188 |
189 | # COMMAND ----------
190 |
191 | # Convert to Pandas
192 | df = features.toPandas()
193 |
194 | # COMMAND ----------
195 |
196 | # MAGIC %md
197 | # MAGIC #### Train - test splitting
198 |
199 | # COMMAND ----------
200 |
201 | # Split to train and test set
202 | from sklearn.model_selection import train_test_split
203 | train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
204 |
205 | # COMMAND ----------
206 |
207 | # MAGIC %md
208 | # MAGIC #### Define the preprocessing steps
209 |
210 | # COMMAND ----------
211 |
212 | # Select the columns
213 | from databricks.automl_runtime.sklearn.column_selector import ColumnSelector
214 | supported_cols = ["event_count", "gender", "total_amount", "country", "order_count", "channel", "total_item", "days_since_last_activity", "days_last_event", "days_since_creation", "session_count", "age_group", "platform"]
215 | col_selector = ColumnSelector(supported_cols)
216 |
217 | # COMMAND ----------
218 |
219 | # Preprocessing
220 | from sklearn.compose import ColumnTransformer
221 | from sklearn.impute import SimpleImputer
222 | from sklearn.pipeline import Pipeline
223 | from sklearn.preprocessing import FunctionTransformer, StandardScaler
224 |
225 | num_imputers = []
226 | num_imputers.append(("impute_mean", SimpleImputer(), ["age_group", "days_last_event", "days_since_creation", "days_since_last_activity", "event_count", "gender", "order_count", "session_count", "total_amount", "total_item"]))
227 |
228 | numerical_pipeline = Pipeline(steps=[
229 | ("converter", FunctionTransformer(lambda df: df.apply(pd.to_numeric, errors="coerce"))),
230 | ("imputers", ColumnTransformer(num_imputers)),
231 | ("standardizer", StandardScaler()),
232 | ])
233 |
234 | numerical_transformers = [("numerical", numerical_pipeline, ["event_count", "gender", "total_amount", "order_count", "total_item", "days_since_last_activity", "days_last_event", "days_since_creation", "session_count", "age_group"])]
235 |
236 | # COMMAND ----------
237 |
238 | # Treating categorical variables
239 | from databricks.automl_runtime.sklearn import OneHotEncoder
240 | from sklearn.compose import ColumnTransformer
241 | from sklearn.impute import SimpleImputer
242 | from sklearn.pipeline import Pipeline
243 |
244 | one_hot_imputers = []
245 | one_hot_pipeline = Pipeline(steps=[
246 | ("imputers", ColumnTransformer(one_hot_imputers, remainder="passthrough")),
247 | ("one_hot_encoder", OneHotEncoder(handle_unknown="indicator")),
248 | ])
249 | categorical_one_hot_transformers = [("onehot", one_hot_pipeline, ["age_group", "channel", "country", "event_count", "gender", "order_count", "platform", "session_count"])]
250 |
251 | # COMMAND ----------
252 |
253 | # Final transformation of the columns
254 | from sklearn.compose import ColumnTransformer
255 | transformers = numerical_transformers + categorical_one_hot_transformers
256 | preprocessor = ColumnTransformer(transformers, remainder="passthrough", sparse_threshold=1)
257 |
258 | # COMMAND ----------
259 |
260 | # Separate target column from features
261 | target_col = "churn"
262 | X_train = train_df.drop([target_col], axis=1)
263 | y_train = train_df[target_col]
264 |
265 | X_test = test_df.drop([target_col], axis=1)
266 | y_test = test_df[target_col]
267 |
268 | # COMMAND ----------
269 |
270 | # MAGIC %md
271 | # MAGIC #### Training a model and logging everything with MLflow
272 |
273 | # COMMAND ----------
274 |
275 | # DBTITLE 0,Train a model and log it with MLflow
276 | import pandas as pd
277 | import mlflow
278 | from mlflow.models import Model
279 | from mlflow import pyfunc
280 | from mlflow.pyfunc import PyFuncModel
281 |
282 | import sklearn
283 | from sklearn.ensemble import RandomForestClassifier
284 | from sklearn.pipeline import Pipeline
285 |
286 | # Start a run
287 | with mlflow.start_run(run_name="simple-RF-run") as run:
288 |
289 | classifier = RandomForestClassifier()
290 | model = Pipeline([
291 | ("column_selector", col_selector),
292 | ("preprocessor", preprocessor),
293 | ("classifier", classifier),
294 | ])
295 |
296 | # Enable automatic logging of input samples, metrics, parameters, and models
297 | mlflow.sklearn.autolog(
298 | log_input_examples=True,
299 | silent=True)
300 |
301 | model.fit(X_train, y_train)
302 |
303 | # Log metrics for the test set
304 | mlflow_model = Model()
305 | pyfunc.add_to_model(mlflow_model, loader_module="mlflow.sklearn")
306 | pyfunc_model = PyFuncModel(model_meta=mlflow_model, model_impl=model)
307 | X_test[target_col] = y_test
308 | test_eval_result = mlflow.evaluate(
309 | model=pyfunc_model,
310 | data=X_test,
311 | targets=target_col,
312 | model_type="classifier",
313 | evaluator_config = {"log_model_explainability": False,
314 | "metric_prefix": "test_" , "pos_label": 1 }
315 | )
316 |
317 |
318 | # COMMAND ----------
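The registration step that the inference notebook relies on is not shown in this extract; a minimal sketch of promoting the logged model to the registry under the lab's `modelName` (assuming the default `model` artifact path used by MLflow autologging) would be:

```python
import mlflow

# Register the model logged by the autologged run above under the lab's model name
model_uri = f"runs:/{run.info.run_id}/model"
model_version = mlflow.register_model(model_uri, modelName)
print(f"Registered '{modelName}' as version {model_version.version}")
```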
319 |
320 | # MAGIC %md
321 | # MAGIC #### Explore the above in the UI
322 | # MAGIC
323 | # MAGIC From the Experiments page on the left pane, select the "simple-RF-run" run noted above
324 |
325 | # COMMAND ----------
326 |
327 | # MAGIC %md-sandbox 
399 | # MAGIC
400 | # MAGIC
401 | # MAGIC Models can be directly deployed, or you can instead leverage the generated notebooks to bootstrap projects with best practices, saving you weeks of effort.
402 | # MAGIC
403 | # MAGIC
406 | # MAGIC
407 | # MAGIC ### Using Databricks AutoML with our churn dataset
408 | # MAGIC
409 | # MAGIC AutoML is available in the "Machine Learning" space. All we have to do is start a new AutoML experiment and select the feature table we just created (`churn_features`).
410 | # MAGIC
411 | # MAGIC Our prediction target is the `churn` column.
412 | # MAGIC
413 | # MAGIC Click on Start, and Databricks will do the rest.
414 | # MAGIC
415 | # MAGIC While this is done using the UI, you can also leverage the [python API](https://docs.databricks.com/applications/machine-learning/automl.html#automl-python-api-1)
416 |
417 | # COMMAND ----------
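For completeness, a hedged sketch of the equivalent call through the Python API (the timeout value is an arbitrary example):

```python
from databricks import automl

# Launch an AutoML classification experiment on the gold feature table
summary = automl.classify(
    dataset=spark.table("churn_features"),
    target_col="churn",
    timeout_minutes=20,
)
print(summary.best_trial.model_path)  # URI of the best model found by AutoML
```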
418 |
419 | # MAGIC %md
420 | # MAGIC ### Next up
421 | # MAGIC [Use the model to predict the churn]($./02.1 - Machine Learning - Inference)
422 |
--------------------------------------------------------------------------------