├── Chapter01
│   ├── Data
│   │   └── testfile.txt
│   ├── CreateAzureStorage.ps1
│   ├── AzureCloudShell.ps1
│   ├── Readme.md
│   ├── CreateAzureQueues.ps1
│   ├── CreateAzureTable.ps1
│   ├── CreateAzureFileShare.ps1
│   └── CreateVM.ps1
├── Chapter03
│   └── Readme.md
├── Chapter11
│   └── Readme.md
├── Chapter15
│   └── Readme.md
├── Chapter06
│   ├── Readme.md
│   ├── SynSQLExtTable-C6.sql
│   └── SynSQLPartitionSwitching-C6.sql
├── Chapter09
│   ├── Readme.md
│   ├── SynSQLPolybase-C9.sql
│   ├── AzureBatch.ps1
│   └── SparkBatchJob-C9.ipynb
├── Chapter07
│   ├── Readme.md
│   ├── Data
│   │   └── customer.csv
│   ├── SynSQLServerlessParquet-C7.sql
│   ├── SynSQLStarSchema-C7.sql
│   └── ParquetWithSpark-C7.ipynb
├── Chapter10
│   ├── Readme.md
│   ├── ASATransformations.sql
│   ├── ASAWindowedAggregates.sql
│   ├── EventGenerator.py
│   └── EventHub-StructuredStreaming-C10.ipynb
├── Chapter08
│   ├── Readme.md
│   ├── SynSQLShreddingJSON-C8.sql
│   ├── EncodeAndDecodeUsingSpark-C8.ipynb
│   ├── SparkTransformations-C8.ipynb
│   ├── TSQLTransformations-C8.sql
│   └── ShreddingJSONUsingSpark-C8.ipynb
├── Chapter13
│   ├── Readme.md
│   ├── SynSQLPerfTables-C13.sql
│   ├── SynSQLServerlessStats-C13.sql
│   └── SynSQLDedicatedStats-C13.sql
├── Chapter04
│   ├── Readme.md
│   ├── SynSQLDimHierarchy-C4.sql
│   ├── AzureSQLTemporalData-C4.sql
│   └── AzureSQLWaterMarks-C4.sql
├── Chapter14
│   ├── Readme.md
│   ├── SparkUDF-C14.ipynb
│   ├── SynSQLQueryPlan-C14.sql
│   ├── SynSQLUDF-C14.sql
│   ├── SparkQueryPlan-C14.ipynb
│   ├── SparkDeltaWithCompaction-C14.ipynb
│   └── HyperspaceIndexing-C14.ipynb
├── Chapter12
│   ├── Readme.md
│   ├── AzureSQLAlwaysEncrypted-C12.sql
│   ├── SynSQLDDM-C12.sql
│   ├── SynSQLColLevelSecurity-C12.sql
│   ├── SynSQLRowLevelSecurity-C12.sql
│   └── HandlingSensitiveInfoInDataframe-C12.ipynb
├── Chapter05
│   ├── Readme.md
│   ├── SynSQLPartitions-C5.sql
│   ├── CompressionUsingSpark-C5.ipynb
│   ├── PartitioningUsingSpark-C5.ipynb
│   └── SynSQLDistributionsIndexes-C5.sql
├── Chapter02
│   ├── Readme.md
│   ├── SynSQLDataPruning-C2.sql
│   ├── DataPruningWithSpark-C2.ipynb
│   └── SynSQLDistributions-C2.sql
├── LICENSE
└── README.md

/Chapter01/Data/testfile.txt:
--------------------------------------------------------------------------------
1 | Hello World
--------------------------------------------------------------------------------
/Chapter03/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 3
2 | 
3 | No sample code in this chapter.
--------------------------------------------------------------------------------
/Chapter11/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 11
2 | 
3 | No sample code in this chapter.
--------------------------------------------------------------------------------
/Chapter15/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 15
2 | 
3 | No sample code in this chapter.
--------------------------------------------------------------------------------
/Chapter06/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 6
2 | 
3 | ## Overview
4 | In this chapter, we will see code examples for:
5 | 
6 | * Partition switching using Synapse SQL
7 | * External tables using Synapse SQL
8 | 
9 | 
10 | ## Steps:
11 | 1. Follow the instructions in each file.
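If you want a quick feel for the partition-switching pattern before opening the scripts, here is a minimal sketch. The table names below are placeholders (not the ones used in SynSQLPartitionSwitching-C6.sql), and both tables must have identical schemas with aligned partitions:

```sql
-- Move all rows in partition 2 of a staging table into the corresponding
-- partition of the fact table; this is a metadata-only operation.
ALTER TABLE dbo.StagingTrips SWITCH PARTITION 2 TO dbo.FactTrips PARTITION 2;
```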
12 | 
13 | 
--------------------------------------------------------------------------------
/Chapter09/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 9
2 | 
3 | ## Overview
4 | In this chapter, we will see code examples for:
5 | 
6 | * Batch Transformation using ADB
7 | * COPY using PolyBase
8 | * Azure Batch job life cycle
9 | 
10 | 
11 | ## Steps:
12 | 1. Follow the instructions in each file.
13 | 
--------------------------------------------------------------------------------
/Chapter07/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 7
2 | 
3 | ## Overview
4 | In this chapter, we will see code examples for:
5 | 
6 | * Implementing a Star schema
7 | * Reading and writing Parquet files using Synapse SQL Serverless
8 | * Reading and writing Parquet files using Spark
9 | 
10 | ## Steps:
11 | 1. Follow the instructions in each file.
12 | 
13 | 
--------------------------------------------------------------------------------
/Chapter10/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 10
2 | 
3 | ## Overview
4 | In this chapter, we will see code examples for:
5 | 
6 | * Sample event generation script for EventHub
7 | * Spark Structured Streaming
8 | * Creating windowed aggregates
9 | * Transformations using Azure Stream Analytics
10 | 
11 | 
12 | ## Steps:
13 | 1. Follow the instructions in each file.
14 | 
--------------------------------------------------------------------------------
/Chapter08/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 8
2 | 
3 | ## Overview
4 | In this chapter, we will see code examples for:
5 | 
6 | * Spark transformations
7 | * T-SQL transformations
8 | * Shredding JSON using Spark
9 | * Shredding JSON using Synapse SQL
10 | * Encoding and Decoding using Spark
11 | 
12 | 
13 | ## Steps:
14 | 1. Follow the instructions in each file.
15 | 
16 | 
--------------------------------------------------------------------------------
/Chapter13/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 13
2 | 
3 | ## Overview
4 | In this chapter, we will see code examples for:
5 | 
6 | * Creating statistics for Synapse dedicated pools
7 | * Creating statistics for Synapse serverless pools
8 | * Querying the system tables in Synapse dedicated pools
9 | 
10 | 
11 | ## Steps:
12 | 1. Follow the instructions in each file.
13 | 
14 | 
--------------------------------------------------------------------------------
/Chapter04/Readme.md:
--------------------------------------------------------------------------------
1 | # Chapter 4
2 | 
3 | ## Overview
4 | In this chapter, we will see code examples for:
5 | 
6 | * Designing a solution for temporal data
7 | * Designing a dimensional hierarchy
8 | * Loading tables using the watermark technique
9 | 
10 | 
11 | ## Steps:
12 | 1. Follow the instructions in each file, as they use different services.
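As a quick preview of the watermark technique (the full walkthrough is in AzureSQLWaterMarks-C4.sql), the incremental-load query generally follows this shape. It is only a sketch, reusing the FactTrips and WatermarkTable tables defined in that script:

```sql
-- Fetch only the rows modified after the last recorded watermark value.
SELECT t.*
FROM [dbo].[FactTrips] t
WHERE t.[LastModifiedTime] > (SELECT [WatermarkValue]
                              FROM [dbo].[WatermarkTable]
                              WHERE [TableName] = 'FactTrips');
```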
13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Chapter14/Readme.md: -------------------------------------------------------------------------------- 1 | # Chapter 14 2 | 3 | ## Overview 4 | In this chapter, we will see code examples for: 5 | 6 | * Spark Delta example 7 | * Writing UDFs in Synapse SQL pool 8 | * Identifying Shuffles in a SQL query plan 9 | * Identifying Shuffles in a Spark query plan 10 | * Indexing in Synapse Spark pool using Hyperspace 11 | 12 | 13 | ## Steps: 14 | 1. Follow the instructions in each file. -------------------------------------------------------------------------------- /Chapter12/Readme.md: -------------------------------------------------------------------------------- 1 | # Chapter 12 2 | 3 | ## Overview 4 | In this chapter, we will see code examples for: 5 | 6 | * Always Encrypted in Azure SQL 7 | * Data Masking in Synapse SQL 8 | * Row level security in Synapse SQL 9 | * Column level security in Synapse SQL 10 | * Loading dataframe with sensitive information in Spark 11 | 12 | 13 | ## Steps: 14 | 1. Follow the instructions in each file. 15 | -------------------------------------------------------------------------------- /Chapter05/Readme.md: -------------------------------------------------------------------------------- 1 | # Chapter 5 2 | 3 | ## Overview 4 | In this chapter, we will see code examples for: 5 | 6 | * Compressing files using Spark 7 | * Horizontal Partitioning using Synapse SQL 8 | * Horizontal Partitioning or Sharding using Spark 9 | * Implementing distributions using Synapse SQL 10 | 11 | 12 | ## Steps: 13 | 1. Follow the instructions in each file. 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /Chapter07/Data/customer.csv: -------------------------------------------------------------------------------- 1 | customerId, name, emailId, phoneNum, city 2 | 301,Alice,alice@someemail.com, '(465)-xxx-xxxx', 'New York' 3 | 302,Bryan,bryan@someemail.com, '(480)-xxx-xxxx', 'San Jose' 4 | 303,Carmen,carmen@somemail.com, '(122)-xxx-xxxx', 'Phoenix' 5 | 304,Daniel,daniel@somemail.com, '(510)-xxx-xxxx', 'Tahoe' 6 | 305,Ethan,ethan@somemail.com, '(265)-xxx-xxxx', 'Dallas' 7 | ,,,, 8 | ,,,, 9 | ,,,, 10 | -------------------------------------------------------------------------------- /Chapter01/CreateAzureStorage.ps1: -------------------------------------------------------------------------------- 1 | # Creating Azure storage account 2 | 3 | $resourceGroup = "" 4 | $storageAccount ="" 5 | $region = "" 6 | 7 | # We will have to create an Azure Storage first before we can create queues, shares or files 8 | az storage account create --resource-group $resourceGroup --name $storageAccount --location $region --kind StorageV2 --sku Standard_LRS -------------------------------------------------------------------------------- /Chapter01/AzureCloudShell.ps1: -------------------------------------------------------------------------------- 1 | # Use this script to set your subscription and 2 | # create a resource group in to Azure before running the 3 | # other Powershell scripts in this chapter. 
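# Tip: if you are unsure which values to use below, you can list your
# subscriptions and the available region names first, for example:
#   az account list --output table
#   az account list-locations --output table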
4 | 5 | $subscriptionName="" 6 | $resourceGroup = "" 7 | $region = "" 8 | 9 | az account set --subscription $subscriptionName 10 | 11 | az group create --name $resourceGroup --location $region 12 | 13 | -------------------------------------------------------------------------------- /Chapter02/Readme.md: -------------------------------------------------------------------------------- 1 | # Chapter 2 2 | 3 | ## Overview 4 | In this chapter, we will see how to create: 5 | 6 | * Synapse SQL Distribution Strategy 7 | * Dedicated SQL pool example with pruning 8 | * Spark example with pruning 9 | 10 | 11 | ## Steps: 12 | 1. Create an Azure Synapse workspace and launch it. 13 | 2. Open a SQL Editor and start copy pasting the lines from the corresponding example files in this directory. 14 | 3. Or you could just import the file into your synapse workspace. 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /Chapter13/SynSQLPerfTables-C13.sql: -------------------------------------------------------------------------------- 1 | -- Querying the system tables 2 | -- Synapse SQL Pool provides the following system tables that can be used to monitor the query performance: 3 | 4 | -- sys.dm_pdw_exec_requests – contains all the current and recently active requests in Azure Synapse Analytics. It contains details like total_elapsed_time, submit_time, start_time, end_time, command, result_cache_hit and so on. 5 | SELECT * FROM sys.dm_pdw_exec_requests; 6 | 7 | -- sys.dm_pdw_waits – contains details of the wait states in a query, including locks and waits on transmission queues. 8 | SELECT * FROM sys.dm_pdw_waits; -------------------------------------------------------------------------------- /Chapter10/ASATransformations.sql: -------------------------------------------------------------------------------- 1 | -- ASA Transformation Examples 2 | -- You will have to copy these SQL snippets into your ASA Job Query section and run it one by one. 
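-- The three snippets below show, in order:
--   1. A 10-second tumbling-window count of distinct trips.
--   2. A 10-second tumbling-window sum of fares per tripId.
--   3. A pass-through filter on startLocation using LIKE.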
3 | 4 | SELECT 5 | COUNT(DISTINCT tripId) AS TripCount, 6 | System.TIMESTAMP() AS Time 7 | INTO [Output] 8 | FROM [Input] TIMESTAMP BY createdAt 9 | GROUP BY 10 | TumblingWindow(second, 10) 11 | 12 | SELECT tripId, SUM(CAST(fare AS FLOAT)) AS TenSecondFares 13 | INTO [Output] 14 | FROM [Input] TIMESTAMP BY createdAt 15 | GROUP BY 16 | tripId, TumblingWindow(second, 10) 17 | 18 | 19 | SELECT * 20 | INTO [Output] 21 | FROM [Input] TIMESTAMP BY timestamp 22 | WHERE startLocation LIKE 'S%F' 23 | -------------------------------------------------------------------------------- /Chapter08/SynSQLShreddingJSON-C8.sql: -------------------------------------------------------------------------------- 1 | -- Example for reading JSON files using OPENROWSET 2 | 3 | SELECT TOP 10 * 4 | FROM openrowset( 5 | BULK '', 6 | FORMAT = 'csv', 7 | FIELDTERMINATOR ='0x0b', 8 | FIELDQUOTE = '0x0b' 9 | ) with (doc nvarchar(max)) as rows 10 | GO 11 | 12 | SELECT 13 | JSON_VALUE(doc, '$.firstname') AS firstName, 14 | JSON_VALUE(doc, '$.lastname') AS lastName, 15 | CAST(JSON_VALUE(doc, '$.id') AS INT) as driverId, 16 | JSON_VALUE(doc, '$.salary') as salary 17 | FROM openrowset( 18 | BULK '', 19 | FORMAT = 'csv', 20 | FIELDTERMINATOR ='0x0b', 21 | FIELDQUOTE = '0x0b' 22 | ) WITH (doc nvarchar(max)) AS ROWS 23 | GO 24 | -------------------------------------------------------------------------------- /Chapter14/SparkUDF-C14.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f3a68d09-150b-435f-b71c-2fc9707f54ab","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","// Simple UDF example\n","\n","import org.apache.spark.sql.functions.{col, udf}\n","\n","// Define the UDF\n","val double = udf((s: Long) => 2 * s)\n","\n","// Use the UDF like regular functions\n","display(spark.range(1, 20).select(double(col(\"id\")) as \"doubled\"))"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"SparkUDF-C14","notebookOrigID":188113580888596,"widgets":{}},"description":null,"kernelspec":{"display_name":"scala","name":"synapse_spark"},"language_info":{"name":"scala"},"save_output":true,"synapse_widget":{"state":{},"version":"0.1"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter06/SynSQLExtTable-C6.sql: -------------------------------------------------------------------------------- 1 | -- External Table Example 2 | 3 | IF NOT EXISTS (SELECT * FROM sys.external_file_formats WHERE name = 'Dp203ParquetFormat') 4 | CREATE EXTERNAL FILE FORMAT [Dp203ParquetFormat] 5 | WITH ( FORMAT_TYPE = PARQUET) 6 | GO 7 | 8 | IF NOT EXISTS (SELECT * FROM sys.external_data_sources WHERE name = 'Dp203DataSource') 9 | CREATE EXTERNAL DATA SOURCE [Dp203DataSource] 10 | WITH ( 11 | LOCATION = '' 12 | ) 13 | G0 14 | 15 | --DROP EXTERNAL TABLE TestExtTable; 16 | 17 | CREATE EXTERNAL TABLE TestExtTable ( 18 | [tripId] INT, 19 | [driverId] INT, 20 | [customerId] INT, 21 | [cabId] INT, 22 | [tripDate] INT, 23 | [startLocation] VARCHAR (50), 24 | [endLocation] VARCHAR (50) 25 | ) 26 | WITH ( 27 | LOCATION = '/parquet/trips/*.parquet', 28 | DATA_SOURCE = [Dp203DataSource], 29 | FILE_FORMAT = [Dp203ParquetFormat] 30 | ) 31 | GO 32 | 33 | SELECT TOP 100 * FROM TestExtTable 34 | GO 35 | 36 | 
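-- Optional check: confirm that the external objects created above exist
-- by querying the catalog views.
SELECT name FROM sys.external_file_formats;
SELECT name FROM sys.external_data_sources;
SELECT name FROM sys.external_tables;
GO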
-------------------------------------------------------------------------------- /Chapter04/SynSQLDimHierarchy-C4.sql: -------------------------------------------------------------------------------- 1 | -- Dimensional Hierarchy Example 2 | 3 | -- DROP TABLE DimEmployee; 4 | 5 | CREATE TABLE DimEmployee ( 6 | [employeeId] VARCHAR(20) NOT NULL, 7 | [name] VARCHAR(100), 8 | [department] VARCHAR(50), 9 | [title] VARCHAR(50), 10 | [parentEmployeeId] VARCHAR(20) 11 | ) 12 | 13 | GO 14 | -- Insert some sample values 15 | 16 | INSERT INTO [dbo].[DimEmployee] ([employeeId], [name], [department], [title], [parentEmployeeId]) VALUES (100, 'Alan Li', 'Manufacturing', 'Manager', NULL); 17 | INSERT INTO [dbo].[DimEmployee] ([employeeId], [name], [department], [title], [parentEmployeeId]) VALUES (200, 'Brenda Jackman', 'Manufacturing', 'Supervisor', 100); 18 | INSERT INTO [dbo].[DimEmployee] ([employeeId], [name], [department], [title], [parentEmployeeId]) VALUES (300, 'David Hood', 'Manufacturing', 'Machine operator', 200); 19 | 20 | -- Check the hierarchy established via the [parentEmployeeId] column 21 | SELECT * FROM [dbo].[DimEmployee]; 22 | 23 | -------------------------------------------------------------------------------- /Chapter09/SynSQLPolybase-C9.sql: -------------------------------------------------------------------------------- 1 | -- Synapse SQL PolyBase Example 2 | 3 | IF NOT EXISTS (SELECT * FROM sys.external_file_formats WHERE name = 'Dp203ParquetFormat') 4 | CREATE EXTERNAL FILE FORMAT [Dp203ParquetFormat] 5 | WITH ( FORMAT_TYPE = PARQUET) 6 | 7 | IF NOT EXISTS (SELECT * FROM sys.external_data_sources WHERE name = 'Dp203DataSource') 8 | CREATE EXTERNAL DATA SOURCE [Dp203DataSource] 9 | WITH ( 10 | LOCATION = '' 11 | ) 12 | 13 | -- Here we are assuming that the parquet files are present in /partition/year=2022/*/*/ 14 | CREATE EXTERNAL TABLE TripExtTable 15 | WITH ( 16 | LOCATION = '/partition/year=2022/*/*/*.parquet', 17 | DATA_SOURCE = [Dp203DataSource], 18 | FILE_FORMAT = [Dp203ParquetFormat] 19 | ) AS 20 | SELECT 21 | [tripId] INT, 22 | [driverId] INT, 23 | [customerId] INT, 24 | [cabId] INT, 25 | [tripDate] INT, 26 | [startLocation] VARCHAR (50), 27 | [endLocation] VARCHAR (50) 28 | FROM 29 | OPENROWSET(BULK '/partition/year=2022/*/*/*.parquet', FORMAT='PARQUET') 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter01/Readme.md: -------------------------------------------------------------------------------- 1 | # Chapter 1 2 | 3 | ## Overview 4 | In this chapter, we will see how to create: 5 | 6 | * Azure VMs 7 | * Azure File Shares 8 | * Azure Queues 9 | * Azure Tables 10 | 11 | 12 | ## Steps: 13 | 1. Install the Azure command line tool on your computer or directly open the Azure web command line tool and run the commands there. 14 | 2. You can find instructions to install Azure CLI here: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli-windows?tabs=azure-cli 15 | 3. You can also choose to run the commands directly on the Azure Cloud Shell. Refer here: https://docs.microsoft.com/en-us/azure/cloud-shell/quickstart 16 | 2. Start with the AzureCloudShell.ps1 script first. Enter your Azure details in that script and either execute the script manually or just run the complete script from the command line. 17 | 3. Try each of the other scripts and explore the options available with each of the commands. 18 | 19 | ## Azure CLI 20 | After installing Azure CLI on your computer, start a Windows PowerShell terminal and run the following command to log into Azure. 21 | ``` 22 | az login 23 | ``` 24 | This should take you to a browser where you can log in to Azure. 25 | Once logged in, start from the AzureCloudShell.ps1 script in this directory. 26 | -------------------------------------------------------------------------------- /Chapter12/AzureSQLAlwaysEncrypted-C12.sql: -------------------------------------------------------------------------------- 1 | -- Always encrypted example 2 | 3 | CREATE COLUMN MASTER KEY CMK 4 | WITH ( 5 | KEY_STORE_PROVIDER_NAME = 'AZURE_KEY_VAULT', 6 | KEY_PATH = '' 7 | ); 8 | 9 | CREATE COLUMN ENCRYPTION KEY CEK 10 | WITH VALUES 11 | ( 12 | COLUMN_MASTER_KEY = CMK, 13 | ALGORITHM = 'RSA_OAEP', 14 | ENCRYPTED_VALUE = 15 | ); 16 | 17 | CREATE TABLE Customer ( 18 | [name] VARCHAR(30), 19 | [email] VARCHAR(10) 20 | COLLATE Latin1_General_BIN2 ENCRYPTED WITH (COLUMN_ENCRYPTION_KEY = CEK, 21 | ENCRYPTION_TYPE = RANDOMIZED, 22 | ALGORITHM = 'AEAD_AES_256_CBC_HMAC_SHA_256'), 23 | [phone] VARCHAR (12), 24 | [SSN] VARCHAR (11) 25 | COLLATE Latin1_General_BIN2 ENCRYPTED WITH (COLUMN_ENCRYPTION_KEY = CEK, 26 | ENCRYPTION_TYPE = DETERMINISTIC , 27 | ALGORITHM = 'AEAD_AES_256_CBC_HMAC_SHA_256'), 28 | ); 29 | 30 | INSERT INTO Customer VALUES (101, 'Alan Li', 'alan@li.com', '111-222-3333', '111-11-1111'); 31 | INSERT INTO Customer VALUES (102, 'Becky King', 'becky@king.com', '222-333-4444', '222-22-2222'); 32 | INSERT INTO Customer VALUES (103, 'Daniel Martin', 'daniel@someone.com', '333-444-555', '333-33-3333'); 33 | 34 | SELECT * FROM Customer; -------------------------------------------------------------------------------- /Chapter07/SynSQLServerlessParquet-C7.sql: -------------------------------------------------------------------------------- 1 | -- Reading parquet with Synapse SQL serverless 2 | 3 | SELECT 4 | TOP 100 * 5 | FROM 6 | OPENROWSET( 7 | BULK '', 8 | FORMAT = 'PARQUET' 9 | ) AS [result] 10 | 11 | 12 | -- Accessing data using external Table example 13 | IF NOT EXISTS (SELECT * FROM 
sys.external_file_formats WHERE name = 'Dp203ParquetFormat') 14 | CREATE EXTERNAL FILE FORMAT [Dp203ParquetFormat] 15 | WITH ( FORMAT_TYPE = PARQUET) 16 | GO 17 | 18 | IF NOT EXISTS (SELECT * FROM sys.external_data_sources WHERE name = 'Dp203DataSource') 19 | CREATE EXTERNAL DATA SOURCE [Dp203DataSource] 20 | WITH ( 21 | LOCATION = '' 22 | ) 23 | GO 24 | 25 | DROP EXTERNAL TABLE TripsExtTable; 26 | 27 | CREATE EXTERNAL TABLE TripsExtTable ( 28 | [tripId] VARCHAR (10), 29 | [driverId] VARCHAR (10), 30 | [customerId] VARCHAR (10), 31 | [cabId] VARCHAR (10), 32 | [tripDate] VARCHAR (10), 33 | [startLocation] VARCHAR (50), 34 | [endLocation] VARCHAR (50) 35 | ) 36 | WITH ( 37 | LOCATION = '/parquet/trips/*.parquet', 38 | DATA_SOURCE = [Dp203DataSource], 39 | FILE_FORMAT = [Dp203ParquetFormat] 40 | ) 41 | GO 42 | 43 | SELECT TOP 100 * FROM TripsExtTable 44 | GO 45 | 46 | SELECT COUNT(*) AS 'Trips', [startLocation] AS 'Location' FROM TripsExtTable GROUP BY startLocation; 47 | -------------------------------------------------------------------------------- /Chapter12/SynSQLDDM-C12.sql: -------------------------------------------------------------------------------- 1 | -- Dynamic Data Masking examples 2 | 3 | -- DROP TABLE dbo.CustomerDDM 4 | 5 | CREATE TABLE dbo.CustomerDDM 6 | ( 7 | [customerId] INT NOT NULL, 8 | [name] VARCHAR(40) NOT NULL, 9 | [email] VARCHAR(100), 10 | [phoneNum] VARCHAR(40), 11 | [city] VARCHAR(40), 12 | [SSN] VARCHAR (12) 13 | ) 14 | WITH ( 15 | CLUSTERED COLUMNSTORE INDEX, 16 | DISTRIBUTION = REPLICATE 17 | ) 18 | GO 19 | 20 | INSERT INTO dbo.CustomerDDM(customerId, name, email, phoneNum, city, SSN) VALUES (301, 'Sarah', 'sarah@ryan.com', '(465)-111-xxxx', 'New York', '111-22-3333'); 21 | INSERT INTO dbo.CustomerDDM(customerId, name, email, phoneNum, city, SSN) VALUES (303, 'Ryan', 'ryan@ryan.com', '(122)-222-xxxx', 'Phoenix', '222-33-4444'); 22 | INSERT INTO dbo.CustomerDDM(customerId, name, email, phoneNum, city, SSN) VALUES (303, 'alicia', 'alicia@alicia.com', '(354)-333-xxxx', 'LA', '333-44-5555'); 23 | 24 | SELECT * from dbo.CustomerDDM; 25 | 26 | ALTER TABLE [dbo].[CustomerDDM] ALTER COLUMN SSN ADD MASKED WITH (FUNCTION = 'PARTIAL(0,"xxx-xx-", 4)'); 27 | ALTER TABLE [dbo].[CustomerDDM] ALTER COLUMN email ADD MASKED WITH (FUNCTION = 'email()'); 28 | GO 29 | 30 | -- Impersonate a low priv user for testing: 31 | CREATE USER MaskingTestUser WITHOUT LOGIN; 32 | GRANT SELECT ON SCHEMA::dbo TO MaskingTestUser; 33 | 34 | EXECUTE AS USER = 'MaskingTestUser'; 35 | SELECT * from dbo.CustomerDDM; 36 | REVERT; 37 | -------------------------------------------------------------------------------- /Chapter14/SynSQLQueryPlan-C14.sql: -------------------------------------------------------------------------------- 1 | -- SQL Query Plan Example 2 | 3 | -- DROP TABLE dbo.DimDriver; 4 | 5 | -- Create a Driver table 6 | CREATE TABLE dbo.DimDriver 7 | ( 8 | [driverId] INT NOT NULL, 9 | [firstName] VARCHAR (40), 10 | [middleName] VARCHAR(40), 11 | [lastName] VARCHAR(40), 12 | [city] VARCHAR (40), 13 | [gender] VARCHAR(40), 14 | [salary] INT 15 | ) 16 | WITH 17 | ( 18 | CLUSTERED COLUMNSTORE INDEX 19 | ) 20 | GO 21 | 22 | -- Insert some sample values 23 | 24 | INSERT INTO dbo.DimDriver VALUES (210, 'Alicia','','Yang','New York', 'Female', 2000); 25 | INSERT INTO dbo.DimDriver VALUES (211, 'Brandon','','Rhodes','New York','Male', 3000); 26 | INSERT INTO dbo.DimDriver VALUES (212, 'Cathy','','Mayor','California','Female', 3000); 27 | INSERT INTO dbo.DimDriver VALUES (213, 
'Dennis','','Brown','Florida','Male', 2500); 28 | INSERT INTO dbo.DimDriver VALUES (214, 'Jeremey','','Stilton','Arizona','Male', 2500); 29 | INSERT INTO dbo.DimDriver VALUES (215, 'Maile','','Green','Florida','Female', 4000); 30 | 31 | SELECT * FROM dbo.DimDriver; 32 | 33 | -- Let us run a query that has some aggregations 34 | Select [gender], AVG([salary]) AS 'AVG salary' from dbo.DimDriver GROUP BY [gender]; 35 | 36 | -- Use the EXPLAIN option to see the query plan 37 | EXPLAIN WITH_RECOMMENDATIONS 38 | SELECT 39 | [gender],SUM([salary]) as Totalsalary 40 | FROM 41 | dbo.DimDriver 42 | GROUP BY 43 | [gender] 44 | -------------------------------------------------------------------------------- /Chapter02/SynSQLDataPruning-C2.sql: -------------------------------------------------------------------------------- 1 | -- Dedicated SQL pool example with pruning 2 | 3 | -- The following table has been partitioned using the PARTITION keyword on tripDate. 4 | 5 | CREATE TABLE dbo.TripTable 6 | ( 7 | [tripId] INT NOT NULL, 8 | [driverId] INT NOT NULL, 9 | [customerId] INT NOT NULL, 10 | [tripDate] INT, 11 | [startLocation] VARCHAR(40), 12 | [endLocation] VARCHAR(40) 13 | ) 14 | WITH 15 | ( 16 | CLUSTERED COLUMNSTORE INDEX, 17 | DISTRIBUTION = HASH ([tripId]), 18 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 19 | ( 20220101, 20220201, 20220301 ) 20 | ) 21 | ) 22 | GO 23 | 24 | INSERT INTO dbo.TripTable VALUES (100, 200, 300, 20220101, 'New York', 'New Jersey'); 25 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 26 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 27 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 28 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 29 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 30 | 31 | -- If find all the customers who traveled with IAC in the month of January. 32 | -- All you need to do is use a simple filter, such as in the following example: 33 | -- Tis will ensure that only the data in the below partitions are read and not a 34 | -- full table scan. 35 | 36 | SELECT customerId FROM TripTable WHERE tripDate BETWEEN '20220101' AND '20220131' 37 | 38 | -------------------------------------------------------------------------------- /Chapter14/SynSQLUDF-C14.sql: -------------------------------------------------------------------------------- 1 | -- Synapse SQL UDF Example 2 | 3 | -- Drop the UDF if it already exists 4 | DROP FUNCTION IF EXISTS dbo.isValidEmail 5 | GO 6 | -- Create an UDF to check valid emails. 
It returns 'Not Available' for all invalid emails 7 | CREATE FUNCTION dbo.isValidEmail(@EMAIL VARCHAR(100)) 8 | RETURNS VARCHAR(100) AS 9 | BEGIN 10 | DECLARE @returnValue AS VARCHAR(100) 11 | DECLARE @EmailText VARCHAR(100) 12 | SET @EmailText= isnull(@EMAIL,'') 13 | SET @returnValue = CASE WHEN @EmailText NOT LIKE '_%@_%._%' THEN 'Not Available' 14 | ELSE @EmailText 15 | end 16 | RETURN @returnValue 17 | END 18 | GO 19 | -- Drop the sample table if it already exists 20 | -- DROP TABLE dbo.CustomerContact; 21 | -- Create a sample table 22 | 23 | -- DROP TABLE dbo.CustomerContact 24 | 25 | CREATE TABLE dbo.CustomerContact 26 | ( 27 | [CustomerID] INT, 28 | [Name] VARCHAR(100), 29 | [Email] VARCHAR(100) 30 | ) 31 | 32 | -- Insert some dummy values 33 | INSERT INTO dbo.CustomerContact VALUES (1, 'Arielle', 'arielle'); 34 | INSERT INTO dbo.CustomerContact VALUES (2, 'Bran', 'bryan@domain.com'); 35 | INSERT INTO dbo.CustomerContact VALUES (3, 'Cathy', 'cathy@domain.com'); 36 | INSERT INTO dbo.CustomerContact VALUES (4, 'Demin', 'demin@wrongdomain'); 37 | INSERT INTO dbo.CustomerContact VALUES (5, 'Ethan', 'ethan@domain.com'); 38 | 39 | -- View the rows in the table 40 | SELECT * FROM dbo.CustomerContact; 41 | 42 | 43 | -- Here is how you can use the UDF 44 | SELECT CustomerID, Name, dbo.isValidEmail(Email) AS Email FROM dbo.CustomerContact; -------------------------------------------------------------------------------- /Chapter12/SynSQLColLevelSecurity-C12.sql: -------------------------------------------------------------------------------- 1 | -- COLUMN SECURITY example 2 | 3 | -- Create a sample Customer table 4 | 5 | CREATE TABLE dbo.DimCustomer 6 | ( 7 | [customerId] INT NOT NULL, 8 | [name] VARCHAR(40) NOT NULL, 9 | [emailId] VARCHAR(40), 10 | [phoneNum] VARCHAR(40), 11 | [city] VARCHAR(40) 12 | ) 13 | WITH 14 | ( 15 | CLUSTERED COLUMNSTORE INDEX, 16 | DISTRIBUTION = REPLICATE 17 | ) 18 | GO 19 | 20 | INSERT INTO dbo.DimCustomer(customerId, name, emailId, phoneNum, city) VALUES (301, 'Sarah', 'sarah@ryan.com', '(465)-xxx-xxxx', 'New York'); 21 | INSERT INTO dbo.DimCustomer(customerId, name, emailId, phoneNum, city) VALUES (303, 'Ryan', 'ryan@ryan.com', '(122)-xxx-xxxx', 'Phoenix'); 22 | 23 | SELECT * from dbo.DimCustomer; 24 | 25 | -- Create two users: HiPriv_User and LowPriv_User 26 | -- Let us assume the HiPriv_User will have access to all the rows 27 | -- And LowPriv_User will not have access to rows with TripID > 900 28 | CREATE USER HiPriv_User WITHOUT LOGIN; 29 | CREATE USER LowPriv_User WITHOUT LOGIN; 30 | 31 | -- Grant the right Privileges to HiPriv_User and LowPriv_User 32 | GRANT SELECT ON dbo.DimCustomer (customerId, name, city) TO LowPriv_User; 33 | GRANT SELECT ON dbo.DimCustomer (customerId, name, emailId, phoneNum, city) TO HiPriv_User; 34 | 35 | -- Now run the query as HiPriv_User. You will see all the columns 36 | EXECUTE AS USER = 'HiPriv_User'; 37 | SELECT * from dbo.DimCustomer 38 | REVERT; 39 | 40 | -- Now run the query as LowPriv_User. 
You will only see customerId, name, city 41 | EXECUTE AS USER = 'LowPriv_User'; 42 | SELECT * from dbo.DimCustomer 43 | REVERT; 44 | -------------------------------------------------------------------------------- /Chapter05/SynSQLPartitions-C5.sql: -------------------------------------------------------------------------------- 1 | -- Synapse SQL Partition Example 2 | 3 | CREATE TABLE dbo.TripTable 4 | ( 5 | [tripId] INT NOT NULL, 6 | [driverId] INT NOT NULL, 7 | [customerId] INT NOT NULL, 8 | [tripDate] INT, 9 | [startLocation] VARCHAR(40), 10 | [endLocation] VARCHAR (40) 11 | ) 12 | WITH 13 | ( 14 | CLUSTERED COLUMNSTORE INDEX, 15 | DISTRIBUTION = HASH ([tripId]), 16 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 17 | ( 20220101, 20220201, 20220301 ) 18 | ) 19 | ) 20 | 21 | 22 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 23 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 24 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 25 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 26 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 27 | 28 | SELECT * from dbo.TripTable; 29 | 30 | -- You can use this query to find the partition details 31 | 32 | SELECT QUOTENAME(s.[name])+'.'+QUOTENAME(t.[name]) as Table_name 33 | , i.[name] as Index_name 34 | , p.partition_number as Partition_nmbr 35 | , p.[rows] as Row_count 36 | , p.[data_compression_desc] as Data_Compression_desc 37 | FROM sys.partitions p 38 | JOIN sys.tables t ON p.[object_id] = t.[object_id] 39 | JOIN sys.schemas s ON t.[schema_id] = s.[schema_id] 40 | JOIN sys.indexes i ON p.[object_id] = i.[object_Id] 41 | AND p.[index_Id] = i.[index_Id] 42 | WHERE t.[name] = 'TripTable' 43 | ; 44 | 45 | -------------------------------------------------------------------------------- /Chapter10/ASAWindowedAggregates.sql: -------------------------------------------------------------------------------- 1 | -- ASA Windowed Aggregates Examples 2 | -- You will have to copy these SQL snippets into your ASA Job Query section and run it one by one. 3 | 4 | -- Tumbling window example in ASA. It calculates the number of trips grouped by Location, in 10-second-wide tumbling windows. 5 | SELECT System.Timestamp() AS WindowEnd, tripLocation, COUNT(*) 6 | INTO [Output] 7 | FROM [Input] TIMESTAMP BY createdAt 8 | GROUP BY tripLocation, TumblingWindow(Duration(second, 10), Offset(millisecond, -1)) 9 | 10 | -- Hopping window example. Every 10 seconds, fetch the trip count per location for the last 20 seconds. 11 | -- Here the window size is 20 seconds, and the hop size is 10 seconds. 12 | SELECT System.Timestamp() AS WindowEnd, tripLocation, COUNT(*) 13 | INTO [Output] 14 | FROM [Input] TIMESTAMP BY createdAt 15 | GROUP BY tripLocation, HoppingWindow(Duration(second, 20), Hop(second, 10), Offset(millisecond, -1)) 16 | 17 | -- Sliding window example. For every 10 seconds, alert if a location appears more than 5 times. 18 | SELECT System.Timestamp() AS WindowEnd, tripLocation, COUNT(*) 19 | INTO [Output] 20 | FROM [Input] TIMESTAMP BY createdAt 21 | GROUP BY tripLocation, SlidingWindow(second, 10) 22 | HAVING COUNT(*) > 5 23 | 24 | -- Session window example. Find the number of trips that occur within 5 seconds of each other. 
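-- (In the SessionWindow(second, 5, 10) clause below, 5 is the timeout in seconds,
-- meaning a window closes after 5 seconds with no new events, and 10 is the
-- maximum window duration in seconds.)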
25 | SELECT System.Timestamp() AS WindowEnd, tripId, COUNT(*) 26 | INTO [Output] 27 | FROM [Input] TIMESTAMP BY createdAt 28 | GROUP BY tripId, SessionWindow(second, 5, 10) 29 | 30 | -- Snapshot window example 31 | SELECT tripId, COUNT(*) 32 | INTO [Output] 33 | FROM [Input] TIMESTAMP BY createdAt 34 | GROUP BY tripId, System.Timestamp() 35 | -------------------------------------------------------------------------------- /Chapter13/SynSQLServerlessStats-C13.sql: -------------------------------------------------------------------------------- 1 | -- Creating statistics for Synapse serverless pools 2 | -- We will first create an external table and then create statistics for that table. 3 | 4 | -- The concept of statistics is the same for dedicated and serverless pools. 5 | -- In case of serverless pools the auto-creation of statistics is turned on by default for Parquet files but not for CSV files. 6 | -- Since we deal with external tables in serverless pools, we will have to create statistics for external tables. 7 | 8 | IF NOT EXISTS (SELECT * FROM sys.external_file_formats WHERE name = 'Dp203ParquetFormat') 9 | CREATE EXTERNAL FILE FORMAT [Dp203ParquetFormat] 10 | WITH ( FORMAT_TYPE = PARQUET) 11 | GO 12 | 13 | IF NOT EXISTS (SELECT * FROM sys.external_data_sources WHERE name = 'Dp203DataSource') 14 | CREATE EXTERNAL DATA SOURCE [Dp203DataSource] 15 | WITH ( 16 | LOCATION = '' 17 | ) 18 | GO 19 | 20 | -- Here we are assuming that the Parquet files are present under parquet/trips folder 21 | CREATE EXTERNAL TABLE TripsExtTable ( 22 | [tripsId] nvarchar(100), 23 | [driverId] nvarchar(100), 24 | [customerId] nvarchar(100), 25 | [cabId] nvarchar(100), 26 | [tripDate] nvarchar(100), 27 | [startLocation] nvarchar(100), 28 | [endLocation] nvarchar(100) 29 | ) 30 | WITH ( 31 | LOCATION = 'parquet/trips/**', 32 | DATA_SOURCE = [Dp203DataSource], 33 | FILE_FORMAT = [Dp203ParquetFormat] 34 | ) 35 | GO 36 | 37 | 38 | SELECT TOP 100 * FROM dbo.TripsExtTable 39 | GO 40 | 41 | -- Now that we have the table, let us create stats on the tripsId column: 42 | 43 | CREATE STATISTICS TripStats 44 | ON dbo.TripsExtTable ( tripsId ) 45 | WITH FULLSCAN 46 | 47 | GO 48 | -------------------------------------------------------------------------------- /Chapter04/AzureSQLTemporalData-C4.sql: -------------------------------------------------------------------------------- 1 | -- Temporal Data Example 2 | -- Run this in Azure SQL 3 | 4 | CREATE TABLE Customer 5 | ( 6 | [customerId] INT NOT NULL PRIMARY KEY CLUSTERED, 7 | [name] VARCHAR(100) NOT NULL, 8 | [address] VARCHAR(100) NOT NULL, 9 | [email] VARCHAR (100) NOT NULL, 10 | [phone] VARCHAR(12) NOT NULL, 11 | [validFrom] DATETIME2 GENERATED ALWAYS AS ROW START, 12 | [validTo] DATETIME2 GENERATED ALWAYS AS ROW END, 13 | PERIOD FOR SYSTEM_TIME (validFrom, validTo), 14 | ) 15 | WITH (SYSTEM_VERSIONING = ON); 16 | 17 | -- Let us insert some dummy values 18 | INSERT INTO [dbo].[Customer] ([customerId], [name], [address], [email], [phone]) VALUES (101, 'Alan Li', '101 Test Lane, LA', 'alan@li.com', '111-222-3333'); 19 | INSERT INTO [dbo].[Customer] ([customerId], [name], [address], [email], [phone]) VALUES (102, 'Becky King', '202 Second Lane, SF', 'becky@king.com', '222-333-4444'); 20 | INSERT INTO [dbo].[Customer] ([customerId], [name], [address], [email], [phone]) VALUES (103, 'Daniel Martin', '303 Third Lane, NY', 'daniel@someone.com', '333-444-5555'); 21 | 22 | -- Check the values 23 | SELECT * FROM Customer; 24 | 25 | -- Now, let us update one of the table entries 
and see how the temporal table keeps track of the changes. 26 | 27 | UPDATE [dbo].[Customer] SET [address] = '111 Updated Lane, LA' WHERE [customerId] = 101; 28 | 29 | 30 | -- Change the dates to your current date before running this query 31 | SELECT [customerId] 32 | , [name] 33 | , [address] 34 | , [validFrom] 35 | , [validTo] 36 | , IIF (YEAR(validTo) = 9999, 1, 0) AS IsActual 37 | FROM [dbo].[Customer] 38 | FOR SYSTEM_TIME BETWEEN '2022-01-12' AND '2022-01-15' 39 | WHERE CustomerId = 101 40 | ORDER BY validFrom DESC; -------------------------------------------------------------------------------- /Chapter04/AzureSQLWaterMarks-C4.sql: -------------------------------------------------------------------------------- 1 | -- Watermark Example 2 | -- Run this in Azure SQL 3 | 4 | DROP TABLE IF EXISTS [dbo].[FactTrips]; 5 | 6 | CREATE TABLE FactTrips ( 7 | [TripID] INT, 8 | [CustomerID] INT, 9 | [LastModifiedTime] DATETIME2 10 | ); 11 | 12 | INSERT INTO [dbo].[FactTrips] VALUES (100, 200, CURRENT_TIMESTAMP); 13 | INSERT INTO [dbo].[FactTrips] VALUES (101, 201, CURRENT_TIMESTAMP); 14 | INSERT INTO [dbo].[FactTrips] VALUES (102, 202, CURRENT_TIMESTAMP); 15 | 16 | SELECT * FROM [dbo].[FactTrips]; 17 | 18 | -- A simple watermark table with just the table name and the last update value. 19 | DROP TABLE WatermarkTable; 20 | CREATE TABLE WatermarkTable 21 | ( 22 | [TableName] VARCHAR(100), 23 | [WatermarkValue] DATETIME, 24 | ); 25 | 26 | INSERT INTO [dbo].[WatermarkTable] VALUES ('FactTrips', CURRENT_TIMESTAMP); 27 | SELECT * FROM WatermarkTable; 28 | GO 29 | 30 | -- You can either update the Watermark table manually as shown or create a stored procedure and execute it everytime there is an update 31 | UPDATE [dbo].[WatermarkTable] SET [WatermarkValue] = CURRENT_TIMESTAMP WHERE [TableName] = 'FactTrips'; 32 | 33 | -- Creating a stored procedure to update the watermark whenever there is an update to the FactTable 34 | DROP PROCEDURE uspUpdateWatermark 35 | GO 36 | 37 | CREATE PROCEDURE [dbo].uspUpdateWatermark @LastModifiedtime DATETIME, @TableName VARCHAR(100) 38 | AS 39 | BEGIN 40 | UPDATE [dbo].[WatermarkTable] SET [WatermarkValue] = @LastModifiedtime WHERE [TableName] = @TableName 41 | END 42 | GO 43 | 44 | -- Executing the stored procedure 45 | DECLARE @timestamp AS DATETIME = CURRENT_TIMESTAMP; 46 | EXECUTE uspUpdateWatermark @LastModifiedtime=@timestamp, @TableName='FactTrips'; 47 | 48 | SELECT * FROM WatermarkTable; -------------------------------------------------------------------------------- /Chapter13/SynSQLDedicatedStats-C13.sql: -------------------------------------------------------------------------------- 1 | -- Creating statistics for Synapse dedicated pools 2 | 3 | -- Create the Fact Table. In our case it would be the TripTable 4 | -- DROP TABLE dbo.FactTrips; 5 | 6 | CREATE TABLE dbo.FactTrips 7 | ( 8 | [tripId] int NOT NULL, 9 | [driverId] int NOT NULL, 10 | [customerId] int NOT NULL, 11 | [tripDate] int, 12 | [startLocation] VARCHAR (40), 13 | [endLocation] VARCHAR (40) 14 | ) 15 | WITH 16 | ( 17 | CLUSTERED COLUMNSTORE INDEX, 18 | DISTRIBUTION = HASH ([tripId]) 19 | ) 20 | GO 21 | 22 | -- Insert some sample values. In reality the Fact tables will have Millions of rows. 
23 | 24 | INSERT INTO dbo.FactTrips VALUES (101, 201, 301, 20220101, 'New York', 'New Jersey'); 25 | INSERT INTO dbo.FactTrips VALUES (102, 202, 302, 20220101, 'Miami', 'Dallas'); 26 | INSERT INTO dbo.FactTrips VALUES (103, 203, 303, 20220102, 'Phoenix', 'Tempe'); 27 | INSERT INTO dbo.FactTrips VALUES (104, 204, 304, 20220204, 'LA', 'San Jose'); 28 | INSERT INTO dbo.FactTrips VALUES (105, 205, 305, 20220205, 'Seattle', 'Redmond'); 29 | INSERT INTO dbo.FactTrips VALUES (106, 206, 306, 20220301, 'Atlanta', 'Chicago'); 30 | 31 | SELECT * from dbo.FactTrips; 32 | 33 | -- You can enable statistics in Synapse SQL dedicated pools using the following command 34 | ALTER DATABASE SET AUTO_CREATE_STATISTICS ON 35 | 36 | -- Once the AUTO_CREATE_STATISTICS is ON, any of SELECT, INSERT-SELECT, CTAS, UPDATE, DELETE or EXPLAIN 37 | -- statements will automatically trigger the creation of statistics for the columns involved in the query, if not already present. 38 | -- Automatic creation of statistics is not available for temporary or external tables. 39 | -- You can create statistics on-demand using the following command. 40 | 41 | CREATE STATISTICS TripStats 42 | ON dbo.FactTrips (tripId) 43 | WITH SAMPLE 40 PERCENT; 44 | 45 | -- In the preceding example, we are using a 40% sample. If you do not provide a sample value, the default is 20%. 46 | -- You can also do a full scan instead of sampling using the following command. 47 | CREATE STATISTICS TripStats 48 | ON dbo.FactTrips (tripId) 49 | WITH FULLSCAN; 50 | -------------------------------------------------------------------------------- /Chapter01/CreateAzureQueues.ps1: -------------------------------------------------------------------------------- 1 | #Creating Azure Queues using the CLI 2 | 3 | $resourceGroup = "" 4 | $storageAccount ="" 5 | $storageKey = "" 6 | # Note: It is not recommended to store passwords or access keys in code files. 7 | # Please use AAD accounts and Azure Key Vault to store secrets 8 | $region = "" 9 | $queueName = "" 10 | 11 | #The following variations are accepted for setting the Key: 12 | # (1) account name and key (--account-name and --account-key options or 13 | # set AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY environment variables) 14 | # (2) account name and SAS token (--sas-token option used with either the --account-name 15 | # option or AZURE_STORAGE_ACCOUNT environment variable) 16 | # (3) account name (--account-name option or AZURE_STORAGE_ACCOUNT environment variable; 17 | # this will make calls to query for a storage account key using login credentials) 18 | # (4) connection string (--connection-string option or 19 | # set AZURE_STORAGE_CONNECTION_STRING environment variable); some shells will require 20 | # quoting to preserve literal character interpretation. 21 | 22 | # NOTE: The following env variable syntax works for Powershell on Windows. You might have to set the env variables differently for other OS. 
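# For example, in a bash shell the equivalent would be:
#   export AZURE_STORAGE_ACCOUNT="<storage-account-name>"
#   export AZURE_STORAGE_KEY="<storage-account-key>"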
23 | $env:AZURE_STORAGE_ACCOUNT=$storageAccount 24 | $env:AZURE_STORAGE_KEY=$storageKey 25 | 26 | #You can create a new Azure queue using the storage queue create command: 27 | az storage queue create --name $queueName --account-name $storageAccount 28 | #You can easily list the queues under a storage account using the storage queue list term: 29 | az storage queue list --account-name $storageAccount 30 | #You can add a new message to the newly created Queue using the storage message put option: 31 | az storage message put --queue-name $queueName --content "test" 32 | #Finally, use the storage message peek command to view the message. This command retrieves one or more messages from the front of the queue but does not alter the visibility of the message: 33 | az storage message peek --queue-name $queueName -------------------------------------------------------------------------------- /Chapter01/CreateAzureTable.ps1: -------------------------------------------------------------------------------- 1 | # Creating Azure Table 2 | 3 | $resourceGroup = "" 4 | $storageAccount ="" 5 | $storageKey = "" 6 | # Note: It is not recommended to store passwords or access keys in code files. 7 | # Please use AAD accounts and Azure Key Vault to store secrets 8 | $region = "" 9 | $tableName = "" 10 | 11 | #The following variations are accepted for setting the Key: 12 | # (1) account name and key (--account-name and --account-key options or 13 | # set AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY environment variables) 14 | # (2) account name and SAS token (--sas-token option used with either the --account-name 15 | # option or AZURE_STORAGE_ACCOUNT environment variable) 16 | # (3) account name (--account-name option or AZURE_STORAGE_ACCOUNT environment variable; 17 | # this will make calls to query for a storage account key using login credentials) 18 | # (4) connection string (--connection-string option or 19 | # set AZURE_STORAGE_CONNECTION_STRING environment variable); some shells will require 20 | # quoting to preserve literal character interpretation. 21 | 22 | # NOTE: The following env variable syntax works for Powershell on Windows. You might have to set the env variables differently for other OS. 23 | $env:AZURE_STORAGE_ACCOUNT=$storageAccount 24 | $env:AZURE_STORAGE_KEY=$storageKey 25 | 26 | 27 | # We can create a new Azure Table for our example company, IAC, by using the storage table create option: 28 | az storage table create --name $tableName --account-name $storageAccount 29 | # We can easily list the Tables under a storage account using the storage table list option: 30 | az storage table list --account-name $storageAccount 31 | # We can insert an entity into the newly created Table using the storage entity insert option: 32 | az storage entity insert --table-name $tableName --entity PartitionKey=testPartKey RowKey=testRowKey Content=testContent 33 | # Finally, we can use the storage entity show command to view the entry: 34 | az storage entity show --table-name $tableName --partition-key testPartKey --row-key testRowKey -------------------------------------------------------------------------------- /Chapter01/CreateAzureFileShare.ps1: -------------------------------------------------------------------------------- 1 | #Creating Azure File shares 2 | 3 | $resourceGroup = "" 4 | $storageAccount ="" 5 | $storageKey = "" 6 | # Note: It is not recommended to store passwords or access keys in code files. 
7 | # Please use AAD accounts and Azure Key Vault to store secrets 8 | $region = "" 9 | $fileshareName = "" 10 | 11 | 12 | # The following variations are accepted for setting the Key: 13 | # (1) account name and key (--account-name and --account-key options or 14 | # set AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY environment variables) 15 | # (2) account name and SAS token (--sas-token option used with either the --account-name 16 | # option or AZURE_STORAGE_ACCOUNT environment variable) 17 | # (3) account name (--account-name option or AZURE_STORAGE_ACCOUNT environment variable; 18 | # this will make calls to query for a storage account key using login credentials) 19 | # (4) connection string (--connection-string option or 20 | # set AZURE_STORAGE_CONNECTION_STRING environment variable); some shells will require 21 | # quoting to preserve literal character interpretation. 22 | 23 | # NOTE: The following env variable syntax works for Powershell on Windows. You might have to set the env variables differently for other OS. 24 | $env:AZURE_STORAGE_ACCOUNT=$storageAccount 25 | $env:AZURE_STORAGE_KEY=$storageKey 26 | 27 | #You can create a new Azure File Share for IAC using the share-rm create option: 28 | az storage share-rm create --resource-group $resourceGroup --storage-account $storageAccount --name $fileshareName --quota 1024 29 | #You can list the file shares using the share list option: 30 | az storage share list --account-name $storageAccount 31 | #You can put a file into our File share using the file upload option: 32 | az storage file upload --share-name $fileshareName --source Data/testfile.txt 33 | #You can view the files in your File share using file list: 34 | az storage file list --share-name $fileshareName 35 | #Finally, you can download the file that we previously uploaded using the file download option: 36 | az storage file download --share-name $fileshareName -p testfile.txt --dest ./testfile.txt -------------------------------------------------------------------------------- /Chapter10/EventGenerator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from azure.eventhub.aio import EventHubProducerClient 3 | from azure.eventhub import EventData 4 | 5 | import json 6 | import datetime 7 | import uuid 8 | import random 9 | import time 10 | 11 | async def run(): 12 | # Create a producer client to send messages to the event hub. 13 | # Specify a connection string to your event hubs namespace and 14 | # the event hub name. 15 | 16 | producer = EventHubProducerClient.from_connection_string(conn_str="", eventhub_name="") 17 | 18 | city_list = ["San Franciso", "San Jose", "Los Angesles", "Seattle","Austin", "Dallas", "Denver", "New York", "Atlanta", "Miami", "Phoenix", "Tempe"] 19 | async with producer: 20 | for i in range(0, 600): 21 | # Create a batch. 
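# Each loop iteration builds one batch of randomly generated trip events
# (note that tripdetail1 is added twice), sends the batch to the event hub,
# and then sleeps for one second, so the script emits events for roughly
# 600 seconds (about 10 minutes) in total.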
22 | event_data_batch = await producer.create_batch() 23 | tripdetail1 = {'tripId': str(uuid.uuid4()), 'createdAt': str(datetime.datetime.utcnow()), 'startLocation': random.choice(city_list), 'endLocation': random.choice(city_list), 24 | 'distance': random.randint(10, 1000), 'fare': random.randint(100, 1000) } 25 | tripdetail2 = {'tripId': str(uuid.uuid4()), 'createdAt': str(datetime.datetime.utcnow()), 'startLocation': random.choice(city_list), 'endLocation': random.choice(city_list), 26 | 'distance': random.randint(10, 1000), 'fare': random.randint(100, 1000) } 27 | tripdetail3 = {'tripId': str(uuid.uuid4()), 'createdAt': str(datetime.datetime.utcnow()), 'startLocation': random.choice(city_list), 'endLocation': random.choice(city_list), 28 | 'distance': random.randint(10, 1000), 'fare': random.randint(100, 1000) } 29 | 30 | print (tripdetail1); 31 | print (tripdetail2); 32 | print (tripdetail3); 33 | # Add events to the batch. 34 | event_data_batch.add(EventData(json.dumps(tripdetail1))) 35 | event_data_batch.add(EventData(json.dumps(tripdetail2))) 36 | event_data_batch.add(EventData(json.dumps(tripdetail3))) 37 | event_data_batch.add(EventData(json.dumps(tripdetail1))) 38 | 39 | 40 | # Send the batch of events to the event hub. 41 | await producer.send_batch(event_data_batch) 42 | time.sleep(1) 43 | 44 | loop = asyncio.get_event_loop() 45 | loop.run_until_complete(run()) 46 | 47 | -------------------------------------------------------------------------------- /Chapter09/AzureBatch.ps1: -------------------------------------------------------------------------------- 1 | # Azure Batch Example 2 | 3 | # In this example, we will learn how to create an Azure Batch account and setup a pool of VMs to execute the job. 4 | # We will then learn how to run an application on the pool and download the results. Y 5 | 6 | $resourceGroup = "" 7 | $storageAccount ="" 8 | $batchAccount ="" 9 | 10 | $appName = "sampleApp" 11 | $poolName = "samplePool" 12 | $jobName = "sampleJob" 13 | $taskName= "sampleTask" 14 | $region = "East US" 15 | 16 | # Creat the Resource group if not already created 17 | az group create --name $resourceGroup --location $region 18 | # Create a Batch Account as shown. 19 | az batch account create -g $resourceGroup -n $batchAccount -l $region 20 | # Create a Storage account as shown. 21 | az storage account create -g $resourceGroup -n $storageAccount -l $region --sku Standard_LRS 22 | # Now, link the storage account to the batch account. 23 | az batch account set -g $resourceGroup -n $batchAccount --storage-account $storageAccount 24 | # Next, create a pool using Ubuntu VMs to run our Batch application. This operation takes a few minutes. 25 | az batch pool create --id $poolName --vm-size Standard_A1_v2 --target-dedicated-nodes 2 --image canonical:ubuntuserver:18.04-LTS --node-agent-sku-id "batch.node.ubuntu 18.04" 26 | # You can check the status of the pool creation as shown. 27 | az batch pool show --pool-id $poolName --query "allocationState" 28 | # Next, create an application that needs to be run by the Batch job. 29 | az batch application create --resource-group $resourceGroup --name $batchAccount --application-name $appName 30 | # Next, create a Job 31 | az batch job create --id $jobName --pool-id $poolName 32 | 33 | # Create the tasks under the job. The tasks will start running as soon as you create them. 
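# NOTE: the loop below uses bash-style syntax. If you are running this script
# from a PowerShell terminal instead, a rough equivalent would be:
#   foreach ($i in 1..4) {
#       az batch task create --task-id "$taskName$i" --job-id $jobName --command-line "/bin/bash -c 'printenv; sleep 30s'"
#   }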
34 | for i in {1..4} 35 | do 36 | az batch task create --task-id $taskName$i --job-id $jobName --command-line "/bin/bash -c 'printenv; sleep 30s'" 37 | done 38 | # Monitor the jobs as shown. 39 | az batch task show --job-id $jobName --task-id $taskName 40 | 41 | # Download the results as shown. 42 | az batch task file download --job-id $jobName --task-id $taskName --file-path stdout.txt --destination ./stdout.txt 43 | 44 | # Finally, you can delete each of the entities as shown: 45 | az batch job delete --job-id $jobName 46 | az batch task delete -job-id $jobName --task-id $taskName 47 | az batch pool delete --pool-id $poolName 48 | -------------------------------------------------------------------------------- /Chapter01/CreateVM.ps1: -------------------------------------------------------------------------------- 1 | ## Creating VM 2 | 3 | $resourceGroup = "" 4 | $region = "" 5 | 6 | 7 | $vmName = "sampleVM" 8 | $password = "" 9 | # Note: It is not recommended to store passwords or access keys in code files. 10 | # Please use AAD accounts and Azure Key Vault to store secrets 11 | 12 | $image = "UbuntuLTS" 13 | $diskName = "sampleDisk" 14 | $subnetName = "sampleSubnet" 15 | $pubIpName = "samplePubIp" 16 | $vnetName = "sampleVnet" 17 | $nicName = "sampleNIC" 18 | 19 | # First, we have to find all the Ubuntu images that are available using the vm image list option: 20 | az vm image list --all --offer Ubuntu --all 21 | # Next, we need to find the Azure regions where we want to deploy. We can use account list-locations for this. You can choose a region that is closest to you: 22 | az account list-locations --output table 23 | 24 | # Once we’ve done this, we can either create a new resource group or use an existing one to associate this VM with. You can create a new resource group using the group create option, as shown here: 25 | az group create --name $resourceGroup --location $region 26 | 27 | # Finally, we can create a VM using the information from the preceding commands. In this example, I’ve chosen the eastus location to deploy this VM to. All the non-mandatory fields will default to the Azure default values: 28 | az vm create --resource-group $resourceGroup --name $vmName --image $image --admin-username --admin-password $password --location $region 29 | 30 | ## Creating and attaching Managed Disks to a VM using the CLI 31 | az vm disk attach --resource-group $resourceGroup --vm-name $vmName --name $diskName --size-gb 64 -new 32 | 33 | ## Creating an Azure VNet using the CLI 34 | # First, we need to create a VNET by specifying the necessary IP ranges and subnet prefixes: 35 | az network vnet create --address-prefixes 10.20.0.0/16 --name $vnetName --resource-group $resourceGroup --subnet-name $subnetName --subnet-prefixes 10.20.0.0/24 36 | # Then, we need to create a public IP so that we can access our VM from the internet: 37 | az network public-ip create --resource-group $resourceGroup --name $pubIpName --allocation-method dynamic 38 | # Next, we must create a network interface card (NIC), which will be the network interface between the VM and the outside world, with the previously created VNet and public IP: 39 | az network nic create --resource-group $resourceGroup --vnet-name $vnetName --subnet $subnetName --name $nicName --public-ip-address $pubIpName 40 | # We now have all the components required to create a VM within our new VNet, IACVnet. 
We can reuse the UbuntuLTS image that we used in the earlier virtual machine creation example to create a new VM within the new VNet: 41 | az vm create --resource-group $resourceGroup --name $vmName --nics $nicName --image $image --generate-ssh-keys 42 | -------------------------------------------------------------------------------- /Chapter12/SynSQLRowLevelSecurity-C12.sql: -------------------------------------------------------------------------------- 1 | -- Row Level Security 2 | 3 | -- Create sample Trips and Customer tables 4 | 5 | DROP TABLE dbo.TripTable; 6 | 7 | CREATE TABLE dbo.TripTable 8 | ( 9 | [tripId] INT NOT NULL, 10 | [driverId] INT NOT NULL, 11 | [customerId] INT NOT NULL, 12 | [tripDate] INT, 13 | [startLocation] VARCHAR (40), 14 | [endLocation] VARCHAR (40) 15 | ) 16 | WITH 17 | ( 18 | CLUSTERED COLUMNSTORE INDEX, 19 | DISTRIBUTION = HASH ([tripId]) 20 | ) 21 | 22 | INSERT INTO dbo.TripTable VALUES (111, 201, 301, 20220101, 'New York', 'New Jersey'); 23 | INSERT INTO dbo.TripTable VALUES (112, 202, 302, 20220101, 'Miami', 'Dallas'); 24 | INSERT INTO dbo.TripTable VALUES (113, 203, 302, 20220102, 'Phoenix', 'Tempe'); 25 | INSERT INTO dbo.TripTable VALUES (114, 204, 303, 20220204, 'LA', 'San Jose'); 26 | INSERT INTO dbo.TripTable VALUES (115, 205, 304, 20220205, 'Seattle', 'Redmond'); 27 | INSERT INTO dbo.TripTable VALUES (116, 203, 305, 20220301, 'Atlanta', 'Chicago'); 28 | 29 | -- Insert a row with tripId >= 900, as we will use that as the condition to restrict the row views to non-privileged users 30 | INSERT INTO dbo.TripTable VALUES (900, 299, 399, 20220301, 'Pre-Launch', 'Pre-Launch'); 31 | 32 | SELECT * from dbo.TripTable; 33 | 34 | -- Create two users: HiPriv_User and LowPriv_User 35 | -- Let us assume the HiPriv_User will have access to all the rows 36 | -- And LowPriv_User will not have access to rows with tripId >= 900 37 | CREATE USER HiPriv_User WITHOUT LOGIN; 38 | CREATE USER LowPriv_User WITHOUT LOGIN; 39 | 40 | GRANT SELECT ON dbo.TripTable TO HiPriv_User; 41 | GRANT SELECT ON dbo.TripTable TO LowPriv_User; 42 | GO 43 | 44 | -- Drop old policies and schemas if already created 45 | 46 | -- DROP SECURITY POLICY PrivFilter; 47 | -- DROP FUNCTION Security.tvf_securitypredicate; 48 | -- DROP SCHEMA Security; 49 | 50 | -- Create a new Security schema 51 | CREATE SCHEMA Security; 52 | GO 53 | 54 | -- Create the new function that has the business logic 55 | CREATE FUNCTION Security.tvf_securitypredicate(@tripId AS int) 56 | RETURNS TABLE 57 | WITH SCHEMABINDING 58 | AS 59 | RETURN SELECT 1 AS tvf_securitypredicate_result 60 | WHERE @tripId < 900 OR USER_NAME() = 'HiPriv_User'; 61 | GO 62 | 63 | -- Create a security policy and map the previous function to the table on which it needs to operate 64 | CREATE SECURITY POLICY PrivFilter ADD FILTER PREDICATE Security.tvf_securitypredicate(tripId) ON dbo.TripTable WITH (STATE = ON); 65 | GO 66 | 67 | -- Now run the query as HiPriv_User. You will see all the rows. 68 | EXECUTE AS USER = 'HiPriv_User'; 69 | SELECT * from dbo.TripTable; 70 | REVERT; 71 | 72 | -- Now run the query as LowPriv_User.
You will not see the rows with tripId >= 900 73 | EXECUTE AS USER = 'LowPriv_User'; 74 | SELECT * from dbo.TripTable 75 | REVERT; 76 | 77 | -- Turn off the filter 78 | ALTER SECURITY POLICY PrivFilter 79 | WITH (STATE = ON); 80 | -------------------------------------------------------------------------------- /Chapter08/EncodeAndDecodeUsingSpark-C8.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":["%sql\n-- Let us try to encode the word \"Azure\" and print it in hex format\nSELECT hex(encode('Azure', 'UTF-16'));"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3b8661c3-25cb-4ac2-96cf-3e3c98a2987b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"overflow":false,"datasetInfos":[],"data":[["FEFF0041007A007500720065"]],"plotOptions":{"displayType":"table","customPlotOptions":{},"pivotColumns":[],"pivotAggregation":null,"xColumns":[],"yColumns":[]},"columnCustomDisplayInfos":{},"aggType":"","isJsonSchema":true,"removedWidgets":[],"aggSchema":[],"schema":[{"name":"hex(encode(Azure, UTF-16))","type":"\"string\"","metadata":"{}"}],"aggError":"","aggData":[],"addedWidgets":{},"metadata":{},"dbfsResultPath":null,"type":"table","aggOverflow":false,"aggSeriesLimitReached":false,"arguments":{}}},"output_type":"display_data","data":{"text/html":["
hex(encode(Azure, UTF-16))
FEFF0041007A007500720065
"]},"transient":null}],"execution_count":0},{"cell_type":"code","source":["%sql\n-- Now let us try to decode the hex value\nSELECT decode(X'FEFF0041007A007500720065', 'UTF-16')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e1033c05-7f6e-41c0-9ffa-16725e11ade2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"overflow":false,"datasetInfos":[],"data":[["Azure"]],"plotOptions":{"displayType":"table","customPlotOptions":{},"pivotColumns":[],"pivotAggregation":null,"xColumns":[],"yColumns":[]},"columnCustomDisplayInfos":{},"aggType":"","isJsonSchema":true,"removedWidgets":[],"aggSchema":[],"schema":[{"name":"decode(X'FEFF0041007A007500720065', UTF-16)","type":"\"string\"","metadata":"{}"}],"aggError":"","aggData":[],"addedWidgets":{},"metadata":{},"dbfsResultPath":null,"type":"table","aggOverflow":false,"aggSeriesLimitReached":false,"arguments":{}}},"output_type":"display_data","data":{"text/html":["
decode(X'FEFF0041007A007500720065', UTF-16)
Azure
"]},"transient":null}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"EncodeAndDecodeUsingSpark-C8","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":188113580888545}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter14/SparkQueryPlan-C14.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5ce0d083-c50c-4e45-8fc2-85dc286f4331","showTitle":false,"title":""}},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"852dd2af-ff00-483c-9d0f-bd4f6d7a98fd","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5d61b9a4-591c-43bf-8d51-9c0a5423e2a7","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","// Let us run a simple join and then look at the query plan for it.\n","val jump2Numbers = spark.range(0, 100000,2) \n","val jump5Numbers = spark.range(0, 200000, 5) \n","val ds1 = jump2Numbers.repartition(3) \n","val ds2 = jump5Numbers.repartition(5) \n","val joined = ds1.join(ds2) \n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3767ab77-7f5e-4f8c-b216-31cd18f5a0fb","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","\n","// use the explain option to view the query 
plan\n","joined.explain"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"SparkQueryPlan-C14","notebookOrigID":188113580888591,"widgets":{}},"kernelspec":{"display_name":"python","name":"synapse_pyspark"},"language_info":{"name":"python"},"save_output":true,"synapse_widget":{"state":{},"version":"0.1"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter07/SynSQLStarSchema-C7.sql: -------------------------------------------------------------------------------- 1 | -- Example for creating a Star schema 2 | 3 | -- Create the Fact Table. In our case it would be the TripTable 4 | DROP TABLE dbo.FactTrips; 5 | 6 | CREATE TABLE dbo.FactTrips 7 | ( 8 | [tripId] INT NOT NULL, 9 | [driverId] INT NOT NULL, 10 | [customerId] INT NOT NULL, 11 | [tripdate] INT, 12 | [startLocation] VARCHAR (40), 13 | [endLocation] VARCHAR (40) 14 | ) 15 | WITH 16 | ( 17 | CLUSTERED COLUMNSTORE INDEX, 18 | DISTRIBUTION = HASH ([tripId]) 19 | ) 20 | GO 21 | 22 | -- Insert some sample values. In reality the Fact tables will have Millions of rows. 23 | 24 | INSERT INTO dbo.FactTrips VALUES (101, 201, 301, 20220101, 'New York', 'New Jersey'); 25 | INSERT INTO dbo.FactTrips VALUES (102, 202, 302, 20220101, 'Miami', 'Dallas'); 26 | INSERT INTO dbo.FactTrips VALUES (103, 203, 303, 20220102, 'Phoenix', 'Tempe'); 27 | INSERT INTO dbo.FactTrips VALUES (104, 204, 304, 20220204, 'LA', 'San Jose'); 28 | INSERT INTO dbo.FactTrips VALUES (105, 205, 305, 20220205, 'Seattle', 'Redmond'); 29 | INSERT INTO dbo.FactTrips VALUES (106, 206, 306, 20220301, 'Atlanta', 'Chicago'); 30 | 31 | 32 | -- Create the Customer Dimension table 33 | DROP TABLE dbo.DimCustomer; 34 | 35 | CREATE TABLE dbo.DimCustomer 36 | ( 37 | [customerId] INT NOT NULL, 38 | [name] VARCHAR(40) NOT NULL, 39 | [emailId] VARCHAR(40), 40 | [phoneNum] VARCHAR(40), 41 | [city] VARCHAR(40) 42 | ) 43 | WITH 44 | ( 45 | CLUSTERED COLUMNSTORE INDEX, 46 | DISTRIBUTION = REPLICATE 47 | ) 48 | GO 49 | 50 | -- Another way of inserting data using COPY INTO 51 | -- You will have to use the https format here instead of the abfss format 52 | -- Copy the customer.csv file in this directory to the ADLS Gen2 location and use that path here. 
53 | 54 | COPY INTO dbo.DimCustomer 55 | FROM '' 56 | WITH ( 57 | FILE_TYPE='CSV', 58 | FIELDTERMINATOR=',', 59 | FIELDQUOTE='', 60 | ROWTERMINATOR='\n', 61 | ENCODING = 'UTF8', 62 | FIRSTROW = 2 63 | ); 64 | 65 | SELECT * from dbo.DimCustomer; 66 | 67 | -- Create a Driver Dimension table 68 | CREATE TABLE dbo.DimDriver 69 | ( 70 | [driverId] INT NOT NULL, 71 | [firstName] VARCHAR(40), 72 | [middleName] VARCHAR(40), 73 | [lastName] VARCHAR(40), 74 | [city] VARCHAR(40), 75 | [gender] VARCHAR(40), 76 | [salary] INT 77 | ) 78 | WITH 79 | ( 80 | CLUSTERED COLUMNSTORE INDEX, 81 | DISTRIBUTION = REPLICATE 82 | ) 83 | GO 84 | 85 | -- Insert some sample values 86 | 87 | INSERT INTO dbo.DimDriver VALUES (210, 'Alicia','','Yang','New York', 'Female', 2000); 88 | INSERT INTO dbo.DimDriver VALUES (211, 'Brandon','','Rhodes','New York','Male', 3000); 89 | INSERT INTO dbo.DimDriver VALUES (212, 'Cathy','','Mayor','California','Female', 3000); 90 | INSERT INTO dbo.DimDriver VALUES (213, 'Dennis','','Brown','Florida','Male', 2500); 91 | INSERT INTO dbo.DimDriver VALUES (214, 'Jeremey','','Stilton','Arizona','Male', 2500); 92 | INSERT INTO dbo.DimDriver VALUES (215, 'Maile','','Green','Florida','Female', 4000); 93 | 94 | SELECT * from dbo.DimDriver; 95 | 96 | DROP TABLE dbo.DimDate 97 | -- Create the date dimension table 98 | CREATE TABLE dbo.DimDate 99 | ( 100 | [dateId] INT NOT NULL, 101 | [date] DATETIME NOT NULL, 102 | [dayOfWeek] VARCHAR(40), 103 | [fiscalQuarter] VARCHAR(40) 104 | ) 105 | WITH 106 | ( 107 | CLUSTERED COLUMNSTORE INDEX, 108 | DISTRIBUTION = REPLICATE 109 | ) 110 | GO 111 | 112 | INSERT INTO dbo.DimDate VALUES (20210101, '20210101','Saturday','Q3'); 113 | INSERT INTO dbo.DimDate VALUES (20210102, '20210102','Sunday','Q3'); 114 | INSERT INTO dbo.DimDate VALUES (20210103, '20210103','Monday','Q3'); 115 | INSERT INTO dbo.DimDate VALUES (20210104, '20210104','Tuesday','Q3'); 116 | INSERT INTO dbo.DimDate VALUES (20210105, '20210105','Wednesday','Q3'); 117 | 118 | 119 | 120 | -- Now run some sample queries 121 | 122 | SELECT trip.[tripId], customer.[name] from 123 | dbo.FactTrips AS trip 124 | JOIN dbo.DimCustomer AS customer 125 | ON trip.[customerId] = customer.[customerId] 126 | WHERE trip.[endLocation] = 'San Jose'; 127 | 128 | 129 | -------------------------------------------------------------------------------- /Chapter05/CompressionUsingSpark-C5.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a30cc73a-5137-42ef-970d-3ee4789114c7","showTitle":false,"title":""}},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. 
You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"94955981-3e4d-4aa2-8a93-d0fb116c328a","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","// Azure Databricks Spark setup to read Azure storage\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4b67b889-ce41-4ddf-8cdf-583de3e94591","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["\n","
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"arguments":{},"data":"
","datasetInfos":[],"metadata":{},"removedWidgets":[],"type":"html"}},"output_type":"display_data"}],"source":["# Compressing files using Spark\n","# Spark provides libraries that can directly write the outputs in compressed formats such as Parquet, ORC, and so on. On top of this, we can specify the compression algorithms to be used. For example, the following Python script stores the data in Parquet format using gzip compression:\n","columnNames = [\"name\",\"license\",\"gender\",\"salary\"]\n","driverData = [\n"," ('Alice', 'A224455', 'Female', 3000),\n"," ('Bryan','B992244','Male',4000),\n"," ('Catherine','C887733','Female',4000)\n","]\n","df = spark.createDataFrame(data= driverData, schema = columnNames)\n","df.write.option(\"compression\", \"gzip\").mode(\"overwrite\").parquet(commonPath + \"/compressed/\")\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0e9c4285-199d-4741-92fa-f58668b81b9d","showTitle":false,"title":""}},"outputs":[],"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"CompressionUsingSpark-C5","notebookOrigID":188113580888528,"widgets":{}},"kernelspec":{"display_name":"python","name":"synapse_pyspark"},"language_info":{"name":"python"},"save_output":true,"synapse_widget":{"state":{},"version":"0.1"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter05/PartitioningUsingSpark-C5.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c7ab3737-6159-48cc-a309-8a89455de4e8","showTitle":false,"title":""}},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. 
You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"16551fdb-a4b7-4f51-94f9-3e00e0bf3e69","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3bb54392-dd29-4d01-9f66-adbda394fd03","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["\n","
Default Partitions: 8\n","Repartition Partitions: 3\n","Coalesce Partitions: 2\n","Range Partitions: 1\n","
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"arguments":{},"data":"
Default Partitions: 8\nRepartition Partitions: 3\nCoalesce Partitions: 2\nRange Partitions: 1\n
","datasetInfos":[],"metadata":{},"removedWidgets":[],"type":"html"}},"output_type":"display_data"}],"source":["%python\n","import pyspark\n","\n","columnNames = [\"name\",\"license\",\"gender\",\"salary\"]\n","driverData = [\n"," ('Alice', 'A224455', 'Female', 3000),\n"," ('Bryan','B992244','Male',4000),\n"," ('Catherine','C887733','Female',2000),\n"," ('Daryl','D229988','Male',3000),\n"," ('Jenny','J663300','Female', 6000)\n","]\n","\n","# Create the Dataframe\n","df = spark.createDataFrame(data= driverData, schema = columnNames)\n","print(\"Default Partitions: \" + str(df.rdd.getNumPartitions()))\n","\n","# Using repartition\n","repartitionDF = df.repartition(3)\n","print(\"Repartition Partitions: \" + str(repartitionDF.rdd.getNumPartitions()))\n","\n","# Using coalesce\n","coalesceDF=df.coalesce(2)\n","print(\"Coalesce Partitions: \" + str(coalesceDF.rdd.getNumPartitions()))\n","\n","# Using reparitionByRange\n","repartitionRangeDF = df.repartitionByRange(1,'salary')\n","print(\"Range Partitions: \" + str(repartitionRangeDF.rdd.getNumPartitions()))\n","\n","# You can also use partitionBy and write to files\n","df.write.partitionBy(\"gender\",\"salary\").mode(\"overwrite\").parquet(commonPath + \"/parquet/driver/partition/\")\n","\n","# List the files in the directory or take a screen shot and add it as Markdown"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"PartitioningUsingSpark-C5","notebookOrigID":188113580888533,"widgets":{}},"kernelspec":{"display_name":"python","name":"synapse_pyspark"},"language_info":{"name":"python"},"save_output":true,"synapse_widget":{"state":{},"version":"0.1"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter14/SparkDeltaWithCompaction-C14.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"15eb6d2d-22a7-47be-99bd-167d143c324e","showTitle":false,"title":""}},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. 
You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e49b87b8-31ca-4f7e-b6d8-cd5e4b223522","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"06320451-6698-43af-857a-de0f29f050a4","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n","import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n","\n","val driverDeltaPath = commonPath + \"/delta\"\n","\n","val driverSchema = new StructType().add(\"driverID\", StringType).add(\"name\", StringType).add(\"license\",StringType).add(\"gender\",StringType).add(\"salary\",IntegerType)\n","\n","val driverData = Seq(\n"," Row(\"200\", \"Alice\", \"A224455\", \"Female\", 3000),\n"," Row(\"202\", \"Bryan\",\"B992244\",\"Male\",4000),\n"," Row(\"204\", \"Catherine\",\"C887733\",\"Female\",4000),\n"," Row(\"208\", \"Daryl\",\"D229988\",\"Male\",3000),\n"," Row(\"212\", \"Jenny\",\"J663300\",\"Female\", 5000)\n",")\n","\n","// Create a Dataframe using the above sample data\n","val driverWriteDF = spark.createDataFrame(spark.sparkContext.parallelize(driverData),driverSchema)\n","\n","// Write Driver to Delta\n","driverWriteDF.write.mode(\"overwrite\").format(\"delta\").save(driverDeltaPath)\n","\n","// Now let us read back from the delta location into a Dataframe\n","val driverDF: DataFrame = spark.read.format(\"delta\").load(driverDeltaPath)\n","\n","// Verify the data is available and correct\n","driverDF.show()\n","\n","spark.sql(\"CREATE TABLE IF NOT EXISTS Driver USING DELTA LOCATION '\" + driverDeltaPath + \"'\")\n","spark.sql(\"SELECT * FROM 
Driver\").show()\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5af49e39-5e5e-4759-aa21-9effe4881b9c","showTitle":false,"title":""},"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%scala\n","// Here is how you can compact the data using OPTMIZE command in Spark SQL\n","spark.sql(\"OPTIMIZE delta.`\" + commonPath + \"/delta`\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0cc9aed2-703c-4bb9-9651-247a6f6cd5f5","showTitle":false,"title":""}},"outputs":[],"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"SparkDeltaWithCompaction-C14","notebookOrigID":188113580888585,"widgets":{}},"kernelspec":{"display_name":"python","name":"synapse_pyspark"},"language_info":{"name":"python"},"save_output":true,"synapse_widget":{"state":{},"version":"0.1"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter08/SparkTransformations-C8.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2036131c-6bed-4364-81ef-e0ee7383e79b","showTitle":false,"title":""},"microsoft":{}},"outputs":[],"source":["%scala\n","// RDD Transformation examples\n","\n","// Let us take a sample list of cities and perform transformations on it.\n","val cities = Seq(\"New York\",\n"," \"New Jersey\",\n"," \"San Francisco\",\n"," \"Phoenix\",\n"," \"Seattle\",\n"," \"Austin\",\n"," \"Atlanta\",\n"," \"Miami\",\n"," \"Salt Lake City\",\n"," \"Tempe\",\n"," \"San Jose\",\n"," \"Chicago\",\n"," \"San Jose\",\n"," \"Miami\",\n"," \" \",\n"," \"Austin\")\n","\n","// Creat the RDD\n","val rdd=spark.sparkContext.parallelize(cities)\n","\n","// Map transformation\n","val maprdd=rdd.map( f => (f,1))\n","maprdd.collect.foreach(println)\n","\n","//FlatMap transformation\n","val fmrdd = rdd.flatMap(word => word.split(\" \"))\n","fmrdd.collect.foreach(println)\n","\n","//Filter transformation\n","val filterrdd = rdd.filter(word => word.contains(\" \"))\n","filterrdd.collect.foreach(println)\n","\n","// Filtering out empty entries\n","val emptystrfilterrdd = rdd.filter(_.nonEmpty)\n","emptystrfilterrdd.collect.foreach(println)\n","\n","// groupby transformation\n","val groupbyrdd = rdd.groupBy(word => word.charAt(0))\n","groupbyrdd.collect.foreach(println)\n","\n","//Union transformation\n","val rdd1 = spark.sparkContext.parallelize(List(1, 2, 3))\n","val rdd2 = spark.sparkContext.parallelize(List(4, 5, 6))\n","\n","val unionrdd = rdd1.union(rdd2)\n","unionrdd.collect().foreach(println)\n","\n","//Distinct transformation\n","val distrinctrdd = rdd.distinct()\n","distrinctrdd.collect.foreach(println)\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad7f0c3f-148c-4cb0-9129-ce501dffd521","showTitle":false,"title":""},"collapsed":true,"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%scala\n","// Dataframe Transformation examples\n","\n","import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n","import 
org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n","\n","// Let us define a sample Dataframe\n","val driverDetails = Seq(\n"," Row(\"Alice\",\"\",\"Hood\",\"100\",\"New York\", \"Female\", 4100),\n"," Row(\"Bryan\",\"M\",\"Williams\",\"101\",\"New York\",\"Male\", 4000),\n"," Row(\"Catherine\",\"Goodwin\",\"\",\"102\",\"California\",\"Female\", 4300),\n"," Row(\"Daryl\",\"\",\"Jones\",\"103\",\"Florida\",\"Male\", 5500),\n"," Row(\"Jenny\",\"Anne\",\"Simons\",\"104\",\"Arizona\",\"Female\", 3400),\n"," Row(\"Daryl\",\"\",\"Jones\",\"103\",\"Florida\",\"Male\", 5500)\n"," )\n","\n","// Define the schema\n","val driverSchema = new StructType().add(\"firstName\", StringType).add(\"middleName\", StringType).add(\"lastName\",StringType).add(\"id\",StringType).add(\"location\",StringType).add(\"gender\",StringType).add(\"salary\",IntegerType)\n","\n","// Create the Dataframe\n","val driverDf = spark.createDataFrame(\n"," spark.sparkContext.parallelize(driverDetails),driverSchema)\n"," driverDf.printSchema()\n"," driverDf.show(false)\n","\n","// select transformation\n","driverDf.select(\"firstname\",\"lastname\").show()\n","\n","// filter tranformation\n","driverDf.filter('location === \"Florida\").show(false)\n","\n","// distinct transformation\n","driverDf.distinct().show(false)\n","\n","// Sortby\n","driverDf.sort(\"lastname\",\"firstname\").show(false)\n","\n","// Orderby\n","driverDf.orderBy(\"location\").show(false)\n","\n","//groupby transformation\n","driverDf.groupBy(\"location\").avg(\"salary\").show(false)\n","\n","// Join\n","// For the join, let us create one more datafram called driverRating\n","\n","val driverRating = Seq(\n"," Row(\"100\", 5),\n"," Row(\"101\", 4),\n"," Row(\"102\", 3),\n"," Row(\"103\", 5),\n"," Row(\"104\", 2),\n"," Row(\"103\",4)\n"," )\n","\n","// Define the schema\n","val ratingSchema = new StructType().add(\"id\",StringType).add(\"rating\",IntegerType)\n","\n","// Create the Dataframe\n","val ratingDf = spark.createDataFrame(\n"," spark.sparkContext.parallelize(driverRating),ratingSchema)\n"," ratingDf.printSchema()\n"," ratingDf.show(false)\n","\n","driverDf.join(ratingDf, driverDf(\"id\") === ratingDf(\"id\"),\"inner\").show(false)\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6aa1fe5c-45a4-4064-b519-814281a44c09","showTitle":false,"title":""}},"outputs":[],"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"SparkTransformations-C8","notebookOrigID":188113580888554,"widgets":{}},"description":null,"kernelspec":{"display_name":"Synapse Spark","name":"synapse_spark"},"language_info":{"name":"scala"},"save_output":false},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter08/TSQLTransformations-C8.sql: -------------------------------------------------------------------------------- 1 | -- SQL Transformation Examples 2 | 3 | -- CLean up the workspace 4 | 5 | --DROP TABLE dbo.Driver; 6 | --DROP TABLE dbo.TripTable; 7 | --DROP TABLE dbo.Feedback; 8 | --DROP TABLE dbo.TempDriver; 9 | 10 | -- Create a Driver table 11 | 12 | CREATE TABLE dbo.Driver 13 | ( 14 | [driverId] INT NOT NULL, 15 | [firstName] VARCHAR(40), 16 | [middleName] VARCHAR(40), 17 | [lastName] VARCHAR(40), 18 | [city] VARCHAR(40), 19 | [gender] VARCHAR(40), 20 | [salary] INT 21 | ) 22 | WITH 23 | ( 24 | 
CLUSTERED COLUMNSTORE INDEX 25 | ) 26 | GO 27 | 28 | -- Insert sample data 29 | INSERT INTO dbo.Driver VALUES (200, 'Alice','','Hood','New York', 'Female', 4100); 30 | INSERT INTO dbo.Driver VALUES (201, 'Bryan','M','Williams','New York','Male', 4000); 31 | INSERT INTO dbo.Driver VALUES (202, 'Catherine','Goodwin','','California','Female', 4300); 32 | INSERT INTO dbo.Driver VALUES (203, 'Daryl','','Jones','Florida','Male', 5500); 33 | INSERT INTO dbo.Driver VALUES (204, 'Jenny','Anne','Simons','Arizona','Female', 3400); 34 | INSERT INTO dbo.Driver VALUES (203, 'Daryl','','Jones','Florida','Male', 5500); 35 | 36 | 37 | CREATE TABLE dbo.TempDriver 38 | ( 39 | [driverId] INT NOT NULL, 40 | [firstName] VARCHAR(40), 41 | [middleName] VARCHAR(40), 42 | [lastName] VARCHAR(40), 43 | [city] VARCHAR(40), 44 | [gender] VARCHAR(40), 45 | [salary] INT 46 | ) 47 | WITH 48 | ( 49 | CLUSTERED COLUMNSTORE INDEX 50 | ) 51 | GO 52 | 53 | INSERT INTO dbo.TempDriver VALUES (210, 'Alicia','','Yang','New York', 'Female', 4000); 54 | INSERT INTO dbo.TempDriver VALUES (211, 'Brandon','','Rhodes','New York','Male', 3000); 55 | INSERT INTO dbo.TempDriver VALUES (212, 'Cathy','','Mayor','California','Female', 3000); 56 | INSERT INTO dbo.TempDriver VALUES (213, 'Dennis','','Brown','Florida','Male', 2500); 57 | INSERT INTO dbo.TempDriver VALUES (214, 'Jeremey','','Stilton','Arizona','Male', 2500); 58 | INSERT INTO dbo.TempDriver VALUES (215, 'Maile','','Green','Florida','Female', 4000); 59 | 60 | 61 | CREATE TABLE dbo.TripTable 62 | ( 63 | [tripId] INT NOT NULL, 64 | [driverId] INT NOT NULL, 65 | [customerId] INT NOT NULL, 66 | [tripDate] INT, 67 | [startLocation] VARCHAR(40), 68 | [endLocation] VARCHAR(40) 69 | ) 70 | WITH 71 | ( 72 | CLUSTERED COLUMNSTORE INDEX, 73 | DISTRIBUTION = HASH ([tripId]) 74 | ) 75 | 76 | INSERT INTO dbo.TripTable VALUES (100, 200, 300, 20220101, 'New York', 'New Jersey'); 77 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 78 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 79 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 80 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 81 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 82 | 83 | 84 | CREATE TABLE dbo.Feedback 85 | ( 86 | [driverId] INT NOT NULL, 87 | [rating] INT, 88 | [comment] VARCHAR(100) 89 | ) 90 | WITH 91 | ( 92 | CLUSTERED COLUMNSTORE INDEX 93 | ) 94 | 95 | INSERT INTO dbo.Feedback VALUES (200, 5, 'On time'); 96 | INSERT INTO dbo.Feedback VALUES (201, 4, 'Good manners'); 97 | INSERT INTO dbo.Feedback VALUES (201, 5, 'Punctual driver'); 98 | INSERT INTO dbo.Feedback VALUES (203, 2, 'Rude'); 99 | INSERT INTO dbo.Feedback VALUES (200, 1, 'Dirty seats'); 100 | INSERT INTO dbo.Feedback VALUES (204, 4, 'Clean car'); 101 | 102 | 103 | -- Now let us run some transformations on the above tables 104 | 105 | -- WHERE 106 | SELECT [firstName], [lastName] from dbo.Driver WHERE [city] = 'New York'; 107 | SELECT [firstName], [lastName] from dbo.Driver WHERE [salary] > 5000; 108 | 109 | -- DISTINCT 110 | SELECT DISTINCT [firstName], [lastName] from dbo.Driver; 111 | 112 | -- ORDER BY 113 | SELECT DISTINCT [firstName], [lastName] from dbo.Driver ORDER BY [firstName]; 114 | 115 | -- GROUP BY 116 | SELECT [gender], AVG([salary]) AS 'AVG salary' from dbo.Driver GROUP BY [gender]; 117 | 118 | -- UNION 119 | 120 | SELECT [firstName], [lastName] FROM 121 | dbo.Driver 122 | WHERE [city] 
= 'New York' 123 | UNION 124 | select [firstName], [lastName] FROM 125 | dbo.TempDriver 126 | WHERE [city] = 'New York'; 127 | GO 128 | 129 | 130 | --JOIN 131 | 132 | SELECT driver.[firstName], driver.[lastName], feedback.[rating], Feedback.[comment] FROM 133 | dbo.Driver AS driver 134 | INNER JOIN dbo.Feedback AS feedback 135 | ON driver.[driverId] = feedback.[driverId] 136 | WHERE driver.[city] = 'New York'; 137 | GO 138 | 139 | 140 | --VIEWS 141 | -- Driver and Feedback 142 | DROP VIEW CompleteDriverView; 143 | 144 | CREATE VIEW CompleteDriverView 145 | AS 146 | SELECT driver.[firstName], driver.[lastName], feedback.[rating], feedback.[comment] FROM 147 | dbo.Driver AS driver 148 | INNER JOIN dbo.Feedback AS feedback 149 | ON driver.[driverId] = feedback.[driverId] 150 | WHERE driver.[city] = 'New York'; 151 | GO 152 | 153 | SELECT DISTINCT * from CompleteDriverView; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Azure Data Engineer Associate Certification Guide 5 | 6 | Azure Data Engineer Associate Certification Guide 7 | 8 | This is the code repository for [Azure Data Engineer Associate Certification Guide](https://www.packtpub.com/product/dp-203-azure-data-engineer-associate-certification-guide/9781801816069?utm_source=github&utm_medium=repository&utm_campaign=9781801816069), published by Packt. 9 | 10 | **A hands-on reference guide to developing your data engineering skills and preparing for the DP-203 exam** 11 | 12 | ## What is this book about? 13 | The DP-203: Azure Data Engineer Associate Certification Guide offers complete coverage of the DP-203 certification requirements so that you can take the exam with confidence. Going beyond the requirements for the exam, 14 | this book also provides you with additional knowledge to enable you to succeed in your real-life Azure data engineering projects. 15 | 16 | 17 | This book covers the following exciting features: 18 | * Gain intermediate-level knowledge of Azure the data infrastructure 19 | * Design and implement data lake solutions with batch and stream pipelines 20 | * Identify the partition strategies available in Azure storage technologies 21 | * Implement different table geometries in Azure Synapse Analytics 22 | * Use the transformations available in T-SQL, Spark, and Azure Data Factory 23 | * Use Azure Databricks or Synapse Spark to process data using Notebooks 24 | * Design security using RBAC, ACL, encryption, data masking, and more 25 | * Monitor and optimize data pipelines with debugging tips 26 | 27 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/B09N73BVDQ) today! 28 | 29 | https://www.packtpub.com/ 31 | 32 | 33 | ## Instructions and Navigations 34 | All of the code is organized into folders. 35 | 36 | The code will look like the following: 37 | ``` 38 | SELECT trip.[tripId], customer.[name] FROM 39 | dbo.FactTrips AS trip 40 | JOIN dbo.DimCustomer AS customer 41 | ON trip.[customerId] = customer.[customerId] 42 | WHERE trip.[endLocation] = 'San Jose'; 43 | ``` 44 | 45 | **Following is what you need for this book:** 46 | This book is for data engineers who want to take the DP-203: Azure Data Engineer Associate exam and are looking to gain in-depth knowledge of the Azure cloud stack. 
47 | The book will also help engineers and product managers who are new to Azure or interviewing with companies working on Azure technologies, to get hands-on experience of Azure data technologies. 48 | A basic understanding of cloud technologies, extract, transform, and load (ETL), and databases will help you get the most out of this book. 49 | 50 | With the following software and hardware list you can run all code files present in the book (Chapter 1-15). 51 | 52 | ### Software and Hardware List 53 | 54 | | Chapter | Software required | OS required | 55 | | -------- | ------------------------------------| -----------------------------------| 56 | | 1-15 | Azure account (free or paid) | Windows, Mac OS X, and Linux (Any) | 57 | | 1-15 | Azure CLI | Windows, Mac OS X, and Linux (Any) | 58 | 59 | 60 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://static.packt-cdn.com/downloads/9781801816069_ColorImages.pdf). 61 | 62 | 63 | ### Related products 64 | * Azure Data Scientist Associate Certification Guide [[Packt]](https://www.packtpub.com/product/azure-data-scientist-associate-certification-guide/9781800565005?utm_source=github&utm_medium=repository&utm_campaign=9781800565005) [[Amazon]](https://www.amazon.com/dp/1800565003) 65 | 66 | * Data Engineering with Apache Spark, Delta Lake, and Lakehouse [[Packt]](https://www.packtpub.com/product/data-engineering-with-apache-spark-delta-lake-and-lakehouse/9781801077743?utm_source=github&utm_medium=repository&utm_campaign=9781801077743) [[Amazon]](https://www.amazon.com/dp/1801077746) 67 | 68 | ## Get to Know the Authors 69 | **Newton Alex** 70 | leads several Azure Data Analytics teams in Microsoft, India. His team contributes to technologies including Azure Synapse, Azure Databricks, Azure HDInsight, and many open source technologies, including Apache YARN, Apache Spark, and Apache Hive. 71 | He started using Hadoop while at Yahoo, USA, where he helped build the first batch processing pipelines for Yahoo’s ad serving team. 72 | After Yahoo, he became the leader of the big data team at Pivotal Inc., USA, where he was responsible for the entire open source stack of Pivotal Inc. 73 | He later moved to Microsoft and started the Azure Data team in India. 74 | He has worked with several Fortune 500 companies to help build their data systems on Azure. 75 | ### Download a free PDF 76 | 77 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
78 |

https://packt.link/free-ebook/9781801816069

-------------------------------------------------------------------------------- /Chapter08/ShreddingJSONUsingSpark-C8.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f719de3a-1d57-4f33-a157-402c8b663948","showTitle":false,"title":""}},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e7f5ea18-004c-4263-ad13-2fe218f26f10","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e3c18e58-17b7-4e58-a9e7-08f48e0599c0","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","\n","// Let us see how to write to a JSON file\n","import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n","import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n","\n","// Generate sample data\n","val driverDetails = Seq(\n"," Row(\"Alice\",\"\",\"Hood\",\"100\",\"New York\", \"Female\", 4100),\n"," Row(\"Bryan\",\"M\",\"Williams\",\"101\",\"New York\",\"Male\", 4000),\n"," Row(\"Catherine\",\"Goodwin\",\"\",\"102\",\"California\",\"Female\", 4300),\n"," Row(\"Daryl\",\"\",\"Jones\",\"103\",\"Florida\",\"Male\", 5500),\n"," Row(\"Jenny\",\"Anne\",\"Simons\",\"104\",\"Arizona\",\"Female\", 3400),\n"," Row(\"Daryl\",\"\",\"Jones\",\"103\",\"Florida\",\"Male\", 5500)\n"," )\n","\n","val driverSchema = new StructType().add(\"firstname\", StringType).add(\"middlename\", 
StringType).add(\"lastname\",StringType).add(\"id\",StringType).add(\"location\",StringType).add(\"gender\",StringType).add(\"salary\",IntegerType)\n","\n","// Create the Dataframe using the sample data\n","val df2 = spark.createDataFrame(spark.sparkContext.parallelize(driverDetails), driverSchema)\n","df2.printSchema()\n","df2.show(false)\n","\n","// Write to storage as JSON file\n","df2.write.mode(SaveMode.Overwrite).json(commonPath + \"/json/\")\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"98f9c0f1-7f41-43e6-9b5d-cc2c238f5668","showTitle":false,"title":""},"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%scala\n","// Let us see how to read the JSON file back into a new Dataframe\n","\n","import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n","import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n","\n","val driverSchema = new StructType().add(\"firstname\", StringType).add(\"middlename\", StringType).add(\"lastname\",StringType).add(\"id\",StringType).add(\"location\",StringType).add(\"gender\",StringType).add(\"salary\",IntegerType)\n","\n","val dfJSON = spark.read.schema(driverSchema).json(commonPath + \"/json/*.json\")\n","\n","// View the file\n","dfJSON.printSchema()\n","dfJSON.show(false)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a6fb18ec-dcf0-4d2a-970c-c0b1df9537ff","showTitle":false,"title":""},"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%scala\n","import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n","import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n","\n","// You can also infer the schema directly without specifying it for simple structures\n","\n","val dfJSON = spark.read.json(commonPath + \"/json/*.json\")\n","dfJSON.printSchema()\n","dfJSON.show(false)\n","\n","// Here is the deduced schema\n","val schema = dfJSON.schema\n","println(schema)"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"ShreddingJSONUsingSpark-C8","notebookOrigID":188113580888548,"widgets":{}},"kernelspec":{"display_name":"python","name":"synapse_pyspark"},"language_info":{"name":"python"},"save_output":true,"synapse_widget":{"state":{},"version":"0.1"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter02/DataPruningWithSpark-C2.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c6f10c98-0681-479a-9297-63977f20f894","showTitle":false,"title":""}},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. 
You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"de7ed935-dcba-42f0-a9e7-89f33dc9a494","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e607f0b9-89b2-4f2b-9912-2e12f83dc56e","showTitle":false,"title":""}},"outputs":[],"source":["from pyspark.sql.functions import *\n","\n","columnNames = [\"tripId\",\"driverId\",\"customerId\",\"cabId\",\"tripDate\",\"startLocation\",\"endLocation\"]\n","tripData = [\n"," ('100', '200', '300', '400', '20220101', 'New York', 'New Jersey'),\n"," ('101', '201', '301', '401', '20220102', 'Tempe', 'Phoenix'),\n"," ('102', '202', '302', '402', '20220103', 'San Jose', 'San Franciso'),\n"," ('103', '203', '303', '403', '20220102', 'New York', 'Boston'),\n"," ('104', '204', '304', '404', '20220103', 'New York', 'Washington'),\n"," ('105', '205', '305', '405', '20220201', 'Miami', 'Fort Lauderdale'),\n"," ('106', '206', '306', '406', '20220202', 'Seattle', 'Redmond'),\n"," ('107', '207', '307', '407', '20220203', 'Los Angeles', 'San Diego'),\n"," ('108', '208', '308', '408', '20220301', 'Phoenix', 'Las Vegas'),\n"," ('109', '209', '309', '409', '20220302', 'Washington', 'Baltimore'),\n"," ('110', '210', '310', '410', '20220303', 'Dallas', 'Austin'),\n"," ('111', '211', '311', '411', '20220303', 'New York', 'New Jersey'),\n"," ('112', '212', '312', '412', '20220304', 'New York', 'Boston'),\n"," ('113', '212', '312', '412', '20220401', 'San Jose', 'San Ramon'),\n"," ('114', '212', '312', '412', '20220404', 'San Jose', 'Oakland'),\n"," ('115', '212', '312', '412', '20220404', 'Tempe', 'Scottsdale'),\n"," ('116', '212', '312', '412', '20220405', 'Washington', 'Atlanta'),\n"," ('117', '212', '312', '412', '20220405', 'Seattle', 'Portland'),\n"," ('118', 
'212', '312', '412', '20220405', 'Miami', 'Tampa')\n","]\n","df = spark.createDataFrame(data= tripData, schema = columnNames)\n","\n","# Split the data according the current timestamp and write to store as parquet files\n","dftripDate = df.withColumn(\"tripDate\", to_timestamp(col(\"tripDate\"), 'yyyyMMdd')) \\\n"," .withColumn(\"year\", tripDate_format(col(\"tripDate\"), \"yyyy\")) \\\n"," .withColumn(\"month\", tripDate_format(col(\"tripDate\"), \"MM\")) \\\n"," .withColumn(\"day\", tripDate_format(col(\"tripDate\"), \"dd\"))\n","\n","dftripDate.show(truncate=False)\n","\n","dftripDate.write.partitionBy(\"year\", \"month\", \"day\").mode(\"overwrite\").parquet(commonPath + \"/partition/\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d513bd6b-abd1-4061-acd8-2ec127d6b468","showTitle":false,"title":""},"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["# Now, let’s see how pruning works. \n","# For example, the following query will only scan month=01 folder and skip all other folders.\n","readDF = spark.read.parquet(commonPath + \"/partition/year=2022\").filter(\"month=01\")\n","readDF.show(truncate=False)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0504c855-f269-4891-9b2f-c0d11d6d6259","showTitle":false,"title":""},"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"DataPruningWithSpark-C2","notebookOrigID":188113580888522,"widgets":{}},"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Synapse PySpark","language":"Python","name":"synapse_pyspark"},"language_info":{"name":"python"},"save_output":true,"synapse_widget":{"state":{},"version":"0.1"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter05/SynSQLDistributionsIndexes-C5.sql: -------------------------------------------------------------------------------- 1 | -- Synapse SQL Indexes, Distributions and Partitions Example 2 | 3 | -- Example of CLUSTERED COLUMNSTORE INDEX with Hash index 4 | DROP TABLE dbo.TripTable; 5 | 6 | CREATE TABLE dbo.TripTable 7 | ( 8 | [tripId] INT NOT NULL, 9 | [driverId] INT NOT NULL, 10 | [customerID] INT NOT NULL, 11 | [tripDate] INT, 12 | [startLocation] VARCHAR(40), 13 | [endLocation] VARCHAR(40) 14 | ) 15 | WITH 16 | ( 17 | CLUSTERED COLUMNSTORE INDEX, 18 | DISTRIBUTION = HASH ([tripId]), 19 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 20 | ( 20220101, 20220201, 20220301 ) 21 | ) 22 | ) 23 | 24 | INSERT INTO dbo.TripTable VALUES (100, 200, 300, 20220101, 'New York', 'New Jersey'); 25 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 26 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 27 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 28 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 29 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 30 | 31 | SELECT * from dbo.TripTable; 32 | 33 | DROP TABLE dbo.TripTable; 34 | 35 | -- Example of CLUSTERED COLUMNSTORE INDEX with Round Robin 36 | 37 | CREATE TABLE 
dbo.TripTable 38 | ( 39 | [tripId] INT NOT NULL, 40 | [driverId] INT NOT NULL, 41 | [customerID] INT NOT NULL, 42 | [tripDate] INT, 43 | [startLocation] VARCHAR(40), 44 | [endLocation] VARCHAR(40) 45 | ) 46 | WITH 47 | ( 48 | CLUSTERED COLUMNSTORE INDEX, 49 | DISTRIBUTION = ROUND_ROBIN, 50 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 51 | ( 20220101, 20220201, 20220301 ) 52 | ) 53 | ) 54 | 55 | INSERT INTO dbo.TripTable VALUES (100, 200, 300, 20220101, 'New York', 'New Jersey'); 56 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 57 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 58 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 59 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 60 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 61 | 62 | SELECT * from dbo.TripTable; 63 | 64 | -- Example of CLUSTERED COLUMNSTORE INDEX with REPLICATE distribution 65 | DROP TABLE dbo.TripTable; 66 | 67 | CREATE TABLE dbo.TripTable 68 | ( 69 | [tripId] INT NOT NULL, 70 | [driverId] INT NOT NULL, 71 | [customerID] INT NOT NULL, 72 | [tripDate] INT, 73 | [startLocation] VARCHAR(40), 74 | [endLocation] VARCHAR(40) 75 | ) 76 | WITH 77 | ( 78 | CLUSTERED COLUMNSTORE INDEX, 79 | DISTRIBUTION = REPLICATE, 80 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 81 | ( 20220101, 20220201, 20220301 ) 82 | ) 83 | ) 84 | 85 | INSERT INTO dbo.TripTable VALUES (100, 200, 300, 20220101, 'New York', 'New Jersey'); 86 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 87 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 88 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 89 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 90 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 91 | 92 | SELECT * from dbo.TripTable; 93 | 94 | -- Example of CLUSTERED INDEX 95 | 96 | DROP TABLE dbo.TripTable; 97 | 98 | CREATE TABLE dbo.TripTable 99 | ( 100 | [tripId] INT NOT NULL, 101 | [driverId] INT NOT NULL, 102 | [customerID] INT NOT NULL, 103 | [tripDate] INT, 104 | [startLocation] VARCHAR(40), 105 | [endLocation] VARCHAR(40) 106 | ) 107 | WITH 108 | ( 109 | CLUSTERED INDEX (tripID), 110 | DISTRIBUTION = REPLICATE, 111 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 112 | ( 20220101, 20220201, 20220301 ) 113 | ) 114 | ) 115 | 116 | INSERT INTO dbo.TripTable VALUES (100, 200, 300, 20220101, 'New York', 'New Jersey'); 117 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 118 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 119 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 120 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 121 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 122 | 123 | SELECT * from dbo.TripTable; 124 | 125 | -- Example of Heap indexing 126 | 127 | DROP TABLE dbo.TripTable; 128 | 129 | CREATE TABLE dbo.TripTable 130 | ( 131 | [tripId] INT NOT NULL, 132 | [driverId] INT NOT NULL, 133 | [customerID] INT NOT NULL, 134 | [tripDate] INT, 135 | [startLocation] VARCHAR(40), 136 | [endLocation] VARCHAR(40) 137 | ) 138 | WITH 139 | ( 140 | HEAP, 141 | DISTRIBUTION = REPLICATE, 142 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 143 | ( 20220101, 
20220201, 20220301 ) 144 | ) 145 | ) 146 | 147 | INSERT INTO dbo.TripTable VALUES (100, 200, 300, 20220101, 'New York', 'New Jersey'); 148 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 149 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 150 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 151 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 152 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 153 | 154 | SELECT * from dbo.TripTable; 155 | 156 | 157 | -- You can use the following query to find the indexes, partition number and other details about the table 158 | 159 | SELECT QUOTENAME(s.[name])+'.'+QUOTENAME(t.[name]) as Table_name 160 | , i.[name] as Index_name 161 | , p.partition_number as Partition_nmbr 162 | , p.[rows] as Row_count 163 | , p.[data_compression_desc] as Data_Compression_desc 164 | FROM sys.partitions p 165 | JOIN sys.tables t ON p.[object_id] = t.[object_id] 166 | JOIN sys.schemas s ON t.[schema_id] = s.[schema_id] 167 | JOIN sys.indexes i ON p.[object_id] = i.[object_Id] 168 | AND p.[index_Id] = i.[index_Id] 169 | WHERE t.[name] = 'TripTable' 170 | -------------------------------------------------------------------------------- /Chapter06/SynSQLPartitionSwitching-C6.sql: -------------------------------------------------------------------------------- 1 | -- Data Partition Switching Example 2 | 3 | -- Let’s assume that we need to store only 3 months’ worth of data. 4 | -- Our Fact table, dbo.TripTable, contains the data for 20220101, 20220201, and 20220301. 5 | -- Now, let’s learn how to delete the first month’s data and add the new month’s data, 20220401, to the table using the technique of Data switching 6 | 7 | -- Since Synapse SQL doesn't support CREATE TABLE IF EXISTS, I've provided commented DROP TABLE statements for convenience 8 | -- DROP TABLE dbo.TripTable 9 | 10 | -- Create a sample TripTable with partitions 11 | CREATE TABLE dbo.TripTable 12 | ( 13 | [tripId] INT NOT NULL, 14 | [driverId] INT NOT NULL, 15 | [customerId] INT NOT NULL, 16 | [tripDate] INT, 17 | [startLocation] VARCHAR (40), 18 | [endLocation] VARCHAR (40) 19 | ) 20 | WITH 21 | ( 22 | CLUSTERED COLUMNSTORE INDEX, 23 | DISTRIBUTION = HASH ([tripId]), 24 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 25 | ( 20220101, 20220201, 20220301 ) 26 | ) 27 | ) 28 | 29 | -- Insert some dummy data covering all the three date partition ranges 30 | INSERT INTO dbo.TripTable VALUES (100, 200, 300, 20220101, 'New York', 'New Jersey'); 31 | INSERT INTO dbo.TripTable VALUES (101, 201, 301, 20220101, 'Miami', 'Dallas'); 32 | INSERT INTO dbo.TripTable VALUES (102, 202, 302, 20220102, 'Phoenix', 'Tempe'); 33 | INSERT INTO dbo.TripTable VALUES (103, 203, 303, 20220204, 'LA', 'San Jose'); 34 | INSERT INTO dbo.TripTable VALUES (104, 204, 304, 20220205, 'Seattle', 'Redmond'); 35 | INSERT INTO dbo.TripTable VALUES (105, 205, 305, 20220301, 'Atlanta', 'Chicago'); 36 | 37 | SELECT * FROM dbo.TripTable; 38 | 39 | -- Deleting an old partition 40 | -- To delete a partition, we need to create a dummy table that has the same structure as the original table 41 | -- and then swap out the partition from the original table to the dummy table. 42 | -- This section will show you how to switch out the 20220101 partition. 
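-- (Optional) Before switching, it can help to confirm how the boundary values map to partition
-- numbers and how many rows each partition currently holds. The lookup below is a minimal sketch
-- adapted from the sys.partitions query at the end of Chapter05/SynSQLDistributionsIndexes-C5.sql;
-- it assumes the dbo.TripTable created above.
SELECT QUOTENAME(s.[name]) + '.' + QUOTENAME(t.[name]) AS Table_name
     , i.[name] AS Index_name
     , p.partition_number AS Partition_nmbr
     , p.[rows] AS Row_count
FROM sys.partitions p
JOIN sys.tables t ON p.[object_id] = t.[object_id]
JOIN sys.schemas s ON t.[schema_id] = s.[schema_id]
JOIN sys.indexes i ON p.[object_id] = i.[object_id] AND p.[index_id] = i.[index_id]
WHERE t.[name] = 'TripTable'
ORDER BY p.partition_number;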
43 | 44 | -- Create a dummy table that contains the partition that needs to be switched out, as follows: 45 | -- DROP TABLE dbo.TripTable_20220101; 46 | CREATE TABLE dbo.TripTable_20220101 47 | WITH 48 | ( 49 | CLUSTERED COLUMNSTORE INDEX, 50 | DISTRIBUTION = HASH ([tripId]), 51 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES (20220101) ) 52 | ) 53 | AS 54 | SELECT * FROM dbo.TripTable WHERE 1=2 ; 55 | 56 | SELECT * FROM dbo.TripTable_20220101; 57 | 58 | -- Let us try to switch out PARTITION 20220101. 59 | -- 20220101 is actually the second partition since the first partition will correspond to all the values before 20220101, 60 | -- which, in our case, would be empty. 61 | ALTER TABLE dbo.TripTable SWITCH PARTITION 2 TO dbo.TripTable_20220101 PARTITION 2 WITH (TRUNCATE_TARGET = ON); 62 | 63 | -- Now, dbo.TripTable will contain 0 rows for partition 2, which corresponds to 20220101. 64 | 65 | -- Next, let’s add a new partition, 20220401, to the table. 66 | -- We have to repeat the same steps as before to empty the partition 20220301 first. 67 | -- Then split the last partition of TripTable into two ranges: 20220301 and 20220401. 68 | -- Move the 20220301 data from the temporary table back into TripTable 69 | -- Finally swap in a new partition for 20220401 70 | 71 | -- Create a dummy table that has the same partitions as the TripTable: 72 | DROP TABLE dbo.TripTable_20220301; 73 | CREATE TABLE dbo.TripTable_20220301 74 | WITH 75 | ( 76 | CLUSTERED COLUMNSTORE INDEX, 77 | DISTRIBUTION = HASH ([tripId]), 78 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 79 | (20220101, 20220201, 20220301) 80 | ) 81 | ) 82 | AS 83 | SELECT * FROM dbo.TripTable WHERE 84 | [tripDate] >= 20220301 AND [tripDate] < 20220401 ; 85 | 86 | SELECT * FROM dbo.TripTable_20220301; 87 | 88 | -- Now switch out the 20220301 partition (that is, partition number 4). 89 | 90 | -- Run the ALTER TABLE command, as shown in the following code block, to swap the partition out: 91 | ALTER TABLE dbo.TripTable SWITCH PARTITION 4 TO dbo.TripTable_20220301 PARTITION 4 WITH (TRUNCATE_TARGET = ON); 92 | 93 | -- Adding a new partition to the TripTable 94 | -- To add our new partition, we need to split the last partition into two partitions. We can use the following SPLIT command to do this: 95 | ALTER TABLE dbo.TripTable SPLIT RANGE (20220401); 96 | 97 | -- Copy back the 20220301 data now. 98 | -- Since the partition ranges should be exactly the same as TripTable, which now contains one extra 20220401 partition, 99 | -- we have to create another temporary table with all the four partition ranges as shown by copying the data from the TripTable_20220301 temporary table. 
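-- As a quick sanity check on the partition numbers used in the SWITCH statements below, recall that
-- RANGE RIGHT means each boundary value belongs to the partition on its right. After the SPLIT step,
-- the boundaries (20220101, 20220201, 20220301, 20220401) give TripTable five partitions:
--   partition 1: tripDate <  20220101
--   partition 2: 20220101 <= tripDate < 20220201
--   partition 3: 20220201 <= tripDate < 20220301
--   partition 4: 20220301 <= tripDate < 20220401
--   partition 5: tripDate >= 20220401
-- This is why the 20220301 data sits in partition 4 and the new 20220401 data is switched into partition 5.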
100 | 101 | -- DROP TABLE dbo.TripTable_20220301_20220301; 102 | CREATE TABLE dbo.TripTable_20220301_20220301 103 | WITH 104 | ( 105 | CLUSTERED COLUMNSTORE INDEX, 106 | DISTRIBUTION = HASH ([tripId]), 107 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 108 | (20220101, 20220201, 20220301, 20220401) 109 | ) 110 | ) 111 | AS 112 | SELECT * FROM dbo.TripTable_20220301 WHERE 113 | [tripDate] >= 20220301 AND [tripDate] < 20220401 ; 114 | 115 | SELECT * FROM dbo.TripTable_20220301_20220301; 116 | 117 | -- Now switch back TripTable_20220301 (using TripTable_20220301_20220301 temp table) data into TripTable 118 | 119 | ALTER TABLE dbo.TripTable_20220301_20220301 SWITCH PARTITION 4 TO dbo.TripTable PARTITION 4 WITH (TRUNCATE_TARGET = ON); 120 | 121 | -- DROP TABLE dbo.TripTable_new 122 | -- Once we have created the new partition for 20220401, we must create a dummy table with the same partition alignment again to add the new month's data 123 | -- The following code snippet does this: 124 | CREATE TABLE dbo.TripTable_new 125 | WITH 126 | ( 127 | CLUSTERED COLUMNSTORE INDEX, 128 | DISTRIBUTION = HASH ([tripId]), 129 | PARTITION ([tripDate] RANGE RIGHT FOR VALUES 130 | (20220101, 20220201, 20220301, 20220401) 131 | ) 132 | ) 133 | AS 134 | SELECT * FROM dbo.TripTable WHERE 1 = 2; 135 | 136 | -- Let’s add some values to the new partition: 137 | INSERT INTO dbo.TripTable_new VALUES (333, 444, 555, 20220401, 'New York', 'New Jersey'); 138 | 139 | SELECT * FROM dbo.TripTable_new; 140 | 141 | -- Once we have loaded the partition data into the dummy table, we can switch the partition into our Fact table using the ALTER command 142 | 143 | ALTER TABLE dbo.TripTable_new SWITCH PARTITION 5 TO dbo.TripTable PARTITION 5 WITH (TRUNCATE_TARGET = ON); 144 | 145 | -- The ALTER TABLE commands will return almost immediately as they are metadata operations 146 | -- It doesn't involve copying rows from one partition to another. 147 | 148 | -- Check the data to confirm if the swap has happened correctly 149 | 150 | SELECT * FROM dbo.TripTable; -------------------------------------------------------------------------------- /Chapter12/HandlingSensitiveInfoInDataframe-C12.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f3b41bd4-0b99-4143-97d3-7b280145557d","showTitle":false,"title":""}},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. 
You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a5bfb54c-47df-4805-8476-2220852bae26","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eef106c4-e05f-4bd2-97db-b5d6830a95e2","showTitle":false,"title":""}},"outputs":[],"source":["%python\n","# Let us create a sample Dataframe with sensitive data\n","from pyspark.sql.types import StructType,StructField, StringType, IntegerType\n","cols = StructType([ \\\n"," StructField(\"Name\",StringType(),True), \\\n"," StructField(\"SSN\",StringType(),True), \\\n"," StructField(\"email\",StringType(),True)\n"," ])\n"," \n","data = [(\"Adam Smith\",\"111-11-1111\",\"adam@adam.com\"),\n"," (\"Brenda Harman\",\"222-22-2222\",\"brenda@brenda.com\"),\n"," (\"Carmen Pinto\",\"333-33-3333\", \"carmen@carmen.com\")\n"," ]\n","\n","# Create the Dataframe\n","piidf = spark.createDataFrame(data=data,schema=cols)\n","display(piidf)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b218ae6b-1bcb-465a-b35e-f0bbacd54220","showTitle":false,"title":""}},"source":["To use the Fernet library, you will have to install encryption and crytography libraries. Go to the Compute tab of the Azure Databricks workspace. In the compute tab, under Libraries select PyPI and enter \"encryption\" in the textbox and click on \"Install new\" button. 
\n","Repeat the same process for the \"cryptography\" library also."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"262ff4a4-2e3e-4a90-8e7b-2d00116778fa","showTitle":false,"title":""}},"outputs":[],"source":["# Import the Fernet library and test it\n","from cryptography.fernet import Fernet\n","key = Fernet.generate_key()\n","f = Fernet(key)\n","token = f.encrypt(b\"Hello Azure\")\n","print(token)\n","print(f.decrypt(token))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"64327636-bb7f-4c64-9aaf-b34666195062","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["\n","
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"arguments":{},"data":"
","datasetInfos":[],"metadata":{},"removedWidgets":[],"type":"html"}},"output_type":"display_data"}],"source":["# Define the encryption function\n","def encryptUdf(plaintext, KEY):\n"," from cryptography.fernet import Fernet\n"," f = Fernet(KEY)\n"," encryptedtext = f.encrypt(bytes(plaintext, 'utf-8'))\n"," return str(encryptedtext.decode('ascii'))\n","encrypt = udf(encryptUdf, StringType())\n","\n","\n","# decrypt udf\n","def decryptUdf(encryptedtext, KEY):\n"," from cryptography.fernet import Fernet\n"," f = Fernet(KEY)\n"," plaintext=f.decrypt( encryptedtext.encode()).decode()\n"," return plaintext\n","decrypt = udf(decryptUdf, StringType())\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"57de160e-c84a-4582-bf2b-ca3d7fc7cc50","showTitle":false,"title":""}},"outputs":[],"source":["from pyspark.sql.functions import udf, lit, md5\n","from pyspark.sql.types import StringType\n","\n","# Fetch key from secrets or Azure Key vault\n","# encryptionKey = dbutils.preview.secret.get(scope = \"encrypt\", key = \"fernetkey\")\n","encryptionKey = key\n","# Encrypt the data \n","encrypteddf = piidf.withColumn(\"SSN\", encrypt(\"SSN\", lit(encryptionKey)))\n","display(encrypteddf)\n","\n","#Save encrypted data \n","encrypteddf.write.format(\"delta\").mode(\"overwrite\").option(\"overwriteSchema\", \"true\").saveAsTable(\"PIIEncryptedTable\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ef33ea99-3778-49ba-bd33-a24a98e1b657","showTitle":false,"title":""}},"outputs":[],"source":["decrypted = encrypteddf.withColumn(\"SSN\", decrypt(\"SSN\",lit(encryptionKey)))\n","display(decrypted)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"093d4b51-2fda-4bb8-a453-69375e896139","showTitle":false,"title":""}},"outputs":[],"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"HandlingSensitiveInfoInDataframe-C12","notebookOrigID":188113580888575,"widgets":{}},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter09/SparkBatchJob-C9.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. 
You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b78db953-876e-4b12-b0fa-edf527b269e9","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fac7d340-ec5c-4fe4-a617-f617b272c923","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","\n","// Let us see how to write to a JSON file\n","import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n","import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n","\n","val tripsCSVPath = commonPath + \"/batch/csv/trips\"\n","val faresParquetPath = commonPath + \"/batch/parquet/fares\"\n","\n","// Generate sample data\n","val tripSchema = new StructType()\n"," .add(\"tripId\",IntegerType)\n"," .add(\"driverId\",IntegerType)\n"," .add(\"customerId\",IntegerType)\n"," .add(\"cabId\",IntegerType)\n"," .add(\"tripDate\",StringType)\n"," .add(\"startLocation\",StringType)\n"," .add(\"endLocation\",StringType)\n"," \n","val tripData = Seq(\n"," Row(100, 200, 300, 400, \"20220101\", \"New York\", \"New Jersey\"),\n"," Row(101, 201, 301, 401, \"20220102\", \"Tempe\", \"Phoenix\"),\n"," Row(102, 202, 302, 402, \"20220103\", \"San Jose\", \"San Franciso\"),\n"," Row(103, 203, 303, 403, \"20220102\", \"New York\", \"Boston\"),\n"," Row(104, 204, 304, 404, \"20220103\", \"New York\", \"Washington\"),\n"," Row(105, 205, 305, 405, \"20220201\", \"Miami\", \"Fort Lauderdale\"),\n"," Row(106, 206, 306, 406, \"20220202\", \"Seattle\", \"Redmond\"),\n"," Row(107, 207, 307, 407, \"20220203\", \"Los Angeles\", \"San Diego\"),\n"," Row(108, 208, 308, 408, \"20220301\", \"Phoenix\", \"Las Vegas\"),\n"," Row(109, 209, 309, 409, \"20220302\", \"Washington\", \"Baltimore\"),\n"," Row(110, 210, 310, 410, \"20220303\", 
\"Dallas\", \"Austin\"),\n",")\n","\n","// Write Trips to CSV file\n","val tripDF = spark.createDataFrame(spark.sparkContext.parallelize(tripData),tripSchema)\n","tripDF.printSchema()\n","tripDF.show(false)\n","tripDF.write.mode(\"overwrite\").option(\"header\", \"true\").csv(tripsCSVPath)\n","\n","// Generate sample fares data\n","val fareSchema = new StructType()\n"," .add(\"tripId\",IntegerType)\n"," .add(\"fare\",IntegerType)\n"," .add(\"currency\",StringType)\n","\n","val fareData = Seq(\n"," Row(100, 100, \"USD\"),\n"," Row(101, 20, \"USD\"),\n"," Row(102, 25, \"USD\"),\n"," Row(103, 140, \"USD\"),\n"," Row(104, 340, \"USD\"),\n"," Row(105, 75, \"USD\"),\n"," Row(106, 50, \"USD\"),\n"," Row(107, 125, \"USD\"),\n"," Row(108, 40, \"USD\"),\n"," Row(109, 80, \"USD\"),\n"," Row(110, 160, \"USD\")\n",")\n","\n","// Write Trips to Parquet file\n","val faresDF = spark.createDataFrame(spark.sparkContext.parallelize(fareData),fareSchema)\n","faresDF.printSchema()\n","faresDF.show(false)\n","faresDF.write.mode(\"overwrite\").option(\"header\", \"true\").parquet(faresParquetPath)\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7004cec7-26c9-43d1-929f-1a2e5db59c37","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","//Sample Batch Transformation using ADB. Let us try this one in scala\n","import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n","import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n","\n","val tripsCSVPath = commonPath + \"/batch/csv/trips/*\"\n","val faresParquetPath = commonPath + \"/batch/parquet/fares/*\"\n","val outputParquetPath = commonPath + \"/batch/parquet/output\"\n","\n","// Read the Trip data (stored as CSV file) and the Fares data (stored as Parquet files)\n","val tripsSchema = new StructType()\n"," .add(\"tripId\",IntegerType)\n"," .add(\"driverId\",IntegerType)\n"," .add(\"customerId\",IntegerType)\n"," .add(\"cabId\",IntegerType)\n"," .add(\"tripDate\",IntegerType)\n"," .add(\"startLocation\",StringType)\n"," .add(\"endLocation\",StringType)\n","\n","val tripsCSV = spark.read.format(\"csv\")\n"," .option(\"header\", \"true\")\n"," .schema(tripsSchema)\n"," .load(tripsCSVPath)\n","tripsCSV.printSchema()\n","tripsCSV.show(false)\n","\n","val faresSchema = new StructType()\n"," .add(\"tripId\",IntegerType)\n"," .add(\"fare\",IntegerType)\n"," .add(\"currency\",StringType)\n","\n","val faresParquet = spark.read.format(\"parquet\")\n"," .schema(faresSchema)\n"," .load(faresParquetPath)\n","faresParquet.printSchema()\n","faresParquet.show(false)\n","\n","\n","\n","// Join them on the tripID and group by StartLocation.\n","val joinDF = tripsCSV.join(\n","faresParquet,tripsCSV(\"tripId\") === \n"," faresParquet(\"tripId\"),\"inner\")\n",".groupBy(\"startLocation\")\n",".sum(\"fare\");\n","\n","// Print the output table with columns: City and Fare\n","import org.apache.spark.sql.functions.col;\n","val outputDF = joinDF.select(col(\"startLocation\").alias(\"City\"),col(\"sum(fare)\").alias(\"Fare\"));\n","display(outputDF)\n","//\tFinally, write the output back to ADLS Gen2 under the transform/fares/out 
folder.\n","outputDF.write.mode(\"overwrite\").parquet(outputParquetPath)\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f67d9a90-c40f-429f-ada0-8ad645ba453c","showTitle":false,"title":""}},"outputs":[],"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"SparkBatchJob-C9","notebookOrigID":188113580888598,"widgets":{}},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter07/ParquetWithSpark-C7.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4b960fbf-0f02-4bc8-b075-9354a420f049","showTitle":false,"title":""}},"source":["Use the following Azure Databricks storage setup block only if you are using Azure Databricks. You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b7e17651-5f7d-4f4b-bd97-a3b0079f5770","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2b265bb0-668b-4299-813a-85a0ce9f4ac9","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","\n","// Let us generate some parquet data first\n","import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n","import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n","\n","val tripsParquetPath = commonPath + \"/parquet/trips\"\n","val driverParquetPath = commonPath + \"/parquet/driver\"\n","\n","// Generate sample trips data\n","val tripSchema = new 
StructType().add(\"tripId\", StringType).add(\"driverId\", StringType).add(\"customerId\", StringType).add(\"cabId\", StringType).add(\"tripDate\", StringType).add(\"startLocation\", StringType).add(\"endLocation\", StringType)\n","\n","val tripData = Seq(\n"," Row(\"100\", \"200\", \"300\", \"400\", \"20220101\", \"New York\", \"New Jersey\"),\n"," Row(\"101\", \"201\", \"301\", \"401\", \"20220102\", \"Tempe\", \"Phoenix\"),\n"," Row(\"102\", \"202\", \"302\", \"402\", \"20220103\", \"San Jose\", \"San Franciso\"),\n"," Row(\"103\", \"203\", \"303\", \"403\", \"20220102\", \"New York\", \"Boston\"),\n"," Row(\"104\", \"204\", \"304\", \"404\", \"20220103\", \"New York\", \"Washington\"),\n"," Row(\"105\", \"205\", \"305\", \"405\", \"20220201\", \"Miami\", \"Fort Lauderdale\"),\n"," Row(\"106\", \"206\", \"306\", \"406\", \"20220202\", \"Seattle\", \"Redmond\"),\n"," Row(\"107\", \"207\", \"307\", \"407\", \"20220203\", \"Los Angeles\", \"San Diego\"),\n"," Row(\"108\", \"208\", \"308\", \"408\", \"20220301\", \"Phoenix\", \"Las Vegas\"),\n"," Row(\"109\", \"209\", \"309\", \"409\", \"20220302\", \"Washington\", \"Baltimore\"),\n"," Row(\"110\", \"210\", \"310\", \"410\", \"20220303\", \"Dallas\", \"Austin\"),\n"," Row(\"111\", \"211\", \"311\", \"411\", \"20220303\", \"New York\", \"New Jersey\"),\n"," Row(\"112\", \"212\", \"312\", \"412\", \"20220304\", \"New York\", \"Boston\"),\n"," Row(\"113\", \"212\", \"312\", \"412\", \"20220401\", \"San Jose\", \"San Ramon\"),\n"," Row(\"114\", \"212\", \"312\", \"412\", \"20220404\", \"San Jose\", \"Oakland\"),\n"," Row(\"115\", \"212\", \"312\", \"412\", \"20220404\", \"Tempe\", \"Scottsdale\"),\n"," Row(\"116\", \"212\", \"312\", \"412\", \"20220405\", \"Washington\", \"Atlanta\"),\n"," Row(\"117\", \"212\", \"312\", \"412\", \"20220405\", \"Seattle\", \"Portland\"),\n"," Row(\"118\", \"212\", \"312\", \"412\", \"20220405\", \"Miami\", \"Tampa\")\n",")\n","\n","// Write Trips to Parquet\n","val tripWriteDF = spark.createDataFrame(spark.sparkContext.parallelize(tripData),tripSchema)\n","tripWriteDF.write.mode(\"overwrite\").parquet(tripsParquetPath)\n","\n","val driverSchema = new StructType().add(\"driverId\", StringType).add(\"name\", StringType).add(\"license\",StringType).add(\"gender\",StringType).add(\"salary\",IntegerType)\n","\n","val driverData = Seq(\n"," Row(\"200\", \"Alice\", \"A224455\", \"Female\", 3000),\n"," Row(\"202\", \"Bryan\",\"B992244\",\"Male\",4000),\n"," Row(\"204\", \"Catherine\",\"C887733\",\"Female\",4000),\n"," Row(\"208\", \"Daryl\",\"D229988\",\"Male\",3000),\n"," Row(\"212\", \"Jenny\",\"J663300\",\"Female\", 5000)\n",")\n","// Write Driver to Parquet\n","val driverWriteDF = spark.createDataFrame(spark.sparkContext.parallelize(driverData),driverSchema)\n","driverWriteDF.write.mode(\"overwrite\").parquet(driverParquetPath)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"534b2333-eaf9-43f9-9085-0c7ed76fa580","showTitle":false,"title":""},"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%python\n","# Let us read the data from Parquet files and view them as a Dataframe using Python and SQL now\n","\n","storageAccountName = \"\"\n","fileSystemName = \"\"\n","commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","df = spark.read.load( commonPath + '/parquet/trips/*.parquet', 
format='parquet')\n","df.printSchema()\n","\n","spark.sql(\"DROP DATABASE TripsDatabase\")\n","\n","spark.sql(\"CREATE DATABASE IF NOT EXISTS TripsDatabase\")\n","df.write.mode(\"overwrite\").option(\"overwriteSchema\", \"true\").saveAsTable(\"TripsTable\")\n","\n","sqldf = spark.sql(\"SELECT * FROM TripsTable\") \n","display(sqldf)\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"626065bb-f5c6-4be1-946e-a5dd16de5412","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"Command skipped","errorTraceType":"html","metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["df = spark.read.load( commonPath + '/parquet/trips/*.parquet', format='parquet')\n","spark.sql(\"CREATE DATABASE IF NOT EXISTS TripsDatabase\")\n","df.write.mode(\"overwrite\").option(\"overwriteSchema\", \"true\").saveAsTable(\"TripsTable\")\n","sqldf = spark.sql(\"\"\"\n"," SELECT COUNT(*) AS Trips, \n"," startLocation AS Location \n"," FROM TripsTable \n"," GROUP BY startLocation \"\"\") \n","display(sqldf)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eb8ecfbe-3a99-4fa9-8d5e-40e6e9527705","showTitle":false,"title":""}},"outputs":[],"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":2},"notebookName":"ParquetWithSynSpark-C7","notebookOrigID":188113580888537,"widgets":{}},"kernelspec":{"display_name":"python","name":"synapse_pyspark"},"language_info":{"name":"python"},"save_output":true,"synapse_widget":{"state":{},"version":"0.1"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter02/SynSQLDistributions-C2.sql: -------------------------------------------------------------------------------- 1 | -- Synapse SQL Distribution Strategy 2 | 3 | -- Here is a simple example of creating a round-robin distributed table in Dedicated SQL Pool. 
4 | -- Round robin is the default option, so you don't have to specify the DISTRIBUTION OPTION 5 | -- DROP TABLE dbo.CabTable1; 6 | 7 | CREATE TABLE dbo.CabTable1 8 | ( 9 | [cabId] INT NOT NULL, 10 | [driverName] VARCHAR(20), 11 | [driverLicense] VARCHAR(20) 12 | ) 13 | GO 14 | 15 | INSERT INTO dbo.CabTable1 VALUES (100, 'Adam', 'A12345'); 16 | INSERT INTO dbo.CabTable1 VALUES (101, 'Brian', 'B54321'); 17 | INSERT INTO dbo.CabTable1 VALUES (102, 'Cathy', 'C98765'); 18 | 19 | 20 | -- Here is a simple example of creating a hash distributed table in Dedicated SQL Pool: 21 | -- DROP TABLE dbo.CabTable2; 22 | 23 | CREATE TABLE dbo.CabTable2 24 | ( 25 | [cabId] INT NOT NULL, 26 | [driverName] VARCHAR(20), 27 | [driverLicense] VARCHAR(20) 28 | ) 29 | WITH 30 | ( 31 | DISTRIBUTION = HASH (cabId) 32 | ) 33 | GO 34 | 35 | INSERT INTO dbo.CabTable2 VALUES (100, 'Adam', 'A12345'); 36 | INSERT INTO dbo.CabTable2 VALUES (101, 'Brian', 'B54321'); 37 | INSERT INTO dbo.CabTable2 VALUES (102, 'Cathy', 'C98765'); 38 | 39 | 40 | -- Here is a simple example of creating a replicated distributed table in Dedicated SQL Pool: 41 | -- DROP TABLE dbo.CabTable3; 42 | 43 | CREATE TABLE dbo.CabTable3 44 | ( 45 | [cabId] INT NOT NULL, 46 | [driverName] VARCHAR(20), 47 | [driverLicense] VARCHAR(20) 48 | ) 49 | WITH 50 | ( 51 | DISTRIBUTION = REPLICATE 52 | ) 53 | GO 54 | 55 | 56 | INSERT INTO dbo.CabTable3 VALUES (100, 'Adam', 'A12345'); 57 | INSERT INTO dbo.CabTable3 VALUES (101, 'Brian', 'B54321'); 58 | INSERT INTO dbo.CabTable3 VALUES (102, 'Cathy', 'C98765'); 59 | 60 | 61 | -- You can use the below query to check the table details including the distribution strategy used. 62 | -- The query is directly available from Azure Synapse SQL documentation: 63 | -- https://docs.microsoft.com/en-us/azure/synapse-analytics/sql/develop-tables-overview#table-size-queries 64 | 65 | CREATE VIEW dbo.vTableSizes 66 | AS 67 | WITH base 68 | AS 69 | ( 70 | SELECT 71 | GETDATE() AS [execution_time] 72 | , DB_NAME() AS [database_name] 73 | , s.name AS [schema_name] 74 | , t.name AS [table_name] 75 | , QUOTENAME(s.name)+'.'+QUOTENAME(t.name) AS [two_part_name] 76 | , nt.[name] AS [node_table_name] 77 | , ROW_NUMBER() OVER(PARTITION BY nt.[name] ORDER BY (SELECT NULL)) AS [node_table_name_seq] 78 | , tp.[distribution_policy_desc] AS [distribution_policy_name] 79 | , c.[name] AS [distribution_column] 80 | , nt.[distribution_id] AS [distribution_id] 81 | , i.[type] AS [index_type] 82 | , i.[type_desc] AS [index_type_desc] 83 | , nt.[pdw_node_id] AS [pdw_node_id] 84 | , pn.[type] AS [pdw_node_type] 85 | , pn.[name] AS [pdw_node_name] 86 | , di.name AS [dist_name] 87 | , di.position AS [dist_position] 88 | , nps.[partition_number] AS [partition_nmbr] 89 | , nps.[reserved_page_count] AS [reserved_space_page_count] 90 | , nps.[reserved_page_count] - nps.[used_page_count] AS [unused_space_page_count] 91 | , nps.[in_row_data_page_count] 92 | + nps.[row_overflow_used_page_count] 93 | + nps.[lob_used_page_count] AS [data_space_page_count] 94 | , nps.[reserved_page_count] 95 | - (nps.[reserved_page_count] - nps.[used_page_count]) 96 | - ([in_row_data_page_count] 97 | + [row_overflow_used_page_count]+[lob_used_page_count]) AS [index_space_page_count] 98 | , nps.[row_count] AS [row_count] 99 | from 100 | sys.schemas s 101 | INNER JOIN sys.tables t 102 | ON s.[schema_id] = t.[schema_id] 103 | INNER JOIN sys.indexes i 104 | ON t.[object_id] = i.[object_id] 105 | AND i.[index_id] <= 1 106 | INNER JOIN sys.pdw_table_distribution_properties tp 107 | ON 
t.[object_id] = tp.[object_id] 108 | INNER JOIN sys.pdw_table_mappings tm 109 | ON t.[object_id] = tm.[object_id] 110 | INNER JOIN sys.pdw_nodes_tables nt 111 | ON tm.[physical_name] = nt.[name] 112 | INNER JOIN sys.dm_pdw_nodes pn 113 | ON nt.[pdw_node_id] = pn.[pdw_node_id] 114 | INNER JOIN sys.pdw_distributions di 115 | ON nt.[distribution_id] = di.[distribution_id] 116 | INNER JOIN sys.dm_pdw_nodes_db_partition_stats nps 117 | ON nt.[object_id] = nps.[object_id] 118 | AND nt.[pdw_node_id] = nps.[pdw_node_id] 119 | AND nt.[distribution_id] = nps.[distribution_id] 120 | LEFT OUTER JOIN (select * from sys.pdw_column_distribution_properties where distribution_ordinal = 1) cdp 121 | ON t.[object_id] = cdp.[object_id] 122 | LEFT OUTER JOIN sys.columns c 123 | ON cdp.[object_id] = c.[object_id] 124 | AND cdp.[column_id] = c.[column_id] 125 | WHERE pn.[type] = 'COMPUTE' 126 | ) 127 | , size 128 | AS 129 | ( 130 | SELECT 131 | [execution_time] 132 | , [database_name] 133 | , [schema_name] 134 | , [table_name] 135 | , [two_part_name] 136 | , [node_table_name] 137 | , [node_table_name_seq] 138 | , [distribution_policy_name] 139 | , [distribution_column] 140 | , [distribution_id] 141 | , [index_type] 142 | , [index_type_desc] 143 | , [pdw_node_id] 144 | , [pdw_node_type] 145 | , [pdw_node_name] 146 | , [dist_name] 147 | , [dist_position] 148 | , [partition_nmbr] 149 | , [reserved_space_page_count] 150 | , [unused_space_page_count] 151 | , [data_space_page_count] 152 | , [index_space_page_count] 153 | , [row_count] 154 | , ([reserved_space_page_count] * 8.0) AS [reserved_space_KB] 155 | , ([reserved_space_page_count] * 8.0)/1000 AS [reserved_space_MB] 156 | , ([reserved_space_page_count] * 8.0)/1000000 AS [reserved_space_GB] 157 | , ([reserved_space_page_count] * 8.0)/1000000000 AS [reserved_space_TB] 158 | , ([unused_space_page_count] * 8.0) AS [unused_space_KB] 159 | , ([unused_space_page_count] * 8.0)/1000 AS [unused_space_MB] 160 | , ([unused_space_page_count] * 8.0)/1000000 AS [unused_space_GB] 161 | , ([unused_space_page_count] * 8.0)/1000000000 AS [unused_space_TB] 162 | , ([data_space_page_count] * 8.0) AS [data_space_KB] 163 | , ([data_space_page_count] * 8.0)/1000 AS [data_space_MB] 164 | , ([data_space_page_count] * 8.0)/1000000 AS [data_space_GB] 165 | , ([data_space_page_count] * 8.0)/1000000000 AS [data_space_TB] 166 | , ([index_space_page_count] * 8.0) AS [index_space_KB] 167 | , ([index_space_page_count] * 8.0)/1000 AS [index_space_MB] 168 | , ([index_space_page_count] * 8.0)/1000000 AS [index_space_GB] 169 | , ([index_space_page_count] * 8.0)/1000000000 AS [index_space_TB] 170 | FROM base 171 | ) 172 | SELECT * 173 | FROM size 174 | ; 175 | 176 | SELECT 177 | distribution_policy_name 178 | , SUM(row_count) as table_type_row_count 179 | , SUM(reserved_space_GB) as table_type_reserved_space_GB 180 | , SUM(data_space_GB) as table_type_data_space_GB 181 | , SUM(index_space_GB) as table_type_index_space_GB 182 | , SUM(unused_space_GB) as table_type_unused_space_GB 183 | FROM dbo.vTableSizes 184 | GROUP BY distribution_policy_name 185 | ; 186 | 187 | -------------------------------------------------------------------------------- /Chapter10/EventHub-StructuredStreaming-C10.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2dd5e97c-10e9-4b4a-a9e9-c7e7839876b3","showTitle":false,"title":""}},"source":["Use the following Azure Databricks 
storage setup block only if you are using Azure Databricks. You can refer to the instructions here to get started:\n","https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n","\n","If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"2b8d08f4-fd94-4f7a-8637-d985ac5f8ac0","showTitle":false,"title":""}},"outputs":[],"source":["%scala\n","val storageAccountName = \"\"\n","val fileSystemName = \"\"\n","\n","val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n","\n","# AAD Application Details\n","val appID = \"\"\n","val secret = \"\"\n","val tenantID = \"\"\n","\n","spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n","spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n","spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n","dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n","spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"993265cf-c577-4669-9584-708cacb50c38","showTitle":false,"title":""}},"source":["Install the Event Hub library in the spark cluster before proceeding to the next step.\n","Spark Event Hubs connector - com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21 for Spark 3.x"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"02b15aed-f166-4b4c-90cb-37c8013a305b","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["\n","
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"arguments":{},"data":"
","datasetInfos":[],"metadata":{},"removedWidgets":[],"type":"html"}},"output_type":"display_data"}],"source":["from pyspark.sql.functions import *\n","from pyspark.sql.types import *\n","\n","# connection string of Event Hubs Namespace\n","EHConnectionString = \"\"\n","\n","EHConfig = {}\n","EHConfig['eventhubs.connectionString'] = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(EHConnectionString)\n","\n","EHStreamDF = spark.readStream.format(\"eventhubs\").options(**EHConfig).load()"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ab86425b-05f7-47ba-b4ab-bdd731b40299","showTitle":false,"title":""}},"outputs":[],"source":["print( EHStreamDF.isStreaming)\n","print( EHStreamDF.printSchema())"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0e1f2051-8afe-448d-bfc0-3b20b879ef2e","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["\n","
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"arguments":{},"data":"
","datasetInfos":[],"metadata":{},"removedWidgets":[],"type":"html"}},"output_type":"display_data"}],"source":["JsonSchema = StructType() \\\n",".add(\"tripId\", StringType()) \\\n",".add(\"createdAt\", TimestampType()) \\\n",".add(\"startLocation\", StringType()) \\\n",".add(\"endLocation\", StringType()) \\\n",".add(\"distance\", IntegerType()) \\\n",".add(\"fare\", IntegerType())"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b9b6198-9d5e-4e09-addb-4d13ddbaf9f6","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["\n","
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"arguments":{},"data":"
","datasetInfos":[],"metadata":{},"removedWidgets":[],"type":"html"}},"output_type":"display_data"}],"source":["stringDF=EHStreamDF.selectExpr(\"CAST(body AS STRING)\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b06c7635-8ca6-452c-a1be-803103f37fd4","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["\n","
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"arguments":{},"data":"
","datasetInfos":[],"metadata":{},"removedWidgets":[],"type":"html"}},"output_type":"display_data"}],"source":["jsonDF=stringDF.withColumn('tripjson', from_json(col('body'),schema=JsonSchema))"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3f3d037e-3778-4f0e-8a4d-c41837044aea","showTitle":false,"title":""}},"outputs":[],"source":["EHStreamJsonDF=jsonDF.select(\"tripjson.*\")\n","display(EHStreamJsonDF)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"24488233-9715-490b-8ef9-a62d5d7e8364","showTitle":false,"title":""}},"outputs":[],"source":["EHStreamJsonDF.selectExpr(\n"," \"tripId\"\\\n"," ,\"createdAt\"\\\n"," ,\"startLocation\"\\\n"," ,\"endLocation\"\\\n"," ,\"distance\"\\\n"," ,\"fare\")\\\n",".writeStream.format(\"delta\")\\\n",".outputMode(\"append\")\\\n",".option(\"checkpointLocation\", \"dbfs:/tripsCheckpointLocation1/\")\\\n",".option(\"mergeSchema\", \"true\")\\\n",".start(\"dbfs:/TripsEventHubDelta1\")"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"aa07eb65-b2c7-4370-88be-cc41e0879d7d","showTitle":false,"title":""}},"outputs":[],"source":["EHStreamJsonDF.groupBy(window('createdAt',\"1 minutes\"),'startLocation').count().orderBy('window') \\\n",".writeStream.format(\"delta\") \\\n",".outputMode(\"complete\") \\\n",".option(\"truncate\", \"false\") \\\n",".option(\"checkpointLocation\", \"dbfs:/tripsCheckpointLocationTumbling2/\") \\\n",".option(\"mergeSchema\", \"true\") \\\n",".start(\"dbfs:/TripsEventHubDeltaTumbling2\") "]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"772bf132-9a20-4ca5-9aa9-06250eb475f5","showTitle":false,"title":""}},"outputs":[],"source":["EHStreamJsonDF.groupBy(window('createdAt',\"1 minutes\"),'startLocation').count().orderBy('window')\\\n",".writeStream.format(\"memory\") \\\n",".outputMode(\"complete\") \\\n",".option(\"truncate\", \"false\") \\\n",".option(\"checkpointLocation\", \"dbfs:/tripsCheckpointLocation/\") \\\n",".queryName(\"TripsTumblingQuery\").start() "]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"89ce537b-bc50-4bac-a0b5-e3570ee3a7be","showTitle":false,"title":""}},"outputs":[],"source":["%sql\n","SELECT * FROM TripsTumblingQuery ORDER BY Window desc"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"77588e88-f9ba-492f-a057-86dd6eb5a950","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"aggData":[],"aggError":"","aggOverflow":false,"aggSchema":[],"aggSeriesLimitReached":false,"aggType":"","arguments":{},"columnCustomDisplayInfos":{},"data":[],"datasetInfos":[],"dbfsResultPath":null,"isJsonSchema":true,"metadata":{},"overflow":false,"plotOptions":{"customPlotOptions":{},"displayType":"table","pivotAggregation":null,"pivotColumns":[],"xColumns":[],"yColumns":[]},"removedWidgets":[],"schema":[],"type":"table"}},"output_type":"display_data"}],"source":["%sql\n","DROP TABLE IF EXISTS TripsAggTumbling;\n","CREATE TABLE IF NOT EXISTS TripsAggTumbling\n","USING DELTA\n","LOCATION \"dbfs:/TripsEventHubDeltaTumbling2/\""]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3d156399-f46a-4aa3-8522-be24ed17e908","showTitle":false,"title":""}},"outputs":[],"source":["%sql\n","SELECT * FROM TripsAggTumbling ORDER BY Window desc"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d78e9c26-8886-418e-9ed5-ec3f11f45a7b","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":["\n","
"]},"metadata":{"application/vnd.databricks.v1+output":{"addedWidgets":{},"arguments":{},"data":"
","datasetInfos":[],"metadata":{},"removedWidgets":[],"type":"html"}},"output_type":"display_data"}],"source":["for s in spark.streams.active:\n"," s.stop()"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fb2e8b5c-640e-468d-8a42-cf39e224718b","showTitle":false,"title":""}},"outputs":[],"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"EventHub-C10","notebookOrigID":2621222552226268,"widgets":{}},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter14/HyperspaceIndexing-C14.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Use the following Azure Databricks storage setup block only if you are using Azure Databricks. You can refer to the instructions here to get started:\n", 8 | "https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access\n", 9 | "\n", 10 | "If you are using Synapse Spark and if your data is residing on the storage attached to the Synapse Spark workspace, you can skip the below storage setup section." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "%scala\n", 20 | "val storageAccountName = \"\"\n", 21 | "val fileSystemName = \"\"\n", 22 | "\n", 23 | "val commonPath = \"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net\"\n", 24 | "\n", 25 | "# AAD Application Details\n", 26 | "val appID = \"\"\n", 27 | "val secret = \"\"\n", 28 | "val tenantID = \"\"\n", 29 | "\n", 30 | "spark.conf.set(\"fs.azure.account.auth.type.\" + storageAccountName + \".dfs.core.windows.net\", \"OAuth\")\n", 31 | "spark.conf.set(\"fs.azure.account.oauth.provider.type.\" + storageAccountName + \".dfs.core.windows.net\", \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\")\n", 32 | "spark.conf.set(\"fs.azure.account.oauth2.client.id.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + appID + \"\")\n", 33 | "spark.conf.set(\"fs.azure.account.oauth2.client.secret.\" + storageAccountName + \".dfs.core.windows.net\", \"\" + secret + \"\")\n", 34 | "spark.conf.set(\"fs.azure.account.oauth2.client.endpoint.\" + storageAccountName + \".dfs.core.windows.net\", \"https://login.microsoftonline.com/\" + tenantID + \"/oauth2/token\")\n", 35 | "spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"true\")\n", 36 | "dbutils.fs.ls(\"abfss://\" + fileSystemName + \"@\" + storageAccountName + \".dfs.core.windows.net/\")\n", 37 | "spark.conf.set(\"fs.azure.createRemoteFileSystemDuringInitialization\", \"false\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import org.apache.spark.sql.{DataFrame, Row, SaveMode}\n", 47 | "import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}\n", 48 | "\n", 49 | "val tripsParquetPath = commonPath + \"/hyperspace/trips/\"\n", 50 | "val driverParquetPath = commonPath + \"/hyperspace/driver/\"\n", 51 | "\n", 52 | "// Generate sample trips data\n", 53 | "val tripSchema = new StructType().add(\"tripId\", 
StringType).add(\"driverId\", StringType).add(\"customerId\",StringType).add(\"cabId\",StringType).add(\"tripDate\",StringType).add(\"startLocation\",StringType).add(\"endLocation\",StringType)\n", 54 | "\n", 55 | "val tripData = Seq(\n", 56 | " Row(\"100\", \"200\", \"300\", \"400\", \"20220101\", \"New York\", \"New Jersey\"),\n", 57 | " Row(\"101\", \"201\", \"301\", \"401\", \"20220102\", \"Tempe\", \"Phoenix\"),\n", 58 | " Row(\"102\", \"202\", \"302\", \"402\", \"20220103\", \"San Jose\", \"San Franciso\"),\n", 59 | " Row(\"103\", \"203\", \"303\", \"403\", \"20220102\", \"New York\", \"Boston\"),\n", 60 | " Row(\"104\", \"204\", \"304\", \"404\", \"20220103\", \"New York\", \"Washington\"),\n", 61 | " Row(\"105\", \"205\", \"305\", \"405\", \"20220201\", \"Miami\", \"Fort Lauderdale\"),\n", 62 | " Row(\"106\", \"206\", \"306\", \"406\", \"20220202\", \"Seattle\", \"Redmond\"),\n", 63 | " Row(\"107\", \"207\", \"307\", \"407\", \"20220203\", \"Los Angeles\", \"San Diego\"),\n", 64 | " Row(\"108\", \"208\", \"308\", \"408\", \"20220301\", \"Phoenix\", \"Las Vegas\"),\n", 65 | " Row(\"109\", \"209\", \"309\", \"409\", \"20220302\", \"Washington\", \"Baltimore\"),\n", 66 | " Row(\"110\", \"210\", \"310\", \"410\", \"20220303\", \"Dallas\", \"Austin\"),\n", 67 | " Row(\"111\", \"211\", \"311\", \"411\", \"20220303\", \"New York\", \"New Jersey\"),\n", 68 | " Row(\"112\", \"212\", \"312\", \"412\", \"20220304\", \"New York\", \"Boston\"),\n", 69 | " Row(\"113\", \"212\", \"312\", \"412\", \"20220401\", \"San Jose\", \"San Ramon\"),\n", 70 | " Row(\"114\", \"212\", \"312\", \"412\", \"20220404\", \"San Jose\", \"Oakland\"),\n", 71 | " Row(\"115\", \"212\", \"312\", \"412\", \"20220404\", \"Tempe\", \"Scottsdale\"),\n", 72 | " Row(\"116\", \"212\", \"312\", \"412\", \"20220405\", \"Washington\", \"Atlanta\"),\n", 73 | " Row(\"117\", \"212\", \"312\", \"412\", \"20220405\", \"Seattle\", \"Portland\"),\n", 74 | " Row(\"118\", \"212\", \"312\", \"412\", \"20220405\", \"Miami\", \"Tampa\")\n", 75 | ")\n", 76 | "\n", 77 | "// Write Trips to Parquet\n", 78 | "val tripWriteDF = spark.createDataFrame(spark.sparkContext.parallelize(tripData),tripSchema)\n", 79 | "tripWriteDF.write.mode(\"overwrite\").parquet(tripsParquetPath)\n", 80 | "\n", 81 | "val driverSchema = new StructType().add(\"driverId\", StringType).add(\"name\", StringType).add(\"license\",StringType).add(\"gender\",StringType).add(\"salary\",IntegerType)\n", 82 | "\n", 83 | "val driverData = Seq(\n", 84 | " Row(\"200\", \"Alice\", \"A224455\", \"Female\", 3000),\n", 85 | " Row(\"202\", \"Bryan\",\"B992244\",\"Male\",4000),\n", 86 | " Row(\"204\", \"Catherine\",\"C887733\",\"Female\",4000),\n", 87 | " Row(\"208\", \"Daryl\",\"D229988\",\"Male\",3000),\n", 88 | " Row(\"212\", \"Jenny\",\"J663300\",\"Female\", 5000)\n", 89 | ")\n", 90 | "// Write Driver to Parquet\n", 91 | "val driverWriteDF = spark.createDataFrame(spark.sparkContext.parallelize(driverData),driverSchema)\n", 92 | "driverWriteDF.write.mode(\"overwrite\").parquet(driverParquetPath)\n", 93 | "\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "jupyter": { 101 | "outputs_hidden": false, 102 | "source_hidden": false 103 | }, 104 | "nteract": { 105 | "transient": { 106 | "deleting": false 107 | } 108 | } 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "// Let us read back the files to check if the data is showing up correctly\n", 113 | "val tripsDF: DataFrame = spark.read.parquet(tripsParquetPath)\n", 114 | "val 
driverDF: DataFrame = spark.read.parquet(driverParquetPath)\n", 115 | "\n", 116 | "// Verify the data is available and correct\n", 117 | "tripsDF.show()\n", 118 | "driverDF.show()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "jupyter": { 126 | "outputs_hidden": false, 127 | "source_hidden": false 128 | }, 129 | "nteract": { 130 | "transient": { 131 | "deleting": false 132 | } 133 | } 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "// Now let us try to join the tables and create a query, which we can later optimize using Hyperspace indexing\n", 138 | "val driverFilter: DataFrame = tripsDF.join(driverDF, tripsDF(\"driverId\") === driverDF(\"driverId\")).select(tripsDF(\"tripId\"), driverDF(\"name\"))\n", 139 | "driverFilter.show()\n", 140 | "\n", 141 | "driverFilter.explain(true)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "jupyter": { 149 | "outputs_hidden": false, 150 | "source_hidden": false 151 | }, 152 | "nteract": { 153 | "transient": { 154 | "deleting": false 155 | } 156 | } 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "// Let us try the same query with Hyperspace enabled now\n", 161 | "\n", 162 | "// Create an instance of Hyperspace\n", 163 | "import com.microsoft.hyperspace._\n", 164 | "import com.microsoft.hyperspace.index._\n", 165 | "\n", 166 | "val hs: Hyperspace = Hyperspace()\n", 167 | "\n", 168 | "// Delete and vacuum the indexes if you are rerunning this notebook\n", 169 | "//hs.deleteIndex(\"TripIndex\")\n", 170 | "//hs.deleteIndex(\"DriverIndex\")\n", 171 | "//hs.vacuumIndex(\"TripIndex\")\n", 172 | "//hs.vacuumIndex(\"DriverIndex\")\n", 173 | "\n", 174 | "// Create the trips and driver indexes\n", 175 | "hs.createIndex(tripsDF, IndexConfig(\"TripIndex\", indexedColumns = Seq(\"driverId\"), includedColumns = Seq(\"tripId\")))\n", 176 | "hs.createIndex(driverDF, IndexConfig(\"DriverIndex\", indexedColumns = Seq(\"driverId\"), includedColumns = Seq(\"name\")))\n", 177 | "\n", 178 | "// List the indexes to check if the new indexes have been created\n", 179 | "hs.indexes.show()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "jupyter": { 187 | "outputs_hidden": false, 188 | "source_hidden": false 189 | }, 190 | "nteract": { 191 | "transient": { 192 | "deleting": false 193 | } 194 | } 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "// Enable Hyperspace\n", 199 | "spark.enableHyperspace\n", 200 | "\n", 201 | "// Read back the same trip and driver parquet data into dataframes again\n", 202 | "val tripIndexDF: DataFrame = spark.read.parquet(tripsParquetPath)\n", 203 | "val driverIndexDF: DataFrame = spark.read.parquet(driverParquetPath)\n", 204 | "\n", 205 | "tripIndexDF.show(5)\n", 206 | "driverIndexDF.show(5)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "jupyter": { 214 | "outputs_hidden": false, 215 | "source_hidden": false 216 | }, 217 | "nteract": { 218 | "transient": { 219 | "deleting": false 220 | } 221 | } 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "// Run the join query again\n", 226 | "val filterJoin: DataFrame = tripIndexDF.join(driverIndexDF, tripIndexDF(\"driverId\") === driverIndexDF(\"driverId\")).select(tripIndexDF(\"tripId\"), driverIndexDF(\"name\"))\n", 227 | "filterJoin.show()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 |
"metadata": { 234 | "jupyter": { 235 | "outputs_hidden": false, 236 | "source_hidden": false 237 | }, 238 | "nteract": { 239 | "transient": { 240 | "deleting": false 241 | } 242 | } 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "// Check the comparision of the queryplan with and without Index\n", 247 | "\n", 248 | "spark.conf.set(\"spark.hyperspace.explain.displayMode\", \"html\")\n", 249 | "hs.explain(filterJoin)(displayHTML(_))" 250 | ] 251 | } 252 | ], 253 | "metadata": { 254 | "description": null, 255 | "kernel_info": { 256 | "name": "synapse_pyspark" 257 | }, 258 | "kernelspec": { 259 | "display_name": "Synapse PySpark", 260 | "language": "Python", 261 | "name": "synapse_pyspark" 262 | }, 263 | "language_info": { 264 | "name": "scala" 265 | }, 266 | "save_output": true, 267 | "synapse_widget": { 268 | "state": {}, 269 | "version": "0.1" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 2 274 | } 275 | --------------------------------------------------------------------------------