├── README.md
├── databricks_workouts_2025_WE47
│   ├── 1_DATABRICKS_NOTEBOOK_FUNDAMENTALS
│   │   ├── 4_child_notebook.py
│   │   ├── 3_Notebook_workflow_utils_notebooks_widgets_invoking_passing_params.py
│   │   ├── 2_Explore_Notebook_Markdowns.ipynb
│   │   └── 1_Explore_Notebooks_magic_commands.ipynb
│   ├── 1_USECASES_NB_FUNDAMENTALS
│   │   ├── 1_Usecase_Explore_Notebooks_magic_commands.ipynb
│   │   ├── 4_child_nb_dataload.ipynb
│   │   └── 2_Usecase_md_dbutils_widgets.ipynb
│   └── 2_Spark_DataFrame_Read_Write_Operations
│       ├── read_write_usecases.ipynb
│       └── 3-Basic-WriteOps.ipynb
├── databricks_workouts_2025
│   ├── 2_Spark_DataFrame_Read_Write_Operations
│   │   ├── 4-Advanced-WriteOps.ipynb
│   │   ├── 2-Advanced-Readops.ipynb
│   │   ├── read_write_usecases.ipynb
│   │   └── 3-Basic-WriteOps.ipynb
│   ├── 1_DATABRICKS_NOTEBOOK_FUNDAMENTALS
│   │   ├── 4_child_notebook.py
│   │   ├── 3_Notebook_workflow_utils_notebooks_widgets_invoking_passing_params.py
│   │   ├── 2_Explore_Notebook_Markdowns.ipynb
│   │   └── 1_Explore_Notebooks_magic_commands.ipynb
│   └── 1_USECASES_NB_FUNDAMENTALS
│       ├── 1_Usecase_Explore_Notebooks_magic_commands.ipynb
│       ├── 4_child_nb_dataload.ipynb
│       └── 2_Usecase_md_dbutils_widgets.ipynb
├── oops_fundamentals_4.py
├── we47_local_notebooks
│   └── my_first_notebook.ipynb
└── LICENSE
/README.md:
--------------------------------------------------------------------------------
1 | # databricks-code-repo
2 | Repo to maintain the Databricks notebooks and other objects
3 |
--------------------------------------------------------------------------------
/databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC #Child notebook created to demonstrate calling a child notebook from a parent notebook
4 |
5 | # COMMAND ----------
6 |
7 | dbutils.widgets.text("table_name", "cities")
8 | table_name = dbutils.widgets.get("table_name")
9 | print(f"parameter passed is {table_name}")
10 | spark.sql(f"select * from {table_name}").show(2)
11 |
12 | # COMMAND ----------
13 |
14 | dbutils.notebook.exit("success")
15 |
--------------------------------------------------------------------------------
/databricks_workouts_2025/2_Spark_DataFrame_Read_Write_Operations/4-Advanced-WriteOps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {
4 | "application/vnd.databricks.v1+notebook": {
5 | "computePreferences": null,
6 | "dashboards": [],
7 | "environmentMetadata": {
8 | "base_environment": "",
9 | "environment_version": "3"
10 | },
11 | "inputWidgetPreferences": null,
12 | "language": "python",
13 | "notebookMetadata": {
14 | "pythonIndentUnit": 4
15 | },
16 | "notebookName": "4-Advanced-WriteOps",
17 | "widgets": {}
18 | },
19 | "language_info": {
20 | "name": "python"
21 | }
22 | },
23 | "nbformat": 4,
24 | "nbformat_minor": 0
25 | }
26 |
--------------------------------------------------------------------------------
/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC #Child notebook created to demonstrate calling a child notebook from a parent notebook
4 |
5 | # COMMAND ----------
6 |
7 | # MAGIC %sql
8 | # MAGIC select current_timestamp()
9 |
10 | # COMMAND ----------
11 |
12 | #dbutils.notebook.exit(0)
13 |
14 | # COMMAND ----------
15 |
16 | dbutils.widgets.text("table_name", "cust")
17 |
18 | # COMMAND ----------
19 |
20 | text_box_value=dbutils.widgets.get("table_name")
21 | print(text_box_value)
22 |
23 | # COMMAND ----------
24 |
25 | #Spark SQL
26 | spark.read.table(text_box_value).display()#domain specific lang(FBP)
27 | spark.sql(f"select * from {text_box_value}").display()#Declarative lang
28 |
29 | # COMMAND ----------
30 |
31 | dbutils.notebook.exit("notebook completed successfully")
32 |
--------------------------------------------------------------------------------
/oops_fundamentals_4.py:
--------------------------------------------------------------------------------
1 | #user -> cc agent -> cost installation_util of product
2 | #siva added something
3 | #from pyspark.sql.session import SparkSession
4 | #pkg/subpkg/module/class/obj/const
5 | #functions (75%) fbp- pkg.subpkg.module.functions
6 | #class/functions (25%) oops+fbp-pkg.subpkg.module.class.functions
7 |
8 | #OOP minimum concepts (class, members, self, object, constructor)
9 | #class - a template or blueprint program that contains related members (functions/variables/subclasses)
10 | #member - any variable/function/class defined inside the class is a member
11 | #self - the reserved first parameter that refers to the current object inside a member function
12 | #object - a memory instance/copy of a class
13 | #constructor - the method through which we construct/instantiate a class (creates the object in memory)
14 | #Types of constructor - non-parameterized, parameterized, default
15 |
16 | #a class is a main program that holds the subprograms
17 | #the xls class (template/blueprint) holds subprograms (functions) as tabs
18 | print("hello team - in Git")
19 | class xls:
20 | def tab1(self):
21 | pass
22 | def tab2(self):
23 | pass
24 |
25 | class prod_cost:
26 | def installation_cost(self):
27 | installation_cost=100+10
28 | return installation_cost
29 | def total_cost(self):
30 | total_cost=self.installation_cost()
31 | return total_cost
32 |
33 | mohan_open=xls()
34 | print(mohan_open.tab1())
35 | karthick_open=xls()
36 | print(karthick_open.tab1())
37 |
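38 | #A minimal sketch added for illustration (not part of the original lesson) of the constructor types
39 | #listed above: __init__ is the constructor and "self" refers to the object being constructed.
40 | class prod_cost_v2:
41 |     def __init__(self, base_cost=100, installation=10):   #parameterized constructor with default values
42 |         self.base_cost = base_cost
43 |         self.installation = installation
44 |     def total_cost(self):
45 |         return self.base_cost + self.installation
46 | 
47 | default_obj = prod_cost_v2()          #default/non-parameterized style call uses the default values
48 | custom_obj = prod_cost_v2(200, 25)    #parameterized instantiation
49 | print(default_obj.total_cost(), custom_obj.total_cost())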
--------------------------------------------------------------------------------
/databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/1_Usecase_Explore_Notebooks_magic_commands.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "74cb71bf-99b6-4b2a-bf2f-3259b376fd41",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | "######Task1: Document the Notebook Using mark down %md
\n",
17 | "A good Title
\n",
18 | "Description of the task
\n",
19 | "Your name in some color
\n",
20 | "Bring our Team photo from the given url \"https://fpimages.withfloats.com/actual/6929d1ac956d0a744b5c9822.jpeg\"
\n",
21 | "Use headings, bold, italics appropriately.
\n",
22 | "\n",
23 | "Task2: Create a volume namely usage_metrics using sql magic command %sql\n",
24 | "\n",
25 | "Task3: \n",
26 | "Create a child notebook \"4_child_nb_dataload\" and write code to load data, Using the requests library, perform api call to pull data from \"https://public.tableau.com/app/sample-data/mobile_os_usage.csv\" into a python variable using the magic command %py and write the data into the created volume \"/Volumes/workspace/default/usage_metrics/mobile_os_usage.csv\" using the above variable.text using the magic command dbutils.fs.put(\"volume\",variable.text,overwrite=True)\n",
27 | "\n",
28 | "Task4: Call the notebook 4_child_nb_dataload using the magic command %run\n",
29 | "\n",
30 | "Task5: list the file is created in the given volume or not and do the head of this file using fs magic command %fs \n",
31 | "\n",
32 | "Task6: Create a pyspark dataframe df1 reading the data from the above file using pyspark magic command %python\n",
33 | "\n",
34 | "Task7: Write the above dataframe df1 data into a databricks table called 'default.mobile_os_usage' using pyspark magic command %python\n",
35 | "\n",
36 | "Task8: Write sql query to display the data loaded into the table 'default.mobile_os_usage' using the pyspark magic command %python \n",
37 | "\n",
38 | "Task9: Create a python function to convert the given input to upper case\n",
39 | "\n",
40 | "Task10: Install pandas library using the pip python magic command %pip\n",
41 | "\n",
42 | "Task11: Import pandas, using pandas read_csv and display the output using the magic command %python\n",
43 | "\n",
44 | "Task12: echo \"Magic commands tasks completed\" using the linux shell magic command %sh "
45 | ]
46 | }
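,
{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"#A hedged sketch added for illustration of Task9: a python function that converts the given input to upper case\n",
"def to_upper(text: str) -> str:\n",
"    return text.upper()\n",
"\n",
"print(to_upper(\"magic commands tasks completed\"))"
]
}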
47 | ],
48 | "metadata": {
49 | "application/vnd.databricks.v1+notebook": {
50 | "computePreferences": {
51 | "hardware": {
52 | "accelerator": null,
53 | "gpuPoolId": null,
54 | "memory": null
55 | }
56 | },
57 | "dashboards": [],
58 | "environmentMetadata": {
59 | "base_environment": "",
60 | "environment_version": "4"
61 | },
62 | "inputWidgetPreferences": null,
63 | "language": "python",
64 | "notebookMetadata": {
65 | "mostRecentlyExecutedCommandWithImplicitDF": {
66 | "commandId": 7900721791748489,
67 | "dataframes": [
68 | "_sqldf"
69 | ]
70 | },
71 | "pythonIndentUnit": 4
72 | },
73 | "notebookName": "1_Usecase_Explore_Notebooks_magic_commands",
74 | "widgets": {}
75 | },
76 | "language_info": {
77 | "name": "python"
78 | }
79 | },
80 | "nbformat": 4,
81 | "nbformat_minor": 0
82 | }
83 |
--------------------------------------------------------------------------------
/databricks_workouts_2025_WE47/1_USECASES_NB_FUNDAMENTALS/1_Usecase_Explore_Notebooks_magic_commands.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "74cb71bf-99b6-4b2a-bf2f-3259b376fd41",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | "######Task1: Document the Notebook Using mark down %md
\n",
17 | "A good Title
\n",
18 | "Description of the task
\n",
19 | "Your name in some color
\n",
20 | "Bring our Team photo from the given url \"https://fpimages.withfloats.com/actual/6936e213e40c3ddda3969dd0.jpeg\"
\n",
21 | "Use headings, bold, italics appropriately.
\n",
22 | "\n",
23 | "Task2: Create a volume namely usage_metrics using sql magic command %sql\n",
24 | "\n",
25 | "Task3: \n",
26 | "Create a child notebook \"4_child_nb_dataload\" and write code to load data, Using the requests library, perform api call to pull data from \"https://public.tableau.com/app/sample-data/mobile_os_usage.csv\" into a python variable using the magic command %py and write the data into the created volume \"/Volumes/workspace/default/usage_metrics/mobile_os_usage.csv\" using the above variable.text using the magic command dbutils.fs.put(\"volume\",variable.text,overwrite=True)\n",
27 | "\n",
28 | "Task4: Call the notebook 4_child_nb_dataload using the magic command %run\n",
29 | "\n",
30 | "Task5: list the file is created in the given volume or not and do the head of this file using fs magic command %fs \n",
31 | "\n",
32 | "Task6: Create a pyspark dataframe df1 reading the data from the above file using pyspark magic command %python\n",
33 | "\n",
34 | "Task7: Write the above dataframe df1 data into a databricks table called 'default.mobile_os_usage' using pyspark magic command %python\n",
35 | "\n",
36 | "Task8: Write sql query to display the data loaded into the table 'default.mobile_os_usage' using the pyspark magic command %python \n",
37 | "\n",
38 | "Task9: Create a python function to convert the given input to upper case\n",
39 | "\n",
40 | "Task10: Install pandas library using the pip python magic command %pip\n",
41 | "\n",
42 | "Task11: Import pandas, using pandas read_csv and display the output using the magic command %python\n",
43 | "\n",
44 | "Task12: echo \"Magic commands tasks completed\" using the linux shell magic command %sh "
45 | ]
46 | }
47 | ],
48 | "metadata": {
49 | "application/vnd.databricks.v1+notebook": {
50 | "computePreferences": {
51 | "hardware": {
52 | "accelerator": null,
53 | "gpuPoolId": null,
54 | "memory": null
55 | }
56 | },
57 | "dashboards": [],
58 | "environmentMetadata": {
59 | "base_environment": "",
60 | "environment_version": "4"
61 | },
62 | "inputWidgetPreferences": null,
63 | "language": "python",
64 | "notebookMetadata": {
65 | "mostRecentlyExecutedCommandWithImplicitDF": {
66 | "commandId": 7900721791748489,
67 | "dataframes": [
68 | "_sqldf"
69 | ]
70 | },
71 | "pythonIndentUnit": 4
72 | },
73 | "notebookName": "1_Usecase_Explore_Notebooks_magic_commands",
74 | "widgets": {}
75 | },
76 | "language_info": {
77 | "name": "python"
78 | }
79 | },
80 | "nbformat": 4,
81 | "nbformat_minor": 0
82 | }
83 |
--------------------------------------------------------------------------------
/we47_local_notebooks/my_first_notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "7fa1dbe6-20db-4359-8df7-7866e701f032",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | ""
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "application/vnd.databricks.v1+cell": {
23 | "cellMetadata": {},
24 | "inputWidgets": {},
25 | "nuid": "fbd8ec32-720f-42d9-bd51-45b397b9d381",
26 | "showTitle": false,
27 | "tableResultSettingsMap": {},
28 | "title": ""
29 | }
30 | },
31 | "source": [
32 | "#Lets learn about how to work in notebooks"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {
38 | "application/vnd.databricks.v1+cell": {
39 | "cellMetadata": {},
40 | "inputWidgets": {},
41 | "nuid": "29059821-bbb4-4ad0-bfab-8861666501b5",
42 | "showTitle": false,
43 | "tableResultSettingsMap": {},
44 | "title": ""
45 | }
46 | },
47 | "source": [
48 | "## Lets create markdown designs"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {
54 | "application/vnd.databricks.v1+cell": {
55 | "cellMetadata": {},
56 | "inputWidgets": {},
57 | "nuid": "3950f092-165c-44b7-b6de-d847624b9dc7",
58 | "showTitle": false,
59 | "tableResultSettingsMap": {},
60 | "title": ""
61 | }
62 | },
63 | "source": [
64 | "##Lets learn magic commands"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {
70 | "application/vnd.databricks.v1+cell": {
71 | "cellMetadata": {},
72 | "inputWidgets": {},
73 | "nuid": "0ebbd436-b0ec-42b4-99dd-84ef6371cb04",
74 | "showTitle": false,
75 | "tableResultSettingsMap": {},
76 | "title": ""
77 | }
78 | },
79 | "source": [
80 | "###Lets learn %sh magic command"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 0,
86 | "metadata": {
87 | "application/vnd.databricks.v1+cell": {
88 | "cellMetadata": {
89 | "byteLimit": 2048000,
90 | "rowLimit": 10000
91 | },
92 | "inputWidgets": {},
93 | "nuid": "d2c29bd7-d881-4d82-8f48-e103887cbd9f",
94 | "showTitle": false,
95 | "tableResultSettingsMap": {},
96 | "title": ""
97 | }
98 | },
99 | "outputs": [],
100 | "source": [
101 | "%sh ls -l /home/spark-c799a53d-6b9f-4442-ba04-aa"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {
107 | "application/vnd.databricks.v1+cell": {
108 | "cellMetadata": {},
109 | "inputWidgets": {},
110 | "nuid": "0defa698-799c-4112-a1a5-dd77e86cbff6",
111 | "showTitle": false,
112 | "tableResultSettingsMap": {},
113 | "title": ""
114 | }
115 | },
116 | "source": [
117 | "###Lets learn %fs magic command"
118 | ]
119 | }
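,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal illustrative sketch (added): list the DBFS root with the %fs magic command"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"%fs ls /"
]
}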
120 | ],
121 | "metadata": {
122 | "application/vnd.databricks.v1+notebook": {
123 | "computePreferences": null,
124 | "dashboards": [],
125 | "environmentMetadata": {
126 | "base_environment": "",
127 | "environment_version": "4"
128 | },
129 | "inputWidgetPreferences": null,
130 | "language": "python",
131 | "notebookMetadata": {
132 | "mostRecentlyExecutedCommandWithImplicitDF": {
133 | "commandId": 7789511576367464,
134 | "dataframes": [
135 | "_sqldf"
136 | ]
137 | },
138 | "pythonIndentUnit": 4
139 | },
140 | "notebookName": "my_first_notebook",
141 | "widgets": {}
142 | },
143 | "language_info": {
144 | "name": "python"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 0
149 | }
150 |
--------------------------------------------------------------------------------
/databricks_workouts_2025_WE47/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "d6e0adc4-d54e-4015-ac37-1ed3c539799c",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | "### Notebook to create and load data into databricks volume"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 0,
22 | "metadata": {
23 | "application/vnd.databricks.v1+cell": {
24 | "cellMetadata": {
25 | "byteLimit": 2048000,
26 | "implicitDf": true,
27 | "rowLimit": 10000
28 | },
29 | "inputWidgets": {},
30 | "nuid": "19d46640-79da-457b-87c6-1235f27cfbac",
31 | "showTitle": false,
32 | "tableResultSettingsMap": {},
33 | "title": ""
34 | }
35 | },
36 | "outputs": [],
37 | "source": [
38 | "%sql\n",
39 | "CREATE VOLUME IF NOT EXISTS workspace.default.mobile_metrics;"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 0,
45 | "metadata": {
46 | "application/vnd.databricks.v1+cell": {
47 | "cellMetadata": {
48 | "byteLimit": 2048000,
49 | "rowLimit": 10000
50 | },
51 | "inputWidgets": {},
52 | "nuid": "07ad79a4-1066-4181-9f5f-0121d09dce83",
53 | "showTitle": false,
54 | "tableResultSettingsMap": {},
55 | "title": ""
56 | }
57 | },
58 | "outputs": [],
59 | "source": [
60 | "import requests\n",
61 | "response = requests.get(\"https://public.tableau.com/app/sample-data/mobile_os_usage.csv\")\n",
62 | "dbutils.fs.put(\"/Volumes/workspace/default/mobile_metrics/mobile_os_usage.csv\", response.text, overwrite=True)\n"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 0,
68 | "metadata": {
69 | "application/vnd.databricks.v1+cell": {
70 | "cellMetadata": {
71 | "byteLimit": 2048000,
72 | "rowLimit": 10000
73 | },
74 | "inputWidgets": {},
75 | "nuid": "96dec063-50b9-489d-a43b-eed5ec938643",
76 | "showTitle": false,
77 | "tableResultSettingsMap": {},
78 | "title": ""
79 | }
80 | },
81 | "outputs": [],
82 | "source": [
83 | "%fs\n",
84 | "ls /Volumes/workspace/default/volume1/mobile_os_usage.csv"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 0,
90 | "metadata": {
91 | "application/vnd.databricks.v1+cell": {
92 | "cellMetadata": {
93 | "byteLimit": 2048000,
94 | "rowLimit": 10000
95 | },
96 | "inputWidgets": {},
97 | "nuid": "9f4f28f6-62e3-434e-bfc5-b23c5675a6dc",
98 | "showTitle": false,
99 | "tableResultSettingsMap": {},
100 | "title": ""
101 | }
102 | },
103 | "outputs": [],
104 | "source": [
105 | "%fs head /Volumes/workspace/default/volume1/mobile_os_usage.csv"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 0,
111 | "metadata": {
112 | "application/vnd.databricks.v1+cell": {
113 | "cellMetadata": {
114 | "byteLimit": 2048000,
115 | "rowLimit": 10000
116 | },
117 | "inputWidgets": {},
118 | "nuid": "ce526cca-6cd6-423d-b37d-841576bd5c25",
119 | "showTitle": false,
120 | "tableResultSettingsMap": {},
121 | "title": ""
122 | }
123 | },
124 | "outputs": [],
125 | "source": [
126 | "spark.read.csv(\"/Volumes/workspace/default/volume1/mobile_os_usage.csv\").write.saveAsTable(\"mobile_os_usage\")"
127 | ]
128 | }
129 | ],
130 | "metadata": {
131 | "application/vnd.databricks.v1+notebook": {
132 | "computePreferences": {
133 | "hardware": {
134 | "accelerator": null,
135 | "gpuPoolId": null,
136 | "memory": null
137 | }
138 | },
139 | "dashboards": [],
140 | "environmentMetadata": {
141 | "base_environment": "",
142 | "environment_version": "4"
143 | },
144 | "inputWidgetPreferences": null,
145 | "language": "python",
146 | "notebookMetadata": {
147 | "mostRecentlyExecutedCommandWithImplicitDF": {
148 | "commandId": 7900721791748484,
149 | "dataframes": [
150 | "_sqldf"
151 | ]
152 | },
153 | "pythonIndentUnit": 4
154 | },
155 | "notebookName": "4_child_nb_dataload",
156 | "widgets": {}
157 | },
158 | "language_info": {
159 | "name": "python"
160 | }
161 | },
162 | "nbformat": 4,
163 | "nbformat_minor": 0
164 | }
165 |
--------------------------------------------------------------------------------
/databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "d6e0adc4-d54e-4015-ac37-1ed3c539799c",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | "### Notebook to create and load data into databricks volume"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 0,
22 | "metadata": {
23 | "application/vnd.databricks.v1+cell": {
24 | "cellMetadata": {
25 | "byteLimit": 2048000,
26 | "implicitDf": true,
27 | "rowLimit": 10000
28 | },
29 | "inputWidgets": {},
30 | "nuid": "19d46640-79da-457b-87c6-1235f27cfbac",
31 | "showTitle": false,
32 | "tableResultSettingsMap": {},
33 | "title": ""
34 | }
35 | },
36 | "outputs": [],
37 | "source": [
38 | "%sql\n",
39 | "CREATE VOLUME IF NOT EXISTS workspace.default.mobile_metrics;"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 0,
45 | "metadata": {
46 | "application/vnd.databricks.v1+cell": {
47 | "cellMetadata": {
48 | "byteLimit": 2048000,
49 | "rowLimit": 10000
50 | },
51 | "inputWidgets": {},
52 | "nuid": "07ad79a4-1066-4181-9f5f-0121d09dce83",
53 | "showTitle": false,
54 | "tableResultSettingsMap": {},
55 | "title": ""
56 | }
57 | },
58 | "outputs": [],
59 | "source": [
60 | "import requests\n",
61 | "response = requests.get(\"https://public.tableau.com/app/sample-data/mobile_os_usage.csv\")\n",
62 | "dbutils.fs.put(\"/Volumes/workspace/default/mobile_metrics/mobile_os_usage.csv\", response.text, overwrite=True)\n"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 0,
68 | "metadata": {
69 | "application/vnd.databricks.v1+cell": {
70 | "cellMetadata": {
71 | "byteLimit": 2048000,
72 | "rowLimit": 10000
73 | },
74 | "inputWidgets": {},
75 | "nuid": "96dec063-50b9-489d-a43b-eed5ec938643",
76 | "showTitle": false,
77 | "tableResultSettingsMap": {},
78 | "title": ""
79 | }
80 | },
81 | "outputs": [],
82 | "source": [
83 | "%fs\n",
84 | "ls /Volumes/workspace/default/volume1/mobile_os_usage.csv"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 0,
90 | "metadata": {
91 | "application/vnd.databricks.v1+cell": {
92 | "cellMetadata": {
93 | "byteLimit": 2048000,
94 | "rowLimit": 10000
95 | },
96 | "inputWidgets": {},
97 | "nuid": "9f4f28f6-62e3-434e-bfc5-b23c5675a6dc",
98 | "showTitle": false,
99 | "tableResultSettingsMap": {},
100 | "title": ""
101 | }
102 | },
103 | "outputs": [],
104 | "source": [
105 | "%fs head /Volumes/workspace/default/volume1/mobile_os_usage.csv"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 0,
111 | "metadata": {
112 | "application/vnd.databricks.v1+cell": {
113 | "cellMetadata": {
114 | "byteLimit": 2048000,
115 | "rowLimit": 10000
116 | },
117 | "inputWidgets": {},
118 | "nuid": "5e796d50-f0ba-427f-9485-63bd3ef375bc",
119 | "showTitle": false,
120 | "tableResultSettingsMap": {},
121 | "title": ""
122 | }
123 | },
124 | "outputs": [],
125 | "source": []
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 0,
130 | "metadata": {
131 | "application/vnd.databricks.v1+cell": {
132 | "cellMetadata": {
133 | "byteLimit": 2048000,
134 | "rowLimit": 10000
135 | },
136 | "inputWidgets": {},
137 | "nuid": "ce526cca-6cd6-423d-b37d-841576bd5c25",
138 | "showTitle": false,
139 | "tableResultSettingsMap": {},
140 | "title": ""
141 | }
142 | },
143 | "outputs": [],
144 | "source": [
145 | "spark.read.csv(\"/Volumes/workspace/default/volume1/mobile_os_usage.csv\").write.saveAsTable(\"mobile_os_usage\")"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 0,
151 | "metadata": {
152 | "application/vnd.databricks.v1+cell": {
153 | "cellMetadata": {
154 | "byteLimit": 2048000,
155 | "rowLimit": 10000
156 | },
157 | "inputWidgets": {},
158 | "nuid": "ec47633a-0cdc-4180-8cf5-a795a51137b9",
159 | "showTitle": false,
160 | "tableResultSettingsMap": {},
161 | "title": ""
162 | }
163 | },
164 | "outputs": [],
165 | "source": [
166 | "%matplotlib inline\n",
167 | "import pandas as pd\n",
168 | "df = pd.read_csv(\"/Volumes/workspace/default/volume1/mobile_os_usage.csv\")\n",
169 | "df.plot(kind=\"bar\", x=df.columns[0])\n"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {
175 | "application/vnd.databricks.v1+cell": {
176 | "cellMetadata": {},
177 | "inputWidgets": {},
178 | "nuid": "67c2a369-637e-457e-9e61-1193ed81182c",
179 | "showTitle": false,
180 | "tableResultSettingsMap": {},
181 | "title": ""
182 | }
183 | },
184 | "source": [
185 | "https://web.s-cdn.boostkit.dev/webaction-files/5ac62a0f22728e050851fc87_our_faculty/face-67f16daa4404199c78d2e38b.jpg\n",
186 | ""
187 | ]
188 | }
189 | ],
190 | "metadata": {
191 | "application/vnd.databricks.v1+notebook": {
192 | "computePreferences": {
193 | "hardware": {
194 | "accelerator": null,
195 | "gpuPoolId": null,
196 | "memory": null
197 | }
198 | },
199 | "dashboards": [],
200 | "environmentMetadata": {
201 | "base_environment": "",
202 | "environment_version": "4"
203 | },
204 | "inputWidgetPreferences": null,
205 | "language": "python",
206 | "notebookMetadata": {
207 | "mostRecentlyExecutedCommandWithImplicitDF": {
208 | "commandId": 7900721791748484,
209 | "dataframes": [
210 | "_sqldf"
211 | ]
212 | },
213 | "pythonIndentUnit": 4
214 | },
215 | "notebookName": "4_child_nb_dataload",
216 | "widgets": {}
217 | },
218 | "language_info": {
219 | "name": "python"
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 0
224 | }
225 |
--------------------------------------------------------------------------------
/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/3_Notebook_workflow_utils_notebooks_widgets_invoking_passing_params.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC #####1. Display the list of databricks utils
4 |
5 | # COMMAND ----------
6 |
7 | dbutils.widgets.removeAll()
8 |
9 | # COMMAND ----------
10 |
11 | # MAGIC %md
12 | # MAGIC ######The dbutils.help() below lists all available utilities; for now we will concentrate on the notebook, widgets and fs utilities
13 |
14 | # COMMAND ----------
15 |
16 | dbutils.help()
17 | #important utils are
18 | #fs, jobs, notebook, widgets
19 |
20 | # COMMAND ----------
21 |
22 | # MAGIC %md
23 | # MAGIC #####2. Notebook utils help
24 |
25 | # COMMAND ----------
26 |
27 | dbutils.notebook.help()
28 |
29 | # COMMAND ----------
30 |
31 | # MAGIC %md
32 | # MAGIC ###3. FS Commands
33 |
34 | # COMMAND ----------
35 |
36 | dbutils.fs.help()
37 |
38 | # COMMAND ----------
39 |
40 | print("lets learn all fs commands options...")
41 | print("copying")
42 | dbutils.fs.cp("/Volumes/workspace/default/volumewd36/sample_healthcare_patients.csv","/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv")
43 | print("head of 10 rows")
44 | print(dbutils.fs.head("/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv"))
45 | print("listing")
46 | dbutils.fs.ls("/Volumes/workspace/default/volumewd36/")
47 | print("make directory")
48 | dbutils.fs.mkdirs("/Volumes/workspace/default/volumewd36/healthcare/")
49 | print("move")
50 | dbutils.fs.mv("/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv","/Volumes/workspace/default/volumewd36/healthcare/sample_healthcare_patients1.csv")
51 | dbutils.fs.ls("/Volumes/workspace/default/volumewd36/healthcare/")
52 | dbutils.fs.cp("/Volumes/workspace/default/volumewd36/sample_healthcare_patients.csv","/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv")
53 | print("put to write some data into a file")
54 |
55 | # COMMAND ----------
56 |
57 | print("try below command without the 3rd argument of true, you will find the dbfs-> hadoop -> spark -> s3 bucket")
58 | #dbutils.fs.put("dbfs:///Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv","put something",False)
59 | print(dbutils.fs.head("/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv"))
60 | dbutils.fs.put("dbfs:///Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv","put something",True)
61 | print("see the data in the file")
62 | print(dbutils.fs.head("/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv"))
63 | dbutils.fs.rm("/Volumes/workspace/default/volumewd36/healthcare/sample_healthcare_patients1.csv")
64 |
65 | # COMMAND ----------
66 |
67 | # MAGIC %md
68 | # MAGIC #####4. Widgets utils help
69 |
70 | # COMMAND ----------
71 |
72 | dbutils.widgets.help()
73 |
74 | # COMMAND ----------
75 |
76 | # MAGIC %md
77 | # MAGIC ###The widgets utility is used to add components/widgets to our notebook to create
78 | # MAGIC dynamic/parameterized notebooks
79 |
80 | # COMMAND ----------
81 |
82 | print("can you create a textbox widget")
83 | dbutils.widgets.text("tablename","cities","enter the tablename to query")
84 |
85 | # COMMAND ----------
86 |
87 | print("can you get the value of the widget using dbutils.widgets.get and store into a local python variable tblname")
88 | tblname=dbutils.widgets.get("tablename")
89 | print("user passed the value of ?",tblname)
90 |
91 | # COMMAND ----------
92 |
93 | #Implemented dynamic SQL usecase in Databricks
94 | display(spark.sql(f"select * from default.{tblname} limit 10"))
95 |
96 | # COMMAND ----------
97 |
98 | dbutils.widgets.removeAll()
99 |
100 | # COMMAND ----------
101 |
102 | dbutils.widgets.help()
103 |
104 | # COMMAND ----------
105 |
106 | dbutils.widgets.dropdown("dropdown_widget","Senthil",["Senthil","Balaji","Arun"],"Select your name")
107 | aspirant_name_chosen=dbutils.widgets.get("dropdown_widget")
108 | print("Good morning",aspirant_name_chosen)
109 |
110 | # COMMAND ----------
111 |
112 | dbutils.widgets.multiselect("multiselect_widget","wd36",["wd32","we43","we45","wd36"],"Select your team name")
113 | all_batches=dbutils.widgets.get("multiselect_widget")
114 | all_batches_lst=all_batches.split(",")
115 | for i in all_batches_lst:
116 | print(f"hello team {i}")
117 | #print("You have chosen the team name as",all_batches)
118 |
119 | # COMMAND ----------
120 |
121 | #Interview question- how to access some value from the given string
122 | fullname="mohamed kader irfan"
123 | fname=fullname.split(" ")[0]
124 | lname=fullname.split(" ")[-1]
125 | print(fname, 'and', lname)
126 |
127 | # COMMAND ----------
128 |
129 | dbutils.widgets.combobox("combobox_widget","wd36",["wd32","we43","we45","wd36"],"Select your team name")
130 | combobox_value=dbutils.widgets.get("combobox_widget")
131 | print("Good morning",combobox_value)
132 |
133 | # COMMAND ----------
134 |
135 | dbutils.widgets.text("team_name","WD36","This is to represent our team name")
136 |
137 | # COMMAND ----------
138 |
139 | text_box_value1=dbutils.widgets.get("team_name")
140 | print("Good Morning ",text_box_value1)
141 |
142 | # COMMAND ----------
143 |
144 | dbutils.widgets.dropdown("listbox","wd36",["wd32","we43","we45","wd36"],"Team names drop down")
145 | listbox_value2=dbutils.widgets.get("listbox")
146 | print("Good morning",listbox_value2)
147 |
148 | # COMMAND ----------
149 |
150 | dbutils.widgets.combobox("combobox","we47",["wd32","we43","we45","we47"],"Team names combo box")
151 |
152 | # COMMAND ----------
153 |
154 | dbutils.widgets.multiselect("multiselect","wd36",["wd32","we43","we45","wd36"],"Team names multiselect")
155 |
156 | # COMMAND ----------
157 |
158 | dict_all_widgets=dbutils.widgets.getAll()
159 | print(dict_all_widgets)
160 |
161 | # COMMAND ----------
162 |
163 | # MAGIC %md
164 | # MAGIC #####4. Calling a child notebook (example_child_notebook.ipynb) from this parent notebook with parameters
165 | # MAGIC dbutils.widgets.text("param1", "default_value", "Your input parameter")
166 | # MAGIC param_value = dbutils.widgets.get("param1")
167 | # MAGIC print("printing the parameters",param_value)
168 |
169 | # COMMAND ----------
170 |
171 | child_return_value=dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook", 180,{"table_name":"cities1"})
172 |
173 | # COMMAND ----------
174 |
175 | if True:
176 | dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",600)
177 | else:
178 | dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",300)
179 |
180 | # COMMAND ----------
181 |
182 | import time
183 | for i in range(13):
184 | dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",300)
185 | time.sleep(10)
186 |
187 | # COMMAND ----------
188 |
189 | dbutils.widgets.removeAll()
190 |
--------------------------------------------------------------------------------
/databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/3_Notebook_workflow_utils_notebooks_widgets_invoking_passing_params.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | dbutils.widgets.removeAll()
3 |
4 | # COMMAND ----------
5 |
6 | # MAGIC %md
7 | # MAGIC #We are going to learn usage of dbutils (DB Utilities...) + widgets (interesting dbutil)
8 |
9 | # COMMAND ----------
10 |
11 | # MAGIC %fs ls
12 |
13 | # COMMAND ----------
14 |
15 | # MAGIC %md
16 | # MAGIC #####1. Display the list of databricks utils
17 | # MAGIC ######The dbutils.help() below lists all available utilities; for now we will concentrate on the notebook, widgets and fs utilities
18 |
19 | # COMMAND ----------
20 |
21 | dbutils.help()
22 | #Some of the important utils...
23 | #fs, notebook, widgets, secrets (security management)
24 |
25 | # COMMAND ----------
26 |
27 | # MAGIC %md
28 | # MAGIC #####2. Notebook's particular utils help
29 |
30 | # COMMAND ----------
31 |
32 | dbutils.notebook.help()
33 |
34 | # COMMAND ----------
35 |
36 | # MAGIC %md
37 | # MAGIC #####3. Widgets utils help
38 |
39 | # COMMAND ----------
40 |
41 | dbutils.widgets.help()
42 | #4 Important widgets
43 | #combobox, dropdown, text, multiselect
44 |
45 | # COMMAND ----------
46 |
47 | # MAGIC %md
48 | # MAGIC #####4. Let's create all those widgets/plugins/components, attach to this notebook, capture the widget content and make use of it...
49 |
50 | # COMMAND ----------
51 |
52 | dbutils.widgets.removeAll()
53 |
54 | # COMMAND ----------
55 |
56 | # MAGIC %md
57 | # MAGIC ######A. Text widget
58 |
59 | # COMMAND ----------
60 |
61 | #creating and attaching a widget (simple and important widget)
62 | dbutils.widgets.text("aspirant_name","Thilaga","enter our aspirant name to wish")
63 |
64 | # COMMAND ----------
65 |
66 | #capture the widget input in a variable
67 | name_of_aspirant=dbutils.widgets.get("aspirant_name")
68 | #use that variable for some purpose
69 | print(f"Congratulations!!! {name_of_aspirant}")
70 |
71 | # COMMAND ----------
72 |
73 | # MAGIC %md
74 | # MAGIC ######B. Dropdown widget
75 |
76 | # COMMAND ----------
77 |
78 | dbutils.widgets.dropdown("aspirant_gender","Female",["Male","Female"])
79 | gender=dbutils.widgets.get("aspirant_gender")
80 | print(f"Gender of aspirant is {gender}")
81 |
82 | # COMMAND ----------
83 |
84 | # MAGIC %md
85 | # MAGIC ######C. Combobox widget - Used to choose only one value from the dropdown by searching
86 |
87 | # COMMAND ----------
88 |
89 | dbutils.widgets.combobox("aspirant_country_combo","India",["India","USA","UK","Canada","Australia"])
90 | country=dbutils.widgets.get("aspirant_country_combo")
91 | print(f"Country of aspirant is {country}")
92 |
93 | # COMMAND ----------
94 |
95 | # MAGIC %md
96 | # MAGIC ######D. Multiselect widget - Used to choose multiple values from the dropdown by searching
97 |
98 | # COMMAND ----------
99 |
100 | dbutils.widgets.multiselect("aspirant_hobbies_multiselect","Dance",["Dance","Music","Sports","Reading","Writing"])
101 | hobbies=dbutils.widgets.get("aspirant_hobbies_multiselect")
102 | print(f"Hobbies of aspirant are {hobbies}",type(hobbies))
103 | print("Top and Least hobbies ?", hobbies.split(",")[0],hobbies.split(",")[-1])
104 |
105 | # COMMAND ----------
106 |
107 | all_widgets=dbutils.widgets.getAll()
108 | print(all_widgets)
109 |
110 | # COMMAND ----------
111 |
112 | # MAGIC %md
113 | # MAGIC #####5. Dynamic SQL usecase to try on dropdown widget?
114 | # MAGIC 1. Collect the list of tables present in the catalog/schema
115 | # MAGIC 2. Substitute them into a dropdown widget
116 | # MAGIC 3. Allow the user to choose a table and execute a query that returns the total number of rows in the chosen table.
117 | # MAGIC 4. How would you explain this in an interview? (a sketch of steps 1-3 is appended at the end of this notebook)
118 | # MAGIC ..............................................................................................................
119 | # MAGIC
120 |
121 | # COMMAND ----------
122 |
123 | # MAGIC %md
124 | # MAGIC #####6. DBUtils FS Commands - For doing DBFS operations
125 |
126 | # COMMAND ----------
127 |
128 | dbutils.fs.help()
129 | #cp(from: String, to: String, recurse: boolean = false): boolean -> Copies a file or directory, possibly across FileSystems
130 | #head(file: String, maxBytes: int = 65536): String -> Returns up to the first 'maxBytes' bytes of the given file as a String encoded in UTF-8
131 | #ls(dir: String): Seq -> Lists the contents of a directory
132 | #mkdirs(dir: String): boolean -> Creates the given directory if it does not exist, also creating any necessary parent directories
133 | #mv(from: String, to: String, recurse: boolean = false): boolean -> Moves a file or directory, possibly across FileSystems
134 | #put(file: String, contents: String, overwrite: boolean = false): boolean -> Writes the given String out to a file, encoded in UTF-8
135 | #rm(dir: String, recurse: boolean = false): boolean -> Removes a file or directory
136 |
137 |
138 | # COMMAND ----------
139 |
140 | dbutils.fs.mkdirs("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1")
141 | data="hello team"
142 | dbutils.fs.put("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt",data,True)
143 | dbutils.fs.ls("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1")
144 | print(dbutils.fs.head("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt",5))#Want to see the top 5 bytes of data
145 | dbutils.fs.cp("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt","dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt_copy2.csv")
146 | dbutils.fs.mv("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt_copy2.csv","dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt_moved.csv")
147 | dbutils.fs.rm("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt")
148 |
149 | # COMMAND ----------
150 |
151 | # MAGIC %md
152 | # MAGIC #####7. Calling a child notebook (example_child_notebook.ipynb) from this parent notebook with parameters
153 | # MAGIC dbutils.widgets.text("param1", "default_value", "Your input parameter")
154 | # MAGIC param_value = dbutils.widgets.get("param1")
155 | # MAGIC print("printing the parameters",param_value)
156 |
157 | # COMMAND ----------
158 |
159 | # MAGIC %md
160 | # MAGIC #####Important interview question: Difference between the %run magic and the dbutils.notebook.run() command?
161 | # MAGIC A. The %run magic command runs another notebook inline in the current notebook itself, so we don't have to rewrite that notebook's code; we can just run it...
162 | # MAGIC B. The dbutils.notebook.run() command triggers the other notebook as a separate run, and we can pass additional arguments such as a timeout in seconds and custom parameters for its widgets...
163 |
164 | # COMMAND ----------
165 |
166 | # MAGIC %run "/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook"
167 |
168 | # COMMAND ----------
169 |
170 | return_status=dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",90,{"table_name":"cust"})
171 | print("child notebook ",return_status)
172 |
173 | # COMMAND ----------
174 |
175 | #####Interview question...
176 | fullname="inceptez technologies"
177 | print(fullname)
178 | fullname_lst=fullname.split(" ")
179 | print(fullname_lst)
180 | fname=fullname.split(" ")[0]
181 | lname=fullname.split(" ")[-1]
182 | print(fname,lname)
183 |
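184 | # COMMAND ----------
185 | 
186 | # MAGIC %md
187 | # MAGIC #####Appendix: a hedged sketch of the section 5 dynamic SQL usecase (the workspace.default catalog/schema is an assumption)
188 | 
189 | # COMMAND ----------
190 | 
191 | #Step 1: collect the list of tables present in the chosen catalog/schema
192 | tables = [row.tableName for row in spark.sql("show tables in workspace.default").collect()]
193 | #Step 2: substitute the list into a dropdown widget
194 | dbutils.widgets.dropdown("table_to_count", tables[0], tables, "Choose a table to count")
195 | #Step 3: let the user choose a table and return the total number of rows in it
196 | chosen_table = dbutils.widgets.get("table_to_count")
197 | display(spark.sql(f"select count(*) as total_rows from workspace.default.{chosen_table}"))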
--------------------------------------------------------------------------------
/databricks_workouts_2025/2_Spark_DataFrame_Read_Write_Operations/2-Advanced-Readops.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "4a070bc4-17e0-4059-97ef-0e29d09c8cf3",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | "### 1. Options for handling quotes & Escape\n",
17 | "\n",
18 | "id,name,remarks\n",
19 | "1,'Ramesh, K.P','Good performer'\n",
20 | "2,'Manoj','Needs ~'special~' attention'"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {
26 | "application/vnd.databricks.v1+cell": {
27 | "cellMetadata": {},
28 | "inputWidgets": {},
29 | "nuid": "2878e608-057e-4b0d-a33d-3ad862025f23",
30 | "showTitle": false,
31 | "tableResultSettingsMap": {},
32 | "title": ""
33 | }
34 | },
35 | "source": [
36 | "### 2. Comments, Multi line, leading and trailing whitespace handling, null and nan handling"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {
42 | "application/vnd.databricks.v1+cell": {
43 | "cellMetadata": {},
44 | "inputWidgets": {},
45 | "nuid": "7a395acd-7233-49e8-806d-800d56de7195",
46 | "showTitle": false,
47 | "tableResultSettingsMap": {},
48 | "title": ""
49 | }
50 | },
51 | "source": [
52 | "### 3. Read modes in csv"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "application/vnd.databricks.v1+cell": {
59 | "cellMetadata": {},
60 | "inputWidgets": {},
61 | "nuid": "63c96d50-7254-47d0-9b35-76c8c44bb49d",
62 | "showTitle": false,
63 | "tableResultSettingsMap": {},
64 | "title": ""
65 | }
66 | },
67 | "source": [
68 | "### There are 3 typical read modes and the default read mode is permissive.\n",
69 | "##### 1. permissive — All fields are set to null and corrupted records are placed in a string column called _corrupt_record\n",
70 | "##### \t2. dropMalformed — Drops all rows containing corrupt records.\n",
71 | "##### 3. failFast — Fails when corrupt records are encountered."
72 | ]
73 | },
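{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"#A hedged sketch (the file path is an assumption) of the three read modes described above\n",
"src = \"/Volumes/workspace/wd36schema/ingestion_volume/source/corrupt_sample.csv\"\n",
"spark.read.option(\"header\", True).option(\"mode\", \"PERMISSIVE\").csv(src).show(2)\n",
"spark.read.option(\"header\", True).option(\"mode\", \"DROPMALFORMED\").csv(src).show(2)\n",
"spark.read.option(\"header\", True).option(\"mode\", \"FAILFAST\").csv(src).show(2)"
]
},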
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {
77 | "application/vnd.databricks.v1+cell": {
78 | "cellMetadata": {},
79 | "inputWidgets": {},
80 | "nuid": "b570db0d-c137-41f4-a6a6-43a82eb56d06",
81 | "showTitle": false,
82 | "tableResultSettingsMap": {},
83 | "title": ""
84 | }
85 | },
86 | "source": [
87 | "####4. Max advanced features used...\n",
88 | "\\#This is a commented line and should be ignored\n",
89 | "\"ID\",\"Name\",\"Age\",\"Salary\",\"JoinDate\",\"LastLogin\",\"Notes\"\n",
90 | "1,\"John Doe\",28,45000.50,01-2025-25,2024-01-25 10:15:45,\"New employee\"\n",
91 | "2,\"Jane, Smith\",32,55000.00,2023-12-30,2024-01-25 14:05:10\n",
92 | "3,\"Ravi Kumar\",-1,67000.75,2023-11-05,2024-02-01 08:30:00,\"Null age\",\"addon cols\"\n",
93 | "4,\"李小龍\",45,88000.00,2022-05-18,2024-01-19 13:45:22,\"UTF-8 Chinese name\"\n",
94 | "5,\"Carlos \\\"The Boss\\\" Pérez\",38,72000.30,2023-02-11,2024-01-28 09:55:05,\"Contains quotes\"\n",
95 | "6,\"Manoj\",29,50000,2024-02-10,2024-02-10 17:25:55,\"Line\n",
96 | "break\n",
97 | "inside notes\"\n",
98 | "7,\"Anita\",41,na,2023-10-08,2024-02-02 11:11:11,\"Salary is NaN\"\n",
99 | "8,\"Robert\",34,47000.20,2023-06-22,2024-01-27 18:40:40, \"Leading and trailing spaces\" \n",
100 | "9,\"\",30,39000.00,2023-09-19,2024-01-26 16:20:20,\"Empty name field\"\n",
101 | "10,\"#NotAComment\",37,51000.10,02-2025-25,2024-02-03 12:55:30,\"Starts with # but not a comment\""
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {
107 | "application/vnd.databricks.v1+cell": {
108 | "cellMetadata": {},
109 | "inputWidgets": {},
110 | "nuid": "c793a1ba-d061-45d9-852d-88a5598cd125",
111 | "showTitle": false,
112 | "tableResultSettingsMap": {},
113 | "title": ""
114 | }
115 | },
116 | "source": [
117 | "####5. Reading data from other formats (Try the below usecases after completing the 3-Basic-WriteOps)"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {
123 | "application/vnd.databricks.v1+cell": {
124 | "cellMetadata": {},
125 | "inputWidgets": {},
126 | "nuid": "1c69af82-91aa-4854-b6ce-b1eb57c70629",
127 | "showTitle": false,
128 | "tableResultSettingsMap": {},
129 | "title": ""
130 | }
131 | },
132 | "source": [
133 | "####1. Reading csv data"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 0,
139 | "metadata": {
140 | "application/vnd.databricks.v1+cell": {
141 | "cellMetadata": {
142 | "byteLimit": 2048000,
143 | "rowLimit": 10000
144 | },
145 | "inputWidgets": {},
146 | "nuid": "6f4039ed-9dec-418d-b54b-a2fcfbd0000a",
147 | "showTitle": false,
148 | "tableResultSettingsMap": {},
149 | "title": ""
150 | }
151 | },
152 | "outputs": [],
153 | "source": [
154 | "spark.read.csv(\"/Volumes/workspace/wd36schema/ingestion_volume/target/csvout\").show(2)"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {
160 | "application/vnd.databricks.v1+cell": {
161 | "cellMetadata": {},
162 | "inputWidgets": {},
163 | "nuid": "748f2729-c766-4363-ab86-b4d73205c76c",
164 | "showTitle": false,
165 | "tableResultSettingsMap": {},
166 | "title": ""
167 | }
168 | },
169 | "source": [
170 | "####2. Reading json data"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 0,
176 | "metadata": {
177 | "application/vnd.databricks.v1+cell": {
178 | "cellMetadata": {
179 | "byteLimit": 2048000,
180 | "rowLimit": 10000
181 | },
182 | "inputWidgets": {},
183 | "nuid": "a9f1957d-69c5-4d4e-b452-99bb5683cc24",
184 | "showTitle": false,
185 | "tableResultSettingsMap": {},
186 | "title": ""
187 | }
188 | },
189 | "outputs": [],
190 | "source": [
191 | "spark.read.json(\"/Volumes/workspace/wd36schema/ingestion_volume/target/jsonout\").show(2)"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {
197 | "application/vnd.databricks.v1+cell": {
198 | "cellMetadata": {},
199 | "inputWidgets": {},
200 | "nuid": "e5df82fe-767f-4ab1-a316-15e9a3e57f38",
201 | "showTitle": false,
202 | "tableResultSettingsMap": {},
203 | "title": ""
204 | }
205 | },
206 | "source": [
207 | "####3. Reading xml data"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 0,
213 | "metadata": {
214 | "application/vnd.databricks.v1+cell": {
215 | "cellMetadata": {
216 | "byteLimit": 2048000,
217 | "rowLimit": 10000
218 | },
219 | "inputWidgets": {},
220 | "nuid": "b35a6186-f61a-4c61-9f5f-c5f303fe6b7c",
221 | "showTitle": false,
222 | "tableResultSettingsMap": {},
223 | "title": ""
224 | }
225 | },
226 | "outputs": [],
227 | "source": [
228 | "spark.read.xml(\"/Volumes/workspace/wd36schema/ingestion_volume/target/xmlout\",rowTag=\"cust\").show(2)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {
234 | "application/vnd.databricks.v1+cell": {
235 | "cellMetadata": {},
236 | "inputWidgets": {},
237 | "nuid": "e26a3f08-51ab-4b48-89e8-0d077a3becd9",
238 | "showTitle": false,
239 | "tableResultSettingsMap": {},
240 | "title": ""
241 | }
242 | },
243 | "source": [
244 | "####4. Reading serialized data (orc/parquet/delta)"
245 | ]
246 | },
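{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"#A hedged sketch (the output paths are assumptions, mirroring the csv/json paths above) of reading serialized formats\n",
"spark.read.parquet(\"/Volumes/workspace/wd36schema/ingestion_volume/target/parquetout\").show(2)\n",
"spark.read.orc(\"/Volumes/workspace/wd36schema/ingestion_volume/target/orcout\").show(2)\n",
"spark.read.format(\"delta\").load(\"/Volumes/workspace/wd36schema/ingestion_volume/target/deltaout\").show(2)"
]
},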
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {
250 | "application/vnd.databricks.v1+cell": {
251 | "cellMetadata": {},
252 | "inputWidgets": {},
253 | "nuid": "2d1d6b33-8169-4811-8acd-c94d1e463da8",
254 | "showTitle": false,
255 | "tableResultSettingsMap": {},
256 | "title": ""
257 | }
258 | },
259 | "source": [
260 | "####5. Reading delta/hive table data"
261 | ]
262 | }
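,
{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"#A hedged sketch (the table name is an assumption) of reading a delta/hive table registered in the metastore\n",
"spark.read.table(\"default.mobile_os_usage\").show(2)\n",
"spark.sql(\"select * from default.mobile_os_usage limit 2\").show()"
]
}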
263 | ],
264 | "metadata": {
265 | "application/vnd.databricks.v1+notebook": {
266 | "computePreferences": null,
267 | "dashboards": [],
268 | "environmentMetadata": {
269 | "base_environment": "",
270 | "environment_version": "3"
271 | },
272 | "inputWidgetPreferences": null,
273 | "language": "python",
274 | "notebookMetadata": {
275 | "pythonIndentUnit": 4
276 | },
277 | "notebookName": "2-Advanced-Readops",
278 | "widgets": {}
279 | },
280 | "language_info": {
281 | "name": "python"
282 | }
283 | },
284 | "nbformat": 4,
285 | "nbformat_minor": 0
286 | }
287 |
--------------------------------------------------------------------------------
/databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/2_Explore_Notebook_Markdowns.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "ea376574-7777-4592-936b-38eb25b6e1d9",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | ""
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "application/vnd.databricks.v1+cell": {
23 | "cellMetadata": {},
24 | "inputWidgets": {},
25 | "nuid": "96aa1a34-733e-4d2f-b705-1e076af575eb",
26 | "showTitle": false,
27 | "tableResultSettingsMap": {},
28 | "title": ""
29 | }
30 | },
31 | "source": [
32 | "#1. Basics of Python Programing"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {
38 | "application/vnd.databricks.v1+cell": {
39 | "cellMetadata": {},
40 | "inputWidgets": {},
41 | "nuid": "523792e4-d5c5-4831-9694-7243855d6ead",
42 | "showTitle": false,
43 | "tableResultSettingsMap": {},
44 | "title": ""
45 | }
46 | },
47 | "source": [
48 | "##A. Python is an indent based programming language\n",
49 | "Why Python uses indend based programing ->\n",
50 | "1. Managing the program more efficiently\n",
51 | "2. Better Readablility of the code\n",
52 | "3. For creating the hierarchy of programming.\n",
53 | "4. By default 4 spaces we will give for indends, but more/less spaces or tabs also can be used..."
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 0,
59 | "metadata": {
60 | "application/vnd.databricks.v1+cell": {
61 | "cellMetadata": {},
62 | "inputWidgets": {},
63 | "nuid": "289c8a44-dee8-4046-9f80-a34ba4707f9b",
64 | "showTitle": false,
65 | "tableResultSettingsMap": {},
66 | "title": ""
67 | }
68 | },
69 | "outputs": [],
70 | "source": [
71 | "%python\n",
72 | "aspirants_list=['Jeeva','Bharathi','Vaanmathy','Nag']\n",
73 | "for aspirants in aspirants_list:\n",
74 | " print(\"good afternoon \",aspirants)\n",
75 | "print(\"good after all aspirants\")"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {
81 | "application/vnd.databricks.v1+cell": {
82 | "cellMetadata": {},
83 | "inputWidgets": {},
84 | "nuid": "320756df-0857-43ea-817d-a382622f54b2",
85 | "showTitle": false,
86 | "tableResultSettingsMap": {},
87 | "title": ""
88 | }
89 | },
90 | "source": [
91 | "##B. This is a commented line in Python"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 0,
97 | "metadata": {
98 | "application/vnd.databricks.v1+cell": {
99 | "cellMetadata": {},
100 | "inputWidgets": {},
101 | "nuid": "01f6ba35-cb48-45e2-93c7-9d56d31381ef",
102 | "showTitle": false,
103 | "tableResultSettingsMap": {},
104 | "title": ""
105 | }
106 | },
107 | "outputs": [],
108 | "source": [
109 | "%python\n",
110 | "#1. Single line comment - use # in the starting\n",
111 | "'''2.Multi line comment''' \n",
112 | "# - use ''' comment ''' or \"\"\" comment \"\"\""
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {
118 | "application/vnd.databricks.v1+cell": {
119 | "cellMetadata": {
120 | "byteLimit": 2048000,
121 | "rowLimit": 10000
122 | },
123 | "inputWidgets": {},
124 | "nuid": "a7967bfd-6528-4b42-bd8b-8f8f251fba02",
125 | "showTitle": false,
126 | "tableResultSettingsMap": {},
127 | "title": ""
128 | }
129 | },
130 | "source": [
131 | "#Main Heading1 using #
How to do some markdowns design
using the magic command"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {
137 | "application/vnd.databricks.v1+cell": {
138 | "cellMetadata": {
139 | "byteLimit": 2048000,
140 | "rowLimit": 10000
141 | },
142 | "inputWidgets": {},
143 | "nuid": "e8d1dcea-9fc3-45db-86c5-43c19d75710a",
144 | "showTitle": false,
145 | "tableResultSettingsMap": {},
146 | "title": ""
147 | }
148 | },
149 | "source": [
150 | "## Main Heading2 - prefix with \"2#\""
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {
156 | "application/vnd.databricks.v1+cell": {
157 | "cellMetadata": {},
158 | "inputWidgets": {},
159 | "nuid": "86589943-f7b2-476a-b758-1efffeb2a73e",
160 | "showTitle": false,
161 | "tableResultSettingsMap": {},
162 | "title": ""
163 | }
164 | },
165 | "source": [
166 | "### Main Heading3 - prefix with \"3#\""
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "application/vnd.databricks.v1+cell": {
173 | "cellMetadata": {
174 | "byteLimit": 2048000,
175 | "rowLimit": 10000
176 | },
177 | "inputWidgets": {},
178 | "nuid": "7c9fc9b4-2d30-4bb3-8932-01495c8b5786",
179 | "showTitle": false,
180 | "tableResultSettingsMap": {},
181 | "title": ""
182 | }
183 | },
184 | "source": [
185 | "#### Sub Heading1 - prefix with \"max 4#\""
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "application/vnd.databricks.v1+cell": {
192 | "cellMetadata": {
193 | "byteLimit": 2048000,
194 | "rowLimit": 10000
195 | },
196 | "inputWidgets": {},
197 | "nuid": "7b8d623b-f654-4e28-9399-6e1acad1ffc0",
198 | "showTitle": false,
199 | "tableResultSettingsMap": {},
200 | "title": ""
201 | }
202 | },
203 | "source": [
204 | "##### Sub Heading2 - prefix with \"max 5#\""
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {
210 | "application/vnd.databricks.v1+cell": {
211 | "cellMetadata": {
212 | "byteLimit": 2048000,
213 | "rowLimit": 10000
214 | },
215 | "inputWidgets": {},
216 | "nuid": "486d7884-73e3-4a60-9fc2-2ef0a49fc2b4",
217 | "showTitle": false,
218 | "tableResultSettingsMap": {},
219 | "title": ""
220 | }
221 | },
222 | "source": [
223 | "###### Sub Heading3 - prefix with \"max 6#\""
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {
229 | "application/vnd.databricks.v1+cell": {
230 | "cellMetadata": {},
231 | "inputWidgets": {},
232 | "nuid": "8b40e4c3-aed9-437a-86dc-0c62ce3d5751",
233 | "showTitle": false,
234 | "tableResultSettingsMap": {},
235 | "title": ""
236 | }
237 | },
238 | "source": [
239 | "####### Sub Heading3 - prefix with \"max 6#\""
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {
245 | "application/vnd.databricks.v1+cell": {
246 | "cellMetadata": {
247 | "byteLimit": 2048000,
248 | "rowLimit": 10000
249 | },
250 | "inputWidgets": {},
251 | "nuid": "d8bf1fd6-2aac-4e22-8e88-27b54eb3de92",
252 | "showTitle": false,
253 | "tableResultSettingsMap": {},
254 | "title": ""
255 | }
256 | },
257 | "source": [
258 | "######Lets learn about bold\n",
259 | "1. Bold - using html tagging \n",
260 | "2. **Bold** - prefixed and suffixed with **"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {
266 | "application/vnd.databricks.v1+cell": {
267 | "cellMetadata": {
268 | "byteLimit": 2048000,
269 | "rowLimit": 10000
270 | },
271 | "inputWidgets": {},
272 | "nuid": "2ca1c554-d730-451d-9dce-6ee98bb9d136",
273 | "showTitle": false,
274 | "tableResultSettingsMap": {},
275 | "title": ""
276 | }
277 | },
278 | "source": [
279 | "###### Lets learn about Italics\n",
280 | "*Italics* - prefixed and suffixed with *"
281 | ]
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {
286 | "application/vnd.databricks.v1+cell": {
287 | "cellMetadata": {
288 | "byteLimit": 2048000,
289 | "rowLimit": 10000
290 | },
291 | "inputWidgets": {},
292 | "nuid": "ea06f315-b1ab-4e41-b9e0-7b561e633cf9",
293 | "showTitle": false,
294 | "tableResultSettingsMap": {},
295 | "title": ""
296 | }
297 | },
298 | "source": [
299 | "###### Lets learn about bullet points\n",
300 | "\n",
301 | "- bullet points - prefix with -\n",
302 | "- bullet points - prefix with -"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {
308 | "application/vnd.databricks.v1+cell": {
309 | "cellMetadata": {
310 | "byteLimit": 2048000,
311 | "rowLimit": 10000
312 | },
313 | "inputWidgets": {},
314 | "nuid": "6e366d03-811c-4548-9a77-b38ca97d2ac4",
315 | "showTitle": false,
316 | "tableResultSettingsMap": {},
317 | "title": ""
318 | }
319 | },
320 | "source": [
321 | "###### Lets learn about Color codes\n",
322 | "$${\\color{pink}Text}$$\n",
323 | "$${\\color{black}Black-color}$$\n",
324 | "$${\\color{red}Red}$$\n",
325 | "$${\\color{green}Green}$$\n",
326 | "$${\\color{blue}Blue}$$\t"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {
332 | "application/vnd.databricks.v1+cell": {
333 | "cellMetadata": {
334 | "byteLimit": 2048000,
335 | "rowLimit": 10000
336 | },
337 | "inputWidgets": {},
338 | "nuid": "a8a1de18-2d42-447e-a677-210fc56e138e",
339 | "showTitle": false,
340 | "tableResultSettingsMap": {},
341 | "title": ""
342 | }
343 | },
344 | "source": [
345 | "###### Lets learn about Embedding urls\n",
346 | "Click here for [Inceptez Webpage](https://www.inceptez.in/)"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "metadata": {
352 | "application/vnd.databricks.v1+cell": {
353 | "cellMetadata": {
354 | "byteLimit": 2048000,
355 | "rowLimit": 10000
356 | },
357 | "inputWidgets": {},
358 | "nuid": "f88a1ce7-b687-434f-81ed-5dd7ec5c0e6b",
359 | "showTitle": false,
360 | "tableResultSettingsMap": {},
361 | "title": ""
362 | }
363 | },
364 | "source": [
365 | "######To learn markdowns more in detail\n",
366 | "Click here [Microsoft markdown cheatsheet](https://docs.databricks.com/aws/en/notebooks/notebook-media)"
367 | ]
368 | }
369 | ],
370 | "metadata": {
371 | "application/vnd.databricks.v1+notebook": {
372 | "computePreferences": {
373 | "hardware": {
374 | "accelerator": null,
375 | "gpuPoolId": null,
376 | "memory": null
377 | }
378 | },
379 | "dashboards": [],
380 | "environmentMetadata": {
381 | "base_environment": "",
382 | "environment_version": "4"
383 | },
384 | "inputWidgetPreferences": null,
385 | "language": "python",
386 | "notebookMetadata": {
387 | "pythonIndentUnit": 4
388 | },
389 | "notebookName": "2_Explore_Notebook_Markdowns",
390 | "widgets": {}
391 | },
392 | "language_info": {
393 | "name": "python"
394 | }
395 | },
396 | "nbformat": 4,
397 | "nbformat_minor": 0
398 | }
399 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/2_Usecase_md_dbutils_widgets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "411d4d08-41b4-4ba2-9976-bc04181fd083",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | ""
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "application/vnd.databricks.v1+cell": {
23 | "cellMetadata": {},
24 | "inputWidgets": {},
25 | "nuid": "12568f05-eab3-411e-a659-99ab5018dde4",
26 | "showTitle": false,
27 | "tableResultSettingsMap": {},
28 | "title": ""
29 | }
30 | },
31 | "source": [
32 | "# Healthcare Data Utilities Usecase2\n",
33 | "\n",
34 | "## Objective\n",
35 | "This notebook demonstrates how to design Databricks notebook using Markdown\n",
36 | "and how to work with Databricks utilities such as dbutils.fs, dbutils.widgets,\n",
37 | "and dbutils.notebook using Volumes.\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {
43 | "application/vnd.databricks.v1+cell": {
44 | "cellMetadata": {},
45 | "inputWidgets": {},
46 | "nuid": "c156024f-930c-4e29-82c5-5620442bfab3",
47 | "showTitle": false,
48 | "tableResultSettingsMap": {},
49 | "title": ""
50 | }
51 | },
52 | "source": [
53 | "## Project Workflow\n",
54 | "1. Create folder structure using Volumes\n",
55 | "2. Create sample healthcare data\n",
56 | "3. Perform file operations using dbutils.fs\n",
57 | "4. Parameterize execution using widgets\n",
58 | "5. Exit notebook with execution status"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {
64 | "application/vnd.databricks.v1+cell": {
65 | "cellMetadata": {},
66 | "inputWidgets": {},
67 | "nuid": "6183ff53-1730-4c06-9a40-10ad0589d66f",
68 | "showTitle": false,
69 | "tableResultSettingsMap": {},
70 | "title": ""
71 | }
72 | },
73 | "source": [
74 | "## Folder Structure\n",
75 | "\n",
76 | "| Folder | Purpose |\n",
77 | "|------|---------|\n",
78 | "| raw | Incoming healthcare files |\n",
79 | "| processed | Validated healthcare data |\n",
80 | "| archive | Historical data |\n"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {
86 | "application/vnd.databricks.v1+cell": {
87 | "cellMetadata": {},
88 | "inputWidgets": {},
89 | "nuid": "8dbd7c13-06c7-4313-bae5-f889c84da16c",
90 | "showTitle": false,
91 | "tableResultSettingsMap": {},
92 | "title": ""
93 | }
94 | },
95 | "source": [
96 | "## Learning Outcome\n",
97 | "Our Aspirants will understand notebook design, parameterization, and fs, notebook, widgets using Databricks utilities."
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {
103 | "application/vnd.databricks.v1+cell": {
104 | "cellMetadata": {},
105 | "inputWidgets": {},
106 | "nuid": "a7ad7451-2275-4a6c-ab9e-cfe5de48106f",
107 | "showTitle": false,
108 | "tableResultSettingsMap": {},
109 | "title": ""
110 | }
111 | },
112 | "source": [
113 | "1. Define Base Paths using python variable
\n",
114 | "base_path = \"/Volumes/workspace/default/volumewd36\"
\n",
115 | "Create raw_path, processed_path and archive_path as given below...
\n",
116 | "raw_path = f\"{base_path}/raw\"
\n",
117 | "processed_path = f\"{base_path}/processed\"
\n",
118 | "archive_path = f\"{base_path}/archive\""
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 0,
124 | "metadata": {
125 | "application/vnd.databricks.v1+cell": {
126 | "cellMetadata": {},
127 | "inputWidgets": {},
128 | "nuid": "6a6fbdfb-0a62-46c7-a639-02474c550929",
129 | "showTitle": false,
130 | "tableResultSettingsMap": {},
131 | "title": ""
132 | }
133 | },
134 | "outputs": [],
135 | "source": []
136 | },
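A minimal sketch of the base-path cell described above, for the otherwise empty code cell; the volume path is the one given in the markdown (volumewd36), so adjust it to your own workspace:

```python
# Base volume path exactly as given in the markdown above (adjust to your workspace)
base_path = "/Volumes/workspace/default/volumewd36"

# Derived folder paths for the healthcare pipeline
raw_path = f"{base_path}/raw"
processed_path = f"{base_path}/processed"
archive_path = f"{base_path}/archive"

print(raw_path, processed_path, archive_path)
```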
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {
140 | "application/vnd.databricks.v1+cell": {
141 | "cellMetadata": {},
142 | "inputWidgets": {},
143 | "nuid": "061e8e6c-65be-477e-a831-a333b88ae3b0",
144 | "showTitle": false,
145 | "tableResultSettingsMap": {},
146 | "title": ""
147 | }
148 | },
149 | "source": [
150 | "2. dbutils Usecase – Create Directories using the above path variables.."
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 0,
156 | "metadata": {
157 | "application/vnd.databricks.v1+cell": {
158 | "cellMetadata": {},
159 | "inputWidgets": {},
160 | "nuid": "0d397356-1c43-4fdc-9190-d00eb33c0338",
161 | "showTitle": false,
162 | "tableResultSettingsMap": {},
163 | "title": ""
164 | }
165 | },
166 | "outputs": [],
167 | "source": []
168 | },
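One possible fill-in for the empty cell above, assuming the step-1 path variables are already defined; dbutils.fs.mkdirs is the usual call for this:

```python
# Create the raw/processed/archive folders inside the volume
for path in [raw_path, processed_path, archive_path]:
    dbutils.fs.mkdirs(path)  # also creates missing parent folders; no error if it already exists
    print(f"created: {path}")
```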
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "application/vnd.databricks.v1+cell": {
173 | "cellMetadata": {},
174 | "inputWidgets": {},
175 | "nuid": "ab99fabf-7144-4430-aabe-d42f3ffdfec5",
176 | "showTitle": false,
177 | "tableResultSettingsMap": {},
178 | "title": ""
179 | }
180 | },
181 | "source": [
182 | "3. dbutils Usecase – Create Sample Healthcare File
\n",
183 | "sample_data = \"\"\"patient_id,patient_name,age,gender\n",
184 | "1,John Doe,68,M\n",
185 | "2,Jane Smith,54,F\n",
186 | "\"\"\"\n",
187 | "\n",
188 | "TODO: Write this file content into raw folder created earlier... using dbutils.fs......."
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 0,
194 | "metadata": {
195 | "application/vnd.databricks.v1+cell": {
196 | "cellMetadata": {},
197 | "inputWidgets": {},
198 | "nuid": "80f89788-6f61-4a6c-ab53-e6a07c323c03",
199 | "showTitle": false,
200 | "tableResultSettingsMap": {},
201 | "title": ""
202 | }
203 | },
204 | "outputs": [],
205 | "source": []
206 | },
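A hedged completion of the elided dbutils.fs call above; the file name patients.csv is an illustrative assumption, not something the TODO specifies:

```python
# Sample healthcare data from the markdown cell above
sample_data = """patient_id,patient_name,age,gender
1,John Doe,68,M
2,Jane Smith,54,F
"""

# Write the CSV content into the raw folder; overwrite=True keeps the cell re-runnable
dbutils.fs.put(f"{raw_path}/patients.csv", sample_data, overwrite=True)
```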
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {
210 | "application/vnd.databricks.v1+cell": {
211 | "cellMetadata": {},
212 | "inputWidgets": {},
213 | "nuid": "072284ab-41b9-44d0-b9ea-da3658ab19fe",
214 | "showTitle": false,
215 | "tableResultSettingsMap": {},
216 | "title": ""
217 | }
218 | },
219 | "source": [
220 | "4. dbutils Usecase - list the file created
\n",
221 | "TODO: List all files available in raw folder using the dbutils command
\n",
222 | "dbutils.fs......"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 0,
228 | "metadata": {
229 | "application/vnd.databricks.v1+cell": {
230 | "cellMetadata": {},
231 | "inputWidgets": {},
232 | "nuid": "1fde88db-8cf8-4795-8733-b9e7a64f3751",
233 | "showTitle": false,
234 | "tableResultSettingsMap": {},
235 | "title": ""
236 | }
237 | },
238 | "outputs": [],
239 | "source": []
240 | },
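The elided dbutils.fs command above is presumably ls; a short sketch:

```python
# List the files that landed in the raw folder
for f in dbutils.fs.ls(raw_path):
    print(f.name, f.size)
```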
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {
244 | "application/vnd.databricks.v1+cell": {
245 | "cellMetadata": {},
246 | "inputWidgets": {},
247 | "nuid": "da30d1c1-9a8b-4a8c-babf-a44bc10f2415",
248 | "showTitle": false,
249 | "tableResultSettingsMap": {},
250 | "title": ""
251 | }
252 | },
253 | "source": [
254 | "5. dbutils Usecase – Copy File (raw → processed)"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 0,
260 | "metadata": {
261 | "application/vnd.databricks.v1+cell": {
262 | "cellMetadata": {},
263 | "inputWidgets": {},
264 | "nuid": "db57a210-0d54-4cb1-91b1-445f4bb05101",
265 | "showTitle": false,
266 | "tableResultSettingsMap": {},
267 | "title": ""
268 | }
269 | },
270 | "outputs": [],
271 | "source": []
272 | },
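A sketch for the raw → processed copy, assuming the patients.csv name used in the step-3 sketch:

```python
# Copy the file from raw to processed
dbutils.fs.cp(f"{raw_path}/patients.csv", f"{processed_path}/patients.csv")
```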
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {
276 | "application/vnd.databricks.v1+cell": {
277 | "cellMetadata": {},
278 | "inputWidgets": {},
279 | "nuid": "7ec73c87-5fcd-40c4-bb12-adf6bb15e1cb",
280 | "showTitle": false,
281 | "tableResultSettingsMap": {},
282 | "title": ""
283 | }
284 | },
285 | "source": [
286 | "6. dbutils widget usecase - Create dropdown and text widgets...
\n",
287 | "TODO: Create a dropdown widget for environment (dev, qa, prod) using
\n",
288 | "TODO: Create a text widget for owner name\n",
289 | "\n"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 0,
295 | "metadata": {
296 | "application/vnd.databricks.v1+cell": {
297 | "cellMetadata": {},
298 | "inputWidgets": {},
299 | "nuid": "ba2d9ddd-c9cf-4ac0-b09b-5b509b3c686e",
300 | "showTitle": false,
301 | "tableResultSettingsMap": {},
302 | "title": ""
303 | }
304 | },
305 | "outputs": [],
306 | "source": []
307 | },
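A sketch for the widget-creation TODOs above (the source leaves the exact call blank after "using"; dbutils.widgets.dropdown and dbutils.widgets.text are the obvious candidates):

```python
# Dropdown widget for the environment and a text widget for the owner name
dbutils.widgets.dropdown("environment", "dev", ["dev", "qa", "prod"], "Environment")
dbutils.widgets.text("owner", "", "Owner Name")
```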
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "application/vnd.databricks.v1+cell": {
312 | "cellMetadata": {},
313 | "inputWidgets": {},
314 | "nuid": "530d9f4b-7142-41de-a23b-e0e2ed4b6f4f",
315 | "showTitle": false,
316 | "tableResultSettingsMap": {},
317 | "title": ""
318 | }
319 | },
320 | "source": [
321 | "7. dbutils widget Usecase – Read Widget Values environment and owner and print in the screen"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 0,
327 | "metadata": {
328 | "application/vnd.databricks.v1+cell": {
329 | "cellMetadata": {},
330 | "inputWidgets": {},
331 | "nuid": "54aaeb8a-a4c5-47a6-941c-18bd24732d67",
332 | "showTitle": false,
333 | "tableResultSettingsMap": {},
334 | "title": ""
335 | }
336 | },
337 | "outputs": [],
338 | "source": []
339 | },
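A sketch for reading back the widget values created in step 6:

```python
# Read the widget values and print them
environment = dbutils.widgets.get("environment")
owner = dbutils.widgets.get("owner")
print(f"environment={environment}, owner={owner}")
```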
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {
343 | "application/vnd.databricks.v1+cell": {
344 | "cellMetadata": {},
345 | "inputWidgets": {},
346 | "nuid": "c3bdfb8c-d886-43ca-8cf0-d61bbc1f883f",
347 | "showTitle": false,
348 | "tableResultSettingsMap": {},
349 | "title": ""
350 | }
351 | },
352 | "source": [
353 | "8. dbutils widget Usecase – Move the above processed File to Archive"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 0,
359 | "metadata": {
360 | "application/vnd.databricks.v1+cell": {
361 | "cellMetadata": {},
362 | "inputWidgets": {},
363 | "nuid": "2697f7a9-fed9-4e37-abc5-e8f1dc36aa48",
364 | "showTitle": false,
365 | "tableResultSettingsMap": {},
366 | "title": ""
367 | }
368 | },
369 | "outputs": [],
370 | "source": []
371 | },
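A sketch for the processed → archive move, again assuming the patients.csv name from the earlier sketches:

```python
# Move the processed file into the archive folder (mv removes the source)
dbutils.fs.mv(f"{processed_path}/patients.csv", f"{archive_path}/patients.csv")
```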
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {
375 | "application/vnd.databricks.v1+cell": {
376 | "cellMetadata": {},
377 | "inputWidgets": {},
378 | "nuid": "69804d19-6262-43aa-83f8-a71da479aad2",
379 | "showTitle": false,
380 | "tableResultSettingsMap": {},
381 | "title": ""
382 | }
383 | },
384 | "source": [
385 | "9. dbutils notebook usecase - Run the notebook4 using the dbutils command\n",
386 | "/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 0,
392 | "metadata": {
393 | "application/vnd.databricks.v1+cell": {
394 | "cellMetadata": {},
395 | "inputWidgets": {},
396 | "nuid": "a54cd602-18c3-4ca4-8311-e4787f69fd25",
397 | "showTitle": false,
398 | "tableResultSettingsMap": {},
399 | "title": ""
400 | }
401 | },
402 | "outputs": [],
403 | "source": []
404 | },
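A sketch for running the child notebook; the path is taken verbatim from the markdown above, and the 60-second timeout is an arbitrary choice:

```python
# Path copied from the markdown above; 60 seconds is an arbitrary timeout
child_path = "/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload"
result = dbutils.notebook.run(child_path, 60)
print(f"child notebook returned: {result}")
```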
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {
408 | "application/vnd.databricks.v1+cell": {
409 | "cellMetadata": {},
410 | "inputWidgets": {},
411 | "nuid": "f1191fc8-a393-47f3-882c-c24ae95959d9",
412 | "showTitle": false,
413 | "tableResultSettingsMap": {},
414 | "title": ""
415 | }
416 | },
417 | "source": [
418 | "10. dbutils notebook usecase - exit this notebook \n",
419 | "TODO: Exit notebook with a success message\n",
420 | "dbutils.notebook._____(\"Pipeline completed successfully\")\n"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 0,
426 | "metadata": {
427 | "application/vnd.databricks.v1+cell": {
428 | "cellMetadata": {},
429 | "inputWidgets": {},
430 | "nuid": "6ec32fa5-0791-48f4-a54f-f1294e1146c2",
431 | "showTitle": false,
432 | "tableResultSettingsMap": {},
433 | "title": ""
434 | }
435 | },
436 | "outputs": [],
437 | "source": []
438 | }
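The blank in the markdown above is presumably exit; a one-line sketch:

```python
# Return a status string to whatever called this notebook
dbutils.notebook.exit("Pipeline completed successfully")
```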
439 | ],
440 | "metadata": {
441 | "application/vnd.databricks.v1+notebook": {
442 | "computePreferences": null,
443 | "dashboards": [],
444 | "environmentMetadata": {
445 | "base_environment": "",
446 | "environment_version": "4"
447 | },
448 | "inputWidgetPreferences": null,
449 | "language": "python",
450 | "notebookMetadata": {
451 | "pythonIndentUnit": 4
452 | },
453 | "notebookName": "2_Usecase_md_dbutils_widgets",
454 | "widgets": {}
455 | },
456 | "language_info": {
457 | "name": "python"
458 | }
459 | },
460 | "nbformat": 4,
461 | "nbformat_minor": 0
462 | }
463 |
--------------------------------------------------------------------------------
/databricks_workouts_2025_WE47/1_USECASES_NB_FUNDAMENTALS/2_Usecase_md_dbutils_widgets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "411d4d08-41b4-4ba2-9976-bc04181fd083",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | ""
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "application/vnd.databricks.v1+cell": {
23 | "cellMetadata": {},
24 | "inputWidgets": {},
25 | "nuid": "12568f05-eab3-411e-a659-99ab5018dde4",
26 | "showTitle": false,
27 | "tableResultSettingsMap": {},
28 | "title": ""
29 | }
30 | },
31 | "source": [
32 | "# Healthcare Data Utilities Usecase2\n",
33 | "\n",
34 | "## Objective\n",
35 | "This notebook demonstrates how to design Databricks notebook using Markdown\n",
36 | "and how to work with Databricks utilities such as dbutils.fs, dbutils.widgets,\n",
37 | "and dbutils.notebook using Volumes.\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {
43 | "application/vnd.databricks.v1+cell": {
44 | "cellMetadata": {},
45 | "inputWidgets": {},
46 | "nuid": "c156024f-930c-4e29-82c5-5620442bfab3",
47 | "showTitle": false,
48 | "tableResultSettingsMap": {},
49 | "title": ""
50 | }
51 | },
52 | "source": [
53 | "## Project Workflow\n",
54 | "1. Create folder structure using Volumes\n",
55 | "2. Create sample healthcare data\n",
56 | "3. Perform file operations using dbutils.fs\n",
57 | "4. Parameterize execution using widgets\n",
58 | "5. Exit notebook with execution status"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {
64 | "application/vnd.databricks.v1+cell": {
65 | "cellMetadata": {},
66 | "inputWidgets": {},
67 | "nuid": "6183ff53-1730-4c06-9a40-10ad0589d66f",
68 | "showTitle": false,
69 | "tableResultSettingsMap": {},
70 | "title": ""
71 | }
72 | },
73 | "source": [
74 | "## Folder Structure\n",
75 | "\n",
76 | "| Folder | Purpose |\n",
77 | "|------|---------|\n",
78 | "| raw | Incoming healthcare files |\n",
79 | "| processed | Validated healthcare data |\n",
80 | "| archive | Historical data |\n"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {
86 | "application/vnd.databricks.v1+cell": {
87 | "cellMetadata": {},
88 | "inputWidgets": {},
89 | "nuid": "8dbd7c13-06c7-4313-bae5-f889c84da16c",
90 | "showTitle": false,
91 | "tableResultSettingsMap": {},
92 | "title": ""
93 | }
94 | },
95 | "source": [
96 | "## Learning Outcome\n",
97 | "Our Aspirants will understand notebook design, parameterization, and fs, notebook, widgets using Databricks utilities."
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {
103 | "application/vnd.databricks.v1+cell": {
104 | "cellMetadata": {},
105 | "inputWidgets": {},
106 | "nuid": "a7ad7451-2275-4a6c-ab9e-cfe5de48106f",
107 | "showTitle": false,
108 | "tableResultSettingsMap": {},
109 | "title": ""
110 | }
111 | },
112 | "source": [
113 | "1. Define Base Paths using python variable
\n",
114 | "base_path = \"/Volumes/workspace/default/volumewd36\"
\n",
115 | "Create raw_path, processed_path and archive_path as given below...
\n",
116 | "raw_path = f\"{base_path}/raw\"
\n",
117 | "processed_path = f\"{base_path}/processed\"
\n",
118 | "archive_path = f\"{base_path}/archive\""
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 0,
124 | "metadata": {
125 | "application/vnd.databricks.v1+cell": {
126 | "cellMetadata": {},
127 | "inputWidgets": {},
128 | "nuid": "6a6fbdfb-0a62-46c7-a639-02474c550929",
129 | "showTitle": false,
130 | "tableResultSettingsMap": {},
131 | "title": ""
132 | }
133 | },
134 | "outputs": [],
135 | "source": []
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {
140 | "application/vnd.databricks.v1+cell": {
141 | "cellMetadata": {},
142 | "inputWidgets": {},
143 | "nuid": "061e8e6c-65be-477e-a831-a333b88ae3b0",
144 | "showTitle": false,
145 | "tableResultSettingsMap": {},
146 | "title": ""
147 | }
148 | },
149 | "source": [
150 | "2. dbutils Usecase – Create Directories using the above path variables.."
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 0,
156 | "metadata": {
157 | "application/vnd.databricks.v1+cell": {
158 | "cellMetadata": {},
159 | "inputWidgets": {},
160 | "nuid": "0d397356-1c43-4fdc-9190-d00eb33c0338",
161 | "showTitle": false,
162 | "tableResultSettingsMap": {},
163 | "title": ""
164 | }
165 | },
166 | "outputs": [],
167 | "source": []
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "application/vnd.databricks.v1+cell": {
173 | "cellMetadata": {},
174 | "inputWidgets": {},
175 | "nuid": "ab99fabf-7144-4430-aabe-d42f3ffdfec5",
176 | "showTitle": false,
177 | "tableResultSettingsMap": {},
178 | "title": ""
179 | }
180 | },
181 | "source": [
182 | "3. dbutils Usecase – Create Sample Healthcare File
\n",
183 | "sample_data = \"\"\"patient_id,patient_name,age,gender\n",
184 | "1,John Doe,68,M\n",
185 | "2,Jane Smith,54,F\n",
186 | "\"\"\"\n",
187 | "\n",
188 | "TODO: Write this file content into raw folder created earlier... using dbutils.fs......."
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 0,
194 | "metadata": {
195 | "application/vnd.databricks.v1+cell": {
196 | "cellMetadata": {},
197 | "inputWidgets": {},
198 | "nuid": "80f89788-6f61-4a6c-ab53-e6a07c323c03",
199 | "showTitle": false,
200 | "tableResultSettingsMap": {},
201 | "title": ""
202 | }
203 | },
204 | "outputs": [],
205 | "source": []
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {
210 | "application/vnd.databricks.v1+cell": {
211 | "cellMetadata": {},
212 | "inputWidgets": {},
213 | "nuid": "072284ab-41b9-44d0-b9ea-da3658ab19fe",
214 | "showTitle": false,
215 | "tableResultSettingsMap": {},
216 | "title": ""
217 | }
218 | },
219 | "source": [
220 | "4. dbutils Usecase - list the file created
\n",
221 | "TODO: List all files available in raw folder using the dbutils command
\n",
222 | "dbutils.fs......"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 0,
228 | "metadata": {
229 | "application/vnd.databricks.v1+cell": {
230 | "cellMetadata": {},
231 | "inputWidgets": {},
232 | "nuid": "1fde88db-8cf8-4795-8733-b9e7a64f3751",
233 | "showTitle": false,
234 | "tableResultSettingsMap": {},
235 | "title": ""
236 | }
237 | },
238 | "outputs": [],
239 | "source": []
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {
244 | "application/vnd.databricks.v1+cell": {
245 | "cellMetadata": {},
246 | "inputWidgets": {},
247 | "nuid": "da30d1c1-9a8b-4a8c-babf-a44bc10f2415",
248 | "showTitle": false,
249 | "tableResultSettingsMap": {},
250 | "title": ""
251 | }
252 | },
253 | "source": [
254 | "5. dbutils Usecase – Copy File (raw → processed)"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 0,
260 | "metadata": {
261 | "application/vnd.databricks.v1+cell": {
262 | "cellMetadata": {},
263 | "inputWidgets": {},
264 | "nuid": "db57a210-0d54-4cb1-91b1-445f4bb05101",
265 | "showTitle": false,
266 | "tableResultSettingsMap": {},
267 | "title": ""
268 | }
269 | },
270 | "outputs": [],
271 | "source": []
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {
276 | "application/vnd.databricks.v1+cell": {
277 | "cellMetadata": {},
278 | "inputWidgets": {},
279 | "nuid": "7ec73c87-5fcd-40c4-bb12-adf6bb15e1cb",
280 | "showTitle": false,
281 | "tableResultSettingsMap": {},
282 | "title": ""
283 | }
284 | },
285 | "source": [
286 | "6. dbutils widget usecase - Create dropdown and text widgets...
\n",
287 | "TODO: Create a dropdown widget for environment (dev, qa, prod) using
\n",
288 | "TODO: Create a text widget for owner name\n",
289 | "\n"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 0,
295 | "metadata": {
296 | "application/vnd.databricks.v1+cell": {
297 | "cellMetadata": {},
298 | "inputWidgets": {},
299 | "nuid": "ba2d9ddd-c9cf-4ac0-b09b-5b509b3c686e",
300 | "showTitle": false,
301 | "tableResultSettingsMap": {},
302 | "title": ""
303 | }
304 | },
305 | "outputs": [],
306 | "source": []
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "application/vnd.databricks.v1+cell": {
312 | "cellMetadata": {},
313 | "inputWidgets": {},
314 | "nuid": "530d9f4b-7142-41de-a23b-e0e2ed4b6f4f",
315 | "showTitle": false,
316 | "tableResultSettingsMap": {},
317 | "title": ""
318 | }
319 | },
320 | "source": [
321 | "7. dbutils widget Usecase – Read Widget Values environment and owner and print in the screen"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 0,
327 | "metadata": {
328 | "application/vnd.databricks.v1+cell": {
329 | "cellMetadata": {},
330 | "inputWidgets": {},
331 | "nuid": "54aaeb8a-a4c5-47a6-941c-18bd24732d67",
332 | "showTitle": false,
333 | "tableResultSettingsMap": {},
334 | "title": ""
335 | }
336 | },
337 | "outputs": [],
338 | "source": []
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {
343 | "application/vnd.databricks.v1+cell": {
344 | "cellMetadata": {},
345 | "inputWidgets": {},
346 | "nuid": "c3bdfb8c-d886-43ca-8cf0-d61bbc1f883f",
347 | "showTitle": false,
348 | "tableResultSettingsMap": {},
349 | "title": ""
350 | }
351 | },
352 | "source": [
353 | "8. dbutils widget Usecase – Move the above processed File to Archive"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 0,
359 | "metadata": {
360 | "application/vnd.databricks.v1+cell": {
361 | "cellMetadata": {},
362 | "inputWidgets": {},
363 | "nuid": "2697f7a9-fed9-4e37-abc5-e8f1dc36aa48",
364 | "showTitle": false,
365 | "tableResultSettingsMap": {},
366 | "title": ""
367 | }
368 | },
369 | "outputs": [],
370 | "source": []
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {
375 | "application/vnd.databricks.v1+cell": {
376 | "cellMetadata": {},
377 | "inputWidgets": {},
378 | "nuid": "69804d19-6262-43aa-83f8-a71da479aad2",
379 | "showTitle": false,
380 | "tableResultSettingsMap": {},
381 | "title": ""
382 | }
383 | },
384 | "source": [
385 | "9. dbutils notebook usecase - Run the notebook4 using the dbutils command\n",
386 | "/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 0,
392 | "metadata": {
393 | "application/vnd.databricks.v1+cell": {
394 | "cellMetadata": {},
395 | "inputWidgets": {},
396 | "nuid": "a54cd602-18c3-4ca4-8311-e4787f69fd25",
397 | "showTitle": false,
398 | "tableResultSettingsMap": {},
399 | "title": ""
400 | }
401 | },
402 | "outputs": [],
403 | "source": []
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {
408 | "application/vnd.databricks.v1+cell": {
409 | "cellMetadata": {},
410 | "inputWidgets": {},
411 | "nuid": "f1191fc8-a393-47f3-882c-c24ae95959d9",
412 | "showTitle": false,
413 | "tableResultSettingsMap": {},
414 | "title": ""
415 | }
416 | },
417 | "source": [
418 | "10. dbutils notebook usecase - exit this notebook \n",
419 | "TODO: Exit notebook with a success message\n",
420 | "dbutils.notebook._____(\"Pipeline completed successfully\")\n"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 0,
426 | "metadata": {
427 | "application/vnd.databricks.v1+cell": {
428 | "cellMetadata": {},
429 | "inputWidgets": {},
430 | "nuid": "6ec32fa5-0791-48f4-a54f-f1294e1146c2",
431 | "showTitle": false,
432 | "tableResultSettingsMap": {},
433 | "title": ""
434 | }
435 | },
436 | "outputs": [],
437 | "source": []
438 | }
439 | ],
440 | "metadata": {
441 | "application/vnd.databricks.v1+notebook": {
442 | "computePreferences": null,
443 | "dashboards": [],
444 | "environmentMetadata": {
445 | "base_environment": "",
446 | "environment_version": "4"
447 | },
448 | "inputWidgetPreferences": null,
449 | "language": "python",
450 | "notebookMetadata": {
451 | "pythonIndentUnit": 4
452 | },
453 | "notebookName": "2_Usecase_md_dbutils_widgets",
454 | "widgets": {}
455 | },
456 | "language_info": {
457 | "name": "python"
458 | }
459 | },
460 | "nbformat": 4,
461 | "nbformat_minor": 0
462 | }
463 |
--------------------------------------------------------------------------------
/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/2_Explore_Notebook_Markdowns.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "ea376574-7777-4592-936b-38eb25b6e1d9",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | ""
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "application/vnd.databricks.v1+cell": {
23 | "cellMetadata": {},
24 | "inputWidgets": {},
25 | "nuid": "96aa1a34-733e-4d2f-b705-1e076af575eb",
26 | "showTitle": false,
27 | "tableResultSettingsMap": {},
28 | "title": ""
29 | }
30 | },
31 | "source": [
32 | "#1. Basics of Python Programing"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {
38 | "application/vnd.databricks.v1+cell": {
39 | "cellMetadata": {},
40 | "inputWidgets": {},
41 | "nuid": "523792e4-d5c5-4831-9694-7243855d6ead",
42 | "showTitle": false,
43 | "tableResultSettingsMap": {},
44 | "title": ""
45 | }
46 | },
47 | "source": [
48 | "##A. Python is an indent based programming language\n",
49 | "Why Python uses indend based programing ->\n",
50 | "1. Managing the program more efficiently\n",
51 | "2. Better Readablility of the code\n",
52 | "3. For creating the hierarchy of programming.\n",
53 | "4. By default 4 spaces we will give for indends, but more/less spaces or tabs also can be used..."
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {
59 | "application/vnd.databricks.v1+cell": {
60 | "cellMetadata": {},
61 | "inputWidgets": {},
62 | "nuid": "b50093e4-02c9-41df-a552-4081040e16f6",
63 | "showTitle": false,
64 | "tableResultSettingsMap": {},
65 | "title": ""
66 | }
67 | },
68 | "source": [
69 | "###How many space for intending"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 0,
75 | "metadata": {
76 | "application/vnd.databricks.v1+cell": {
77 | "cellMetadata": {},
78 | "inputWidgets": {},
79 | "nuid": "eccd8947-afed-41d7-a9a8-4aa7cdf267ad",
80 | "showTitle": false,
81 | "tableResultSettingsMap": {},
82 | "title": ""
83 | }
84 | },
85 | "outputs": [],
86 | "source": [
87 | "if True:\n",
88 | " print(\"hello\")"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {
94 | "application/vnd.databricks.v1+cell": {
95 | "cellMetadata": {},
96 | "inputWidgets": {},
97 | "nuid": "2d9344d7-e471-4ceb-a2ee-039351f00e91",
98 | "showTitle": false,
99 | "tableResultSettingsMap": {},
100 | "title": ""
101 | }
102 | },
103 | "source": [
104 | "###Multiple intents"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 0,
110 | "metadata": {
111 | "application/vnd.databricks.v1+cell": {
112 | "cellMetadata": {},
113 | "inputWidgets": {},
114 | "nuid": "289c8a44-dee8-4046-9f80-a34ba4707f9b",
115 | "showTitle": false,
116 | "tableResultSettingsMap": {},
117 | "title": ""
118 | }
119 | },
120 | "outputs": [],
121 | "source": [
122 | "%python\n",
123 | "aspirants_list=['Jeeva','Bharathi','Vaanmathy','Nag']\n",
124 | "for aspirants in aspirants_list:\n",
125 | " print(\"good afternoon \",aspirants)\n",
126 | "print(\"good after all aspirants\")"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {
132 | "application/vnd.databricks.v1+cell": {
133 | "cellMetadata": {},
134 | "inputWidgets": {},
135 | "nuid": "320756df-0857-43ea-817d-a382622f54b2",
136 | "showTitle": false,
137 | "tableResultSettingsMap": {},
138 | "title": ""
139 | }
140 | },
141 | "source": [
142 | "##B. This is a commented line in Python"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 0,
148 | "metadata": {
149 | "application/vnd.databricks.v1+cell": {
150 | "cellMetadata": {},
151 | "inputWidgets": {},
152 | "nuid": "01f6ba35-cb48-45e2-93c7-9d56d31381ef",
153 | "showTitle": false,
154 | "tableResultSettingsMap": {},
155 | "title": ""
156 | }
157 | },
158 | "outputs": [],
159 | "source": [
160 | "%python\n",
161 | "#1. Single line comment - use # in the starting\n",
162 | "'''2.Multi line comment''' \n",
163 | "# - use ''' comment ''' or \"\"\" comment \"\"\""
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {
169 | "application/vnd.databricks.v1+cell": {
170 | "cellMetadata": {
171 | "byteLimit": 2048000,
172 | "rowLimit": 10000
173 | },
174 | "inputWidgets": {},
175 | "nuid": "a7967bfd-6528-4b42-bd8b-8f8f251fba02",
176 | "showTitle": false,
177 | "tableResultSettingsMap": {},
178 | "title": ""
179 | }
180 | },
181 | "source": [
182 | "#Main Heading1 using #
How to do some markdowns design
using the magic command"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {
188 | "application/vnd.databricks.v1+cell": {
189 | "cellMetadata": {
190 | "byteLimit": 2048000,
191 | "rowLimit": 10000
192 | },
193 | "inputWidgets": {},
194 | "nuid": "e8d1dcea-9fc3-45db-86c5-43c19d75710a",
195 | "showTitle": false,
196 | "tableResultSettingsMap": {},
197 | "title": ""
198 | }
199 | },
200 | "source": [
201 | "## Main Heading2 - prefix with \"2#\""
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {
207 | "application/vnd.databricks.v1+cell": {
208 | "cellMetadata": {},
209 | "inputWidgets": {},
210 | "nuid": "86589943-f7b2-476a-b758-1efffeb2a73e",
211 | "showTitle": false,
212 | "tableResultSettingsMap": {},
213 | "title": ""
214 | }
215 | },
216 | "source": [
217 | "### Main Heading3 - prefix with \"3#\""
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {
223 | "application/vnd.databricks.v1+cell": {
224 | "cellMetadata": {
225 | "byteLimit": 2048000,
226 | "rowLimit": 10000
227 | },
228 | "inputWidgets": {},
229 | "nuid": "7c9fc9b4-2d30-4bb3-8932-01495c8b5786",
230 | "showTitle": false,
231 | "tableResultSettingsMap": {},
232 | "title": ""
233 | }
234 | },
235 | "source": [
236 | "#### Sub Heading1 - prefix with \"max 4#\""
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {
242 | "application/vnd.databricks.v1+cell": {
243 | "cellMetadata": {
244 | "byteLimit": 2048000,
245 | "rowLimit": 10000
246 | },
247 | "inputWidgets": {},
248 | "nuid": "7b8d623b-f654-4e28-9399-6e1acad1ffc0",
249 | "showTitle": false,
250 | "tableResultSettingsMap": {},
251 | "title": ""
252 | }
253 | },
254 | "source": [
255 | "##### Sub Heading2 - prefix with \"max 5#\""
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {
261 | "application/vnd.databricks.v1+cell": {
262 | "cellMetadata": {
263 | "byteLimit": 2048000,
264 | "rowLimit": 10000
265 | },
266 | "inputWidgets": {},
267 | "nuid": "486d7884-73e3-4a60-9fc2-2ef0a49fc2b4",
268 | "showTitle": false,
269 | "tableResultSettingsMap": {},
270 | "title": ""
271 | }
272 | },
273 | "source": [
274 | "###### Sub Heading3 - prefix with \"max 6#\""
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {
280 | "application/vnd.databricks.v1+cell": {
281 | "cellMetadata": {},
282 | "inputWidgets": {},
283 | "nuid": "8b40e4c3-aed9-437a-86dc-0c62ce3d5751",
284 | "showTitle": false,
285 | "tableResultSettingsMap": {},
286 | "title": ""
287 | }
288 | },
289 | "source": [
290 | "####### Sub Heading3 - prefix with \"max 6#\""
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {
296 | "application/vnd.databricks.v1+cell": {
297 | "cellMetadata": {
298 | "byteLimit": 2048000,
299 | "rowLimit": 10000
300 | },
301 | "inputWidgets": {},
302 | "nuid": "d8bf1fd6-2aac-4e22-8e88-27b54eb3de92",
303 | "showTitle": false,
304 | "tableResultSettingsMap": {},
305 | "title": ""
306 | }
307 | },
308 | "source": [
309 | "######Lets learn about bold\n",
310 | "1. Bold - using html tagging \n",
311 | "2. **Bold** - prefixed and suffixed with **"
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {
317 | "application/vnd.databricks.v1+cell": {
318 | "cellMetadata": {
319 | "byteLimit": 2048000,
320 | "rowLimit": 10000
321 | },
322 | "inputWidgets": {},
323 | "nuid": "2ca1c554-d730-451d-9dce-6ee98bb9d136",
324 | "showTitle": false,
325 | "tableResultSettingsMap": {},
326 | "title": ""
327 | }
328 | },
329 | "source": [
330 | "###### Lets learn about Italics\n",
331 | "*Italics* - prefixed and suffixed with *"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {
337 | "application/vnd.databricks.v1+cell": {
338 | "cellMetadata": {
339 | "byteLimit": 2048000,
340 | "rowLimit": 10000
341 | },
342 | "inputWidgets": {},
343 | "nuid": "ea06f315-b1ab-4e41-b9e0-7b561e633cf9",
344 | "showTitle": false,
345 | "tableResultSettingsMap": {},
346 | "title": ""
347 | }
348 | },
349 | "source": [
350 | "###### Lets learn about bullet points\n",
351 | "\n",
352 | "- bullet points - prefix with -\n",
353 | "- bullet points - prefix with -"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {
359 | "application/vnd.databricks.v1+cell": {
360 | "cellMetadata": {
361 | "byteLimit": 2048000,
362 | "rowLimit": 10000
363 | },
364 | "inputWidgets": {},
365 | "nuid": "6e366d03-811c-4548-9a77-b38ca97d2ac4",
366 | "showTitle": false,
367 | "tableResultSettingsMap": {},
368 | "title": ""
369 | }
370 | },
371 | "source": [
372 | "###### Lets learn about Color codes\n",
373 | "$${\\color{pink}text-to-display}$$\n",
374 | "$${\\color{black}Black-color}$$\n",
375 | "$${\\color{red}Red}$$\n",
376 | "$${\\color{green}Green}$$\n",
377 | "$${\\color{blue}Blue}$$\t"
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {
383 | "application/vnd.databricks.v1+cell": {
384 | "cellMetadata": {
385 | "byteLimit": 2048000,
386 | "rowLimit": 10000
387 | },
388 | "inputWidgets": {},
389 | "nuid": "a8a1de18-2d42-447e-a677-210fc56e138e",
390 | "showTitle": false,
391 | "tableResultSettingsMap": {},
392 | "title": ""
393 | }
394 | },
395 | "source": [
396 | "###### Lets learn about Embedding urls\n",
397 | "[click here <-](https://www.google.com/search?q=whether+databricks+uses+hive+in+the+behind%3F&sca_esv=d340eac8d7c27e5b&sxsrf=AE3TifM0tbhMSJ32VMGLkFYoRjocGCu6jw%3A1765160969262&ei=CTg2abXhD4PD4-EPsuGjiAo&ved=0ahUKEwj1ia2E-ayRAxWD4TgGHbLwCKEQ4dUDCBE&uact=5&oq=whether+databricks+uses+hive+in+the+behind%3F&gs_lp=Egxnd3Mtd2l6LXNlcnAiK3doZXRoZXIgZGF0YWJyaWNrcyB1c2VzIGhpdmUgaW4gdGhlIGJlaGluZD8yBRAhGKABMgUQIRigATIFECEYoAEyBRAhGKABSM1VUABY7VFwBXgBkAEAmAGRAaABzyKqAQQwLjM2uAEDyAEA-AEBmAIkoAL0HsICBxAAGIAEGA3CAgYQABgHGB7CAggQABgHGAgYHsICCBAAGAgYDRgewgILEAAYgAQYhgMYigXCAgYQABgNGB7CAggQABiABBiiBMICBRAhGJ8FwgIEECEYFcICBxAhGKABGAqYAwCSBwQ1LjMxoAeJ8AGyBwQwLjMxuAfjHsIHBjIuMzIuMsgHRIAIAA&sclient=gws-wiz-serp)\n",
398 | "Click here for [Inceptez Webpage](https://www.inceptez.in/)"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {
404 | "application/vnd.databricks.v1+cell": {
405 | "cellMetadata": {
406 | "byteLimit": 2048000,
407 | "rowLimit": 10000
408 | },
409 | "inputWidgets": {},
410 | "nuid": "f88a1ce7-b687-434f-81ed-5dd7ec5c0e6b",
411 | "showTitle": false,
412 | "tableResultSettingsMap": {},
413 | "title": ""
414 | }
415 | },
416 | "source": [
417 | "######To learn markdowns more in detail\n",
418 | "Click here [Microsoft markdown cheatsheet](https://docs.databricks.com/aws/en/notebooks/notebook-media)"
419 | ]
420 | },
421 | {
422 | "cell_type": "markdown",
423 | "metadata": {
424 | "application/vnd.databricks.v1+cell": {
425 | "cellMetadata": {},
426 | "inputWidgets": {},
427 | "nuid": "f11e8389-002a-4099-9207-0f7ab9b554fa",
428 | "showTitle": false,
429 | "tableResultSettingsMap": {},
430 | "title": ""
431 | }
432 | },
433 | "source": [
434 | "| col1 | col2 |\n",
435 | "|------|------|\n",
436 | "| a | b |\n",
437 | "| c | d |"
438 | ]
439 | }
440 | ],
441 | "metadata": {
442 | "application/vnd.databricks.v1+notebook": {
443 | "computePreferences": {
444 | "hardware": {
445 | "accelerator": null,
446 | "gpuPoolId": null,
447 | "memory": null
448 | }
449 | },
450 | "dashboards": [],
451 | "environmentMetadata": {
452 | "base_environment": "",
453 | "environment_version": "4"
454 | },
455 | "inputWidgetPreferences": null,
456 | "language": "python",
457 | "notebookMetadata": {
458 | "pythonIndentUnit": 4
459 | },
460 | "notebookName": "2_Explore_Notebook_Markdowns",
461 | "widgets": {}
462 | },
463 | "language_info": {
464 | "name": "python"
465 | }
466 | },
467 | "nbformat": 4,
468 | "nbformat_minor": 0
469 | }
470 |
--------------------------------------------------------------------------------
/databricks_workouts_2025_WE47/2_Spark_DataFrame_Read_Write_Operations/read_write_usecases.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "8ba86a20-5a3a-4130-86f5-e312f4a7901b",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | "#Telecom Domain Read & Write Ops Assignment - Building Datalake & Lakehouse\n",
17 | "This notebook contains assignments to practice Spark read options and Databricks volumes.
\n",
18 | "Sections: Sample data creation, Catalog & Volume creation, Copying data into Volumes, Path glob/recursive reads, toDF() column renaming variants, inferSchema/header/separator experiments, and exercises.
"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "application/vnd.databricks.v1+cell": {
25 | "cellMetadata": {},
26 | "inputWidgets": {},
27 | "nuid": "841c7ed8-ef18-486a-8187-07685e499b84",
28 | "showTitle": false,
29 | "tableResultSettingsMap": {},
30 | "title": ""
31 | }
32 | },
33 | "source": [
34 | "\n",
35 | ""
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {
41 | "application/vnd.databricks.v1+cell": {
42 | "cellMetadata": {},
43 | "inputWidgets": {},
44 | "nuid": "d4aa0a44-8cd6-41cf-921d-abb5ff67615b",
45 | "showTitle": false,
46 | "tableResultSettingsMap": {},
47 | "title": ""
48 | }
49 | },
50 | "source": [
51 | "##First Import all required libraries & Create spark session object"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {
57 | "application/vnd.databricks.v1+cell": {
58 | "cellMetadata": {},
59 | "inputWidgets": {},
60 | "nuid": "d0b67823-2e4e-45e2-aa25-80550a3ac580",
61 | "showTitle": false,
62 | "tableResultSettingsMap": {},
63 | "title": ""
64 | }
65 | },
66 | "source": [
67 | "##1. Write SQL statements to create:\n",
68 | "1. A catalog named telecom_catalog_assign\n",
69 | "2. A schema landing_zone\n",
70 | "3. A volume landing_vol\n",
71 | "4. Using dbutils.fs.mkdirs, create folders:
\n",
72 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/\n",
73 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/\n",
74 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/\n",
75 | "5. Explain the difference between (Just google and understand why we are going for volume concept for prod ready systems):
\n",
76 | "a. Volume vs DBFS/FileStore
\n",
77 | "b. Why production teams prefer Volumes for regulated data
"
78 | ]
79 | },
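A sketch for exercise 1, using the catalog, schema, volume and folder names exactly as listed above (Unity Catalog SQL plus dbutils.fs.mkdirs):

```python
# 1-3: catalog, schema and volume (Unity Catalog SQL)
spark.sql("CREATE CATALOG IF NOT EXISTS telecom_catalog_assign")
spark.sql("CREATE SCHEMA IF NOT EXISTS telecom_catalog_assign.landing_zone")
spark.sql("CREATE VOLUME IF NOT EXISTS telecom_catalog_assign.landing_zone.landing_vol")

# 4: landing folders inside the volume
vol = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol"
for folder in ["customer", "usage", "tower"]:
    dbutils.fs.mkdirs(f"{vol}/{folder}/")
```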
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {
83 | "application/vnd.databricks.v1+cell": {
84 | "cellMetadata": {},
85 | "inputWidgets": {},
86 | "nuid": "26d8bd3d-b575-448b-ae22-8173d15ca671",
87 | "showTitle": false,
88 | "tableResultSettingsMap": {},
89 | "title": ""
90 | }
91 | },
92 | "source": [
93 | "##Data files to use in this usecase:\n",
94 | "customer_csv = '''\n",
95 | "101,Arun,31,Chennai,PREPAID\n",
96 | "102,Meera,45,Bangalore,POSTPAID\n",
97 | "103,Irfan,29,Hyderabad,PREPAID\n",
98 | "104,Raj,52,Mumbai,POSTPAID\n",
99 | "105,,27,Delhi,PREPAID\n",
100 | "106,Sneha,abc,Pune,PREPAID\n",
101 | "'''\n",
102 | "\n",
103 | "usage_tsv = '''customer_id\\tvoice_mins\\tdata_mb\\tsms_count\n",
104 | "101\\t320\\t1500\\t20\n",
105 | "102\\t120\\t4000\\t5\n",
106 | "103\\t540\\t600\\t52\n",
107 | "104\\t45\\t200\\t2\n",
108 | "105\\t0\\t0\\t0\n",
109 | "'''\n",
110 | "\n",
111 | "tower_logs_region1 = '''event_id|customer_id|tower_id|signal_strength|timestamp\n",
112 | "5001|101|TWR01|-80|2025-01-10 10:21:54\n",
113 | "5004|104|TWR05|-75|2025-01-10 11:01:12\n",
114 | "'''"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "application/vnd.databricks.v1+cell": {
121 | "cellMetadata": {},
122 | "inputWidgets": {},
123 | "nuid": "9540d2e2-2562-4be7-897f-0a7d57adaa72",
124 | "showTitle": false,
125 | "tableResultSettingsMap": {},
126 | "title": ""
127 | }
128 | },
129 | "source": [
130 | "##2. Filesystem operations\n",
131 | "1. Write dbutils.fs code to copy the above datasets into your created Volume folders:\n",
132 | "Customer → /Volumes/.../customer/\n",
133 | "Usage → /Volumes/.../usage/\n",
134 | "Tower (region-based) → /Volumes/.../tower/region1/ and /Volumes/.../tower/region2/\n",
135 | "\n",
136 | "2. Write a command to validate whether files were successfully copied"
137 | ]
138 | },
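A sketch for exercise 2, assuming the customer_csv, usage_tsv and tower_logs_region1 strings from the data cell above are defined in the notebook; the file names and the region1 sub-folder are illustrative choices:

```python
vol = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol"

# Write the sample strings from the data cell above into the volume folders
dbutils.fs.put(f"{vol}/customer/customer.csv", customer_csv, overwrite=True)
dbutils.fs.put(f"{vol}/usage/usage.tsv", usage_tsv, overwrite=True)
dbutils.fs.put(f"{vol}/tower/region1/tower_region1.csv", tower_logs_region1, overwrite=True)

# Validate that the files were copied
for folder in ["customer", "usage", "tower/region1"]:
    print(folder, [f.name for f in dbutils.fs.ls(f"{vol}/{folder}")])
```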
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {
142 | "application/vnd.databricks.v1+cell": {
143 | "cellMetadata": {},
144 | "inputWidgets": {},
145 | "nuid": "8767735b-24d3-428a-ad12-ae821903e2ce",
146 | "showTitle": false,
147 | "tableResultSettingsMap": {},
148 | "title": ""
149 | }
150 | },
151 | "source": [
152 | "##3. Spark Directory Read Use Cases\n",
153 | "1. Read all tower logs using:\n",
154 | "Path glob filter (example: *.csv)\n",
155 | "Multiple paths input\n",
156 | "Recursive lookup\n",
157 | "\n",
158 | "2. Demonstrate these 3 reads separately:\n",
159 | "Using pathGlobFilter\n",
160 | "Using list of paths in spark.read.csv([path1, path2])\n",
161 | "Using .option(\"recursiveFileLookup\",\"true\")\n",
162 | "\n",
163 | "3. Compare the outputs and understand when each should be used."
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {
169 | "application/vnd.databricks.v1+cell": {
170 | "cellMetadata": {},
171 | "inputWidgets": {},
172 | "nuid": "9f7147c1-5d58-47e1-84fe-7ebd26a217b9",
173 | "showTitle": false,
174 | "tableResultSettingsMap": {},
175 | "title": ""
176 | }
177 | },
178 | "source": [
179 | "##4. Schema Inference, Header, and Separator\n",
180 | "1. Try the Customer, Usage files with the option and options using read.csv and format function:
\n",
181 | "header=false, inferSchema=false
\n",
182 | "or
\n",
183 | "header=true, inferSchema=true
\n",
184 | "2. Write a note on What changed when we use header or inferSchema with true/false?
\n",
185 | "3. How schema inference handled “abc” in age?
"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "application/vnd.databricks.v1+cell": {
192 | "cellMetadata": {},
193 | "inputWidgets": {},
194 | "nuid": "15d8dad0-bc63-47f1-9a90-72837cba6c4f",
195 | "showTitle": false,
196 | "tableResultSettingsMap": {},
197 | "title": ""
198 | }
199 | },
200 | "source": [
201 | "##5. Column Renaming Usecases\n",
202 | "1. Apply column names using string using toDF function for customer data\n",
203 | "2. Apply column names and datatype using the schema function for usage data\n",
204 | "3. Apply column names and datatype using the StructType with IntegerType, StringType, TimestampType and other classes for towers data "
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {
210 | "application/vnd.databricks.v1+cell": {
211 | "cellMetadata": {},
212 | "inputWidgets": {},
213 | "nuid": "6e1d6d88-7bcc-4548-a0d1-15d37f6fc0be",
214 | "showTitle": false,
215 | "tableResultSettingsMap": {},
216 | "title": ""
217 | }
218 | },
219 | "source": [
220 | "## Spark Write Operations using \n",
221 | "- csv, json, orc, parquet, delta, saveAsTable, insertInto, xml with different write mode, header and sep options"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {
227 | "application/vnd.databricks.v1+cell": {
228 | "cellMetadata": {},
229 | "inputWidgets": {},
230 | "nuid": "8e34c3bc-962d-438d-a1b6-ac27d2da6608",
231 | "showTitle": false,
232 | "tableResultSettingsMap": {},
233 | "title": ""
234 | }
235 | },
236 | "source": [
237 | "##6. Write Operations (Data Conversion/Schema migration) – CSV Format Usecases\n",
238 | "1. Write customer data into CSV format using overwrite mode\n",
239 | "2. Write usage data into CSV format using append mode\n",
240 | "3. Write tower data into CSV format with header enabled and custom separator (|)\n",
241 | "4. Read the tower data in a dataframe and show only 5 rows.\n",
242 | "5. Download the file into local from the catalog volume location and see the data of any of the above files opening in a notepad++."
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {
248 | "application/vnd.databricks.v1+cell": {
249 | "cellMetadata": {},
250 | "inputWidgets": {},
251 | "nuid": "34158cf6-dd7f-40d6-9969-ed76710540a4",
252 | "showTitle": false,
253 | "tableResultSettingsMap": {},
254 | "title": ""
255 | }
256 | },
257 | "source": [
258 | "##7. Write Operations (Data Conversion/Schema migration)– JSON Format Usecases\n",
259 | "1. Write customer data into JSON format using overwrite mode\n",
260 | "2. Write usage data into JSON format using append mode and snappy compression format\n",
261 | "3. Write tower data into JSON format using ignore mode and observe the behavior of this mode\n",
262 | "4. Read the tower data in a dataframe and show only 5 rows.\n",
263 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++."
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {
269 | "application/vnd.databricks.v1+cell": {
270 | "cellMetadata": {},
271 | "inputWidgets": {},
272 | "nuid": "26f2ba69-3cde-4ec6-8945-e4ef9f7bb109",
273 | "showTitle": false,
274 | "tableResultSettingsMap": {},
275 | "title": ""
276 | }
277 | },
278 | "source": [
279 | "##8. Write Operations (Data Conversion/Schema migration) – Parquet Format Usecases\n",
280 | "1. Write customer data into Parquet format using overwrite mode and in a gzip format\n",
281 | "2. Write usage data into Parquet format using error mode\n",
282 | "3. Write tower data into Parquet format with gzip compression option\n",
283 | "4. Read the usage data in a dataframe and show only 5 rows.\n",
284 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++."
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {
290 | "application/vnd.databricks.v1+cell": {
291 | "cellMetadata": {},
292 | "inputWidgets": {},
293 | "nuid": "b41c794f-5cfc-4aeb-a599-e6d4a47a0f3f",
294 | "showTitle": false,
295 | "tableResultSettingsMap": {},
296 | "title": ""
297 | }
298 | },
299 | "source": [
300 | "##9. Write Operations (Data Conversion/Schema migration) – Orc Format Usecases\n",
301 | "1. Write customer data into ORC format using overwrite mode\n",
302 | "2. Write usage data into ORC format using append mode\n",
303 | "3. Write tower data into ORC format and see the output file structure\n",
304 | "4. Read the usage data in a dataframe and show only 5 rows.\n",
305 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++."
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "application/vnd.databricks.v1+cell": {
312 | "cellMetadata": {},
313 | "inputWidgets": {},
314 | "nuid": "35761315-0b0f-46ff-9c3d-c0405bce7b62",
315 | "showTitle": false,
316 | "tableResultSettingsMap": {},
317 | "title": ""
318 | }
319 | },
320 | "source": [
321 | "##10. Write Operations (Data Conversion/Schema migration) – Delta Format Usecases\n",
322 | "1. Write customer data into Delta format using overwrite mode\n",
323 | "2. Write usage data into Delta format using append mode\n",
324 | "3. Write tower data into Delta format and see the output file structure\n",
325 | "4. Read the usage data in a dataframe and show only 5 rows.\n",
326 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++.\n",
327 | "6. Compare the parquet location and delta location and try to understand what is the differentiating factor, as both are parquet files only."
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {
333 | "application/vnd.databricks.v1+cell": {
334 | "cellMetadata": {},
335 | "inputWidgets": {},
336 | "nuid": "e6dd0890-02bd-4acd-b837-daceb256c706",
337 | "showTitle": false,
338 | "tableResultSettingsMap": {},
339 | "title": ""
340 | }
341 | },
342 | "source": [
343 | "##11. Write Operations (Lakehouse Usecases) – Delta table Usecases\n",
344 | "1. Write customer data using saveAsTable() as a managed table\n",
345 | "2. Write usage data using saveAsTable() with overwrite mode\n",
346 | "3. Drop the managed table and verify data removal\n",
347 | "4. Go and check the table overview and realize it is in delta format in the Catalog.\n",
348 | "5. Use spark.read.sql to write some simple queries on the above tables created.\n"
349 | ]
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "metadata": {
354 | "application/vnd.databricks.v1+cell": {
355 | "cellMetadata": {},
356 | "inputWidgets": {},
357 | "nuid": "1aac447b-690b-4562-99dd-0ce096e9ad55",
358 | "showTitle": false,
359 | "tableResultSettingsMap": {},
360 | "title": ""
361 | }
362 | },
363 | "source": [
364 | "##12. Write Operations (Lakehouse Usecases) – Delta table Usecases\n",
365 | "1. Write customer data using insertInto() in a new table and find the behavior\n",
366 | "2. Write usage data using insertTable() with overwrite mode"
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "metadata": {
372 | "application/vnd.databricks.v1+cell": {
373 | "cellMetadata": {},
374 | "inputWidgets": {},
375 | "nuid": "e3c4bce3-4bd3-4db6-a074-02bb24c5f91a",
376 | "showTitle": false,
377 | "tableResultSettingsMap": {},
378 | "title": ""
379 | }
380 | },
381 | "source": [
382 | "##13. Write Operations (Lakehouse Usecases) – Delta table Usecases\n",
383 | "1. Write customer data into XML format using rowTag as cust\n",
384 | "2. Write usage data into XML format using overwrite mode with the rowTag as usage\n",
385 | "3. Download the xml data and open the file in notepad++ and see how the xml file looks like."
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {
391 | "application/vnd.databricks.v1+cell": {
392 | "cellMetadata": {},
393 | "inputWidgets": {},
394 | "nuid": "83e2fe69-9352-4ec9-bf70-15d760c89aa3",
395 | "showTitle": false,
396 | "tableResultSettingsMap": {},
397 | "title": ""
398 | }
399 | },
400 | "source": [
401 | "##14. Compare all the downloaded files (csv, json, orc, parquet, delta and xml) \n",
402 | "1. Capture the size occupied between all of these file formats and list the formats below based on the order of size from small to big."
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {
408 | "application/vnd.databricks.v1+cell": {
409 | "cellMetadata": {},
410 | "inputWidgets": {},
411 | "nuid": "3d6e39ec-752d-4183-9656-2b6d7938922d",
412 | "showTitle": false,
413 | "tableResultSettingsMap": {},
414 | "title": ""
415 | }
416 | },
417 | "source": [
418 | "##15. Do a final exercise of defining one/two liner of... \n",
419 | "1. When to use/benifits csv\n",
420 | "2. When to use/benifits json\n",
421 | "3. When to use/benifit orc\n",
422 | "4. When to use/benifit parquet\n",
423 | "5. When to use/benifit delta\n",
424 | "6. When to use/benifit xml\n",
425 | "7. When to use/benifit delta tables\n"
426 | ]
427 | }
428 | ],
429 | "metadata": {
430 | "application/vnd.databricks.v1+notebook": {
431 | "computePreferences": null,
432 | "dashboards": [],
433 | "environmentMetadata": {
434 | "base_environment": "",
435 | "environment_version": "4"
436 | },
437 | "inputWidgetPreferences": null,
438 | "language": "python",
439 | "notebookMetadata": {
440 | "mostRecentlyExecutedCommandWithImplicitDF": {
441 | "commandId": -1,
442 | "dataframes": [
443 | "_sqldf"
444 | ]
445 | },
446 | "pythonIndentUnit": 4
447 | },
448 | "notebookName": "read_write_usecases",
449 | "widgets": {}
450 | },
451 | "language_info": {
452 | "name": "python"
453 | }
454 | },
455 | "nbformat": 4,
456 | "nbformat_minor": 0
457 | }
458 |
--------------------------------------------------------------------------------
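The assignment notebook above stops at the exercise prompts. A minimal sketch of how use cases 1–3 could be approached is shown below; it assumes a Unity Catalog enabled workspace and that the customer_csv / usage_tsv / tower_logs_region1 strings from the sample-data cell have been defined. The file and folder names beyond those given in the assignment are illustrative, not part of the original notebook.

```python
# Hedged sketch for use cases 1-3 above (assumes a Unity Catalog workspace and that the
# customer_csv / usage_tsv / tower_logs_region1 sample strings were defined in a previous cell).

# 1. Create catalog, schema, and volume, then the landing folders
spark.sql("CREATE CATALOG IF NOT EXISTS telecom_catalog_assign")
spark.sql("CREATE SCHEMA IF NOT EXISTS telecom_catalog_assign.landing_zone")
spark.sql("CREATE VOLUME IF NOT EXISTS telecom_catalog_assign.landing_zone.landing_vol")

base = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol"
for sub in ["customer", "usage", "tower/region1", "tower/region2"]:
    dbutils.fs.mkdirs(f"{base}/{sub}")

# 2. Copy the inline sample data into the volume (file names are illustrative) and validate
dbutils.fs.put(f"{base}/customer/customer.csv", customer_csv, overwrite=True)
dbutils.fs.put(f"{base}/usage/usage.tsv", usage_tsv, overwrite=True)
dbutils.fs.put(f"{base}/tower/region1/tower_region1.csv", tower_logs_region1, overwrite=True)
display(dbutils.fs.ls(f"{base}/customer"))

# 3. Three directory-read variants on the pipe-delimited tower logs
df_glob = (spark.read.option("pathGlobFilter", "*.csv")
           .csv(f"{base}/tower/region1", sep="|", header=True))
df_multi = spark.read.csv([f"{base}/tower/region1", f"{base}/tower/region2"], sep="|", header=True)
df_recursive = (spark.read.option("recursiveFileLookup", "true")
                .csv(f"{base}/tower", sep="|", header=True))
```

Roughly: pathGlobFilter restricts the files picked up within one directory, a list of paths targets known locations explicitly, and recursiveFileLookup walks all subdirectories (e.g. both region folders) in one read.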
/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/1_Explore_Notebooks_magic_commands.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {
8 | "byteLimit": 2048000,
9 | "rowLimit": 10000
10 | },
11 | "inputWidgets": {},
12 | "nuid": "2c477351-9470-4b26-8f73-6f967c37729e",
13 | "showTitle": false,
14 | "tableResultSettingsMap": {},
15 | "title": ""
16 | }
17 | },
18 | "source": [
19 | "#Welcome to Inceptez Technologies\n",
20 | "Let us understand about creating notebooks & magical commands\n",
21 | "https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png\n",
22 | ""
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {
28 | "application/vnd.databricks.v1+cell": {
29 | "cellMetadata": {
30 | "byteLimit": 2048000,
31 | "rowLimit": 10000
32 | },
33 | "inputWidgets": {},
34 | "nuid": "3ae38262-0e89-4a3b-9130-322375328fe4",
35 | "showTitle": false,
36 | "tableResultSettingsMap": {},
37 | "title": ""
38 | }
39 | },
40 | "source": [
41 | "##Let us learn first about Magical Commands\n",
42 | "**Important Magic Commands**\n",
43 | "- %md: allows you to write markdown text to design the notebook.\n",
44 | "- %run: runs a Python file or a notebook.\n",
45 | "- %sh: executes shell commands on the cluster edge/client node.\n",
46 | "- %fs: allows you to interact with the Databricks file system (Datalake command (cloud storage s3/adls/gcs))\n",
47 | "- %sql: allows you to run Spark SQL/HQL queries.\n",
48 | "- %python: switches the notebook context to Python.\n",
49 | "- %pip: allows you to install Python packages.\n",
50 | "\n",
51 | "**Not Important Magic Commands or We learn few of these where we have Cloud(Azure) dependency**\n",
52 | "- %scala: switches the notebook context to Scala.\n",
53 | "- %r: switches the notebook context to R.\n",
54 | "- %lsmagic: lists all the available magic commands.\n",
55 | "- %config: allows you to set configuration options for the notebook.\n",
56 | "- %load: loads the contents of a file into a cell.\n",
57 | "- %who: lists all the variables in the current scope."
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {
63 | "application/vnd.databricks.v1+cell": {
64 | "cellMetadata": {
65 | "byteLimit": 2048000,
66 | "rowLimit": 10000
67 | },
68 | "inputWidgets": {},
69 | "nuid": "b35163da-292b-4c60-b6a9-a62521e22343",
70 | "showTitle": false,
71 | "tableResultSettingsMap": {},
72 | "title": ""
73 | }
74 | },
75 | "source": [
76 | "####How to call a notebook from the current notebook using %run magic command"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 0,
82 | "metadata": {
83 | "application/vnd.databricks.v1+cell": {
84 | "cellMetadata": {
85 | "byteLimit": 2048000,
86 | "rowLimit": 10000
87 | },
88 | "inputWidgets": {},
89 | "nuid": "cb391d1b-fb52-49ab-a463-2849a7f60fea",
90 | "showTitle": false,
91 | "tableResultSettingsMap": {},
92 | "title": ""
93 | }
94 | },
95 | "outputs": [],
96 | "source": [
97 | "%run \"/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook\""
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {
103 | "application/vnd.databricks.v1+cell": {
104 | "cellMetadata": {
105 | "byteLimit": 2048000,
106 | "rowLimit": 10000
107 | },
108 | "inputWidgets": {},
109 | "nuid": "7b7260e3-4612-4e86-ac30-51e7c91fe669",
110 | "showTitle": false,
111 | "tableResultSettingsMap": {},
112 | "title": ""
113 | }
114 | },
115 | "source": [
116 | "####How to run a linux commands inside a notebook using %sh magic command"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 0,
122 | "metadata": {
123 | "application/vnd.databricks.v1+cell": {
124 | "cellMetadata": {
125 | "byteLimit": 2048000,
126 | "rowLimit": 10000
127 | },
128 | "inputWidgets": {},
129 | "nuid": "7f274612-5fb2-4589-86c6-d236abeb9aba",
130 | "showTitle": false,
131 | "tableResultSettingsMap": {},
132 | "title": ""
133 | }
134 | },
135 | "outputs": [],
136 | "source": [
137 | "%sh\n",
138 | "ls -l /databricks-datasets/airlines\n",
139 | "head -1 /databricks-datasets/airlines/part-01902"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {
145 | "application/vnd.databricks.v1+cell": {
146 | "cellMetadata": {},
147 | "inputWidgets": {},
148 | "nuid": "85ffa4e3-dc14-4ea4-bab4-94151bc22f9d",
149 | "showTitle": false,
150 | "tableResultSettingsMap": {},
151 | "title": ""
152 | }
153 | },
154 | "source": [
155 | "We are going to use Databricks Unity Catalog (We don't know about it yet)\n",
156 | "to create tables and files under the volume (catalog/schema/volume/folder/files)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 0,
162 | "metadata": {
163 | "application/vnd.databricks.v1+cell": {
164 | "cellMetadata": {
165 | "byteLimit": 2048000,
166 | "implicitDf": true,
167 | "rowLimit": 10000
168 | },
169 | "inputWidgets": {},
170 | "nuid": "85719c48-7508-45d6-b879-d5ab1e956bcc",
171 | "showTitle": false,
172 | "tableResultSettingsMap": {},
173 | "title": ""
174 | }
175 | },
176 | "outputs": [],
177 | "source": [
178 | "%sql\n",
179 | "CREATE VOLUME IF NOT EXISTS workspace.default.volumewe47_datalake;"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {
185 | "application/vnd.databricks.v1+cell": {
186 | "cellMetadata": {
187 | "byteLimit": 2048000,
188 | "rowLimit": 10000
189 | },
190 | "inputWidgets": {},
191 | "nuid": "97a1ed02-dc2f-4eac-9b34-ed9237d3cd20",
192 | "showTitle": false,
193 | "tableResultSettingsMap": {},
194 | "title": ""
195 | }
196 | },
197 | "source": [
198 | "####Upload some sample data going into (Catalog -> My Organization -> Workspace -> Default -> Volumes)
How to run a DBFS (like Hadoop) FS commands inside a notebook using %fs magic command to copy the uploaded data into some other volume from the uploaded volume"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 0,
204 | "metadata": {
205 | "application/vnd.databricks.v1+cell": {
206 | "cellMetadata": {
207 | "byteLimit": 2048000,
208 | "rowLimit": 10000
209 | },
210 | "inputWidgets": {},
211 | "nuid": "6947aa56-af9f-42de-8262-b7bc680203f7",
212 | "showTitle": false,
213 | "tableResultSettingsMap": {},
214 | "title": ""
215 | }
216 | },
217 | "outputs": [],
218 | "source": [
219 | "%fs ls \"dbfs:///Volumes/workspace/default/volumewe47_datalake\""
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 0,
225 | "metadata": {
226 | "application/vnd.databricks.v1+cell": {
227 | "cellMetadata": {
228 | "byteLimit": 2048000,
229 | "rowLimit": 10000
230 | },
231 | "inputWidgets": {},
232 | "nuid": "e68958aa-a396-4a19-b4e5-3541c3d5d756",
233 | "showTitle": false,
234 | "tableResultSettingsMap": {},
235 | "title": ""
236 | }
237 | },
238 | "outputs": [],
239 | "source": [
240 | "%fs cp \"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients.csv\" \"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy.csv\""
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {
246 | "application/vnd.databricks.v1+cell": {
247 | "cellMetadata": {},
248 | "inputWidgets": {},
249 | "nuid": "0a8e80b8-4546-468b-87ed-7bb6012c1a7b",
250 | "showTitle": false,
251 | "tableResultSettingsMap": {},
252 | "title": ""
253 | }
254 | },
255 | "source": [
256 | "Learning for the first time the dbutils, we learn in detail later\n",
257 | "Rather using fs command, we can use databricks utility command (comprehensive) to copy the data/any other filesystem operations in the DBFS"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 0,
263 | "metadata": {
264 | "application/vnd.databricks.v1+cell": {
265 | "cellMetadata": {
266 | "byteLimit": 2048000,
267 | "rowLimit": 10000
268 | },
269 | "inputWidgets": {},
270 | "nuid": "f29a64d1-087b-46f7-8e97-63c38991fd40",
271 | "showTitle": false,
272 | "tableResultSettingsMap": {},
273 | "title": ""
274 | }
275 | },
276 | "outputs": [],
277 | "source": [
278 | "%python\n",
279 | "dbutils.fs.cp(\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients.csv\",\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy2.csv\")\n",
280 | "dbutils.fs.rm(\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy.csv\")"
281 | ]
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {
286 | "application/vnd.databricks.v1+cell": {
287 | "cellMetadata": {
288 | "byteLimit": 2048000,
289 | "rowLimit": 10000
290 | },
291 | "inputWidgets": {},
292 | "nuid": "a43dd280-49bc-4f22-97fb-08a6b23218cc",
293 | "showTitle": false,
294 | "tableResultSettingsMap": {},
295 | "title": ""
296 | }
297 | },
298 | "source": [
299 | "####How to run a Spark SQL/HQL Queries inside a notebook using %sql magic command"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 0,
305 | "metadata": {
306 | "application/vnd.databricks.v1+cell": {
307 | "cellMetadata": {
308 | "byteLimit": 2048000,
309 | "implicitDf": true,
310 | "rowLimit": 10000
311 | },
312 | "inputWidgets": {},
313 | "nuid": "0421bbb7-38d3-4c49-bc4a-28f0c30072e1",
314 | "showTitle": false,
315 | "tableResultSettingsMap": {},
316 | "title": ""
317 | }
318 | },
319 | "outputs": [],
320 | "source": [
321 | "%sql\n",
322 | "create table if not exists default.cities2(id int,city string);\n",
323 | "insert into default.cities2 values(3,'Mumbai'),(4,'Lucknow');\n",
324 | "select * from cities2;"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 0,
330 | "metadata": {
331 | "application/vnd.databricks.v1+cell": {
332 | "cellMetadata": {
333 | "byteLimit": 2048000,
334 | "rowLimit": 10000
335 | },
336 | "inputWidgets": {},
337 | "nuid": "610c68dd-c905-4abc-9f33-ad5623ba4dcb",
338 | "showTitle": false,
339 | "tableResultSettingsMap": {},
340 | "title": ""
341 | }
342 | },
343 | "outputs": [],
344 | "source": [
345 | "%python\n",
346 | "spark.sql(\"select * from cities2\").explain(True)"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 0,
352 | "metadata": {
353 | "application/vnd.databricks.v1+cell": {
354 | "cellMetadata": {
355 | "byteLimit": 2048000,
356 | "implicitDf": true,
357 | "rowLimit": 10000
358 | },
359 | "inputWidgets": {},
360 | "nuid": "d1fe763e-6058-4ad3-980e-ae595fc32499",
361 | "showTitle": false,
362 | "tableResultSettingsMap": {},
363 | "title": ""
364 | }
365 | },
366 | "outputs": [],
367 | "source": [
368 | "%sql\n",
369 | "update cities1 set city='Kolkata' where id=4;"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 0,
375 | "metadata": {
376 | "application/vnd.databricks.v1+cell": {
377 | "cellMetadata": {
378 | "byteLimit": 2048000,
379 | "implicitDf": true,
380 | "rowLimit": 10000
381 | },
382 | "inputWidgets": {},
383 | "nuid": "2cdb43f7-2838-42d6-a19b-60d2377d1812",
384 | "showTitle": false,
385 | "tableResultSettingsMap": {},
386 | "title": ""
387 | }
388 | },
389 | "outputs": [],
390 | "source": [
391 | "%sql\n",
392 | "show create table cities1;"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 0,
398 | "metadata": {
399 | "application/vnd.databricks.v1+cell": {
400 | "cellMetadata": {
401 | "byteLimit": 2048000,
402 | "implicitDf": true,
403 | "rowLimit": 10000
404 | },
405 | "inputWidgets": {},
406 | "nuid": "3485856e-b35c-415f-a588-b74aadcbeafd",
407 | "showTitle": false,
408 | "tableResultSettingsMap": {},
409 | "title": ""
410 | }
411 | },
412 | "outputs": [],
413 | "source": [
414 | "%sql\n",
415 | "from cities1 select *;"
416 | ]
417 | },
418 | {
419 | "cell_type": "markdown",
420 | "metadata": {
421 | "application/vnd.databricks.v1+cell": {
422 | "cellMetadata": {
423 | "byteLimit": 2048000,
424 | "rowLimit": 10000
425 | },
426 | "inputWidgets": {},
427 | "nuid": "0cb845bd-3334-4f02-8794-28797af494a0",
428 | "showTitle": false,
429 | "tableResultSettingsMap": {},
430 | "title": ""
431 | }
432 | },
433 | "source": [
434 | "####How to run a Python Program inside a notebook using %python magic command or by default the cell will be enabled with python interpretter only"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 0,
440 | "metadata": {
441 | "application/vnd.databricks.v1+cell": {
442 | "cellMetadata": {
443 | "byteLimit": 2048000,
444 | "rowLimit": 10000
445 | },
446 | "inputWidgets": {},
447 | "nuid": "4745b1ba-c25c-4c6f-9a4b-6ab4ad27bd98",
448 | "showTitle": false,
449 | "tableResultSettingsMap": {},
450 | "title": ""
451 | }
452 | },
453 | "outputs": [],
454 | "source": [
455 | "def sqrt(a):\n",
456 | " return a*a"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 0,
462 | "metadata": {
463 | "application/vnd.databricks.v1+cell": {
464 | "cellMetadata": {
465 | "byteLimit": 2048000,
466 | "rowLimit": 10000
467 | },
468 | "inputWidgets": {},
469 | "nuid": "86607897-fad2-454f-86ab-d1f639889cca",
470 | "showTitle": false,
471 | "tableResultSettingsMap": {},
472 | "title": ""
473 | }
474 | },
475 | "outputs": [],
476 | "source": [
477 | "print(\"square root function call \",sqrt(10))"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {
483 | "application/vnd.databricks.v1+cell": {
484 | "cellMetadata": {},
485 | "inputWidgets": {},
486 | "nuid": "bb77dbe6-2686-4f10-9781-6e37357e121e",
487 | "showTitle": false,
488 | "tableResultSettingsMap": {},
489 | "title": ""
490 | }
491 | },
492 | "source": [
493 | "In the python magic cell itself, we already have spark session object instantiated,
\n",
494 | "so we can lavishly write spark programs"
495 | ]
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {
500 | "application/vnd.databricks.v1+cell": {
501 | "cellMetadata": {
502 | "byteLimit": 2048000,
503 | "rowLimit": 10000
504 | },
505 | "inputWidgets": {},
506 | "nuid": "1c3c9245-15f8-488a-99f1-c4ea979f607e",
507 | "showTitle": false,
508 | "tableResultSettingsMap": {},
509 | "title": ""
510 | }
511 | },
512 | "source": [
513 | "####How to install additional libraries in this current Python Interpreter using %pip magic command"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 0,
519 | "metadata": {
520 | "application/vnd.databricks.v1+cell": {
521 | "cellMetadata": {
522 | "byteLimit": 2048000,
523 | "rowLimit": 10000
524 | },
525 | "inputWidgets": {},
526 | "nuid": "11ac3360-e8f1-4454-a37d-9ca2f965fcdd",
527 | "showTitle": false,
528 | "tableResultSettingsMap": {},
529 | "title": ""
530 | }
531 | },
532 | "outputs": [],
533 | "source": [
534 | "%pip install pypi"
535 | ]
536 | }
537 | ],
538 | "metadata": {
539 | "application/vnd.databricks.v1+notebook": {
540 | "computePreferences": {
541 | "hardware": {
542 | "accelerator": null,
543 | "gpuPoolId": null,
544 | "memory": null
545 | }
546 | },
547 | "dashboards": [],
548 | "environmentMetadata": null,
549 | "inputWidgetPreferences": null,
550 | "language": "python",
551 | "notebookMetadata": {
552 | "mostRecentlyExecutedCommandWithImplicitDF": {
553 | "commandId": 6631791723001833,
554 | "dataframes": [
555 | "_sqldf"
556 | ]
557 | },
558 | "pythonIndentUnit": 4
559 | },
560 | "notebookName": "1_Explore_Notebooks_magic_commands",
561 | "widgets": {}
562 | },
563 | "language_info": {
564 | "name": "python"
565 | }
566 | },
567 | "nbformat": 4,
568 | "nbformat_minor": 0
569 | }
570 |
--------------------------------------------------------------------------------
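For reference alongside the magic-commands notebook above, here is a hedged sketch of the Python (dbutils / spark.sql) equivalents of the %fs, %sql and %run cells it demonstrates. The volume path and table name are the ones used in the notebook; the dbutils.notebook.run call is illustrative and left commented out.

```python
# Hedged sketch: Python equivalents of the magic commands demonstrated above.
# The volume path and table name are those used in the notebook; results depend on your workspace.

vol = "dbfs:/Volumes/workspace/default/volumewe47_datalake"

# %fs ls  ->  dbutils.fs.ls (returns FileInfo objects)
for f in dbutils.fs.ls(vol):
    print(f.name, f.size)

# %fs cp / rm  ->  dbutils.fs.cp / dbutils.fs.rm
dbutils.fs.cp(f"{vol}/patients.csv", f"{vol}/patients_copy2.csv")
dbutils.fs.rm(f"{vol}/patients_copy2.csv")

# %sql  ->  spark.sql (each call returns a DataFrame)
spark.sql("create table if not exists default.cities2(id int, city string)")
spark.sql("select * from default.cities2").show()

# %run inlines the child notebook into this session, while dbutils.notebook.run (illustrative call
# below) executes it as a separate run and returns its dbutils.notebook.exit value.
# result = dbutils.notebook.run("./4_child_notebook", 60, {"table_name": "cities2"})
```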
/databricks_workouts_2025_WE47/2_Spark_DataFrame_Read_Write_Operations/3-Basic-WriteOps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "e3756d01-4aa7-45d1-bffa-b7e3afd67e3c",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | "#By Knowing this notebook, we can become a eligible \"DATA EGRESS DEVELOPER\""
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "application/vnd.databricks.v1+cell": {
23 | "cellMetadata": {},
24 | "inputWidgets": {},
25 | "nuid": "71713fcb-b659-4e62-bbee-1d3092f13683",
26 | "showTitle": false,
27 | "tableResultSettingsMap": {},
28 | "title": ""
29 | }
30 | },
31 | "source": [
32 | "### Let's get some data we have already..."
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 0,
38 | "metadata": {
39 | "application/vnd.databricks.v1+cell": {
40 | "cellMetadata": {
41 | "byteLimit": 2048000,
42 | "rowLimit": 10000
43 | },
44 | "inputWidgets": {},
45 | "nuid": "059b8ec6-31ff-46f7-a19d-069340242a79",
46 | "showTitle": false,
47 | "tableResultSettingsMap": {},
48 | "title": ""
49 | }
50 | },
51 | "outputs": [],
52 | "source": [
53 | "df1=spark.read.csv(path=\"/Volumes/we47catalog/we47schema/we47_volume/we47_dir1/custs_header\",header=True,inferSchema=True)\n",
54 | "df1.show(2)"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {
60 | "application/vnd.databricks.v1+cell": {
61 | "cellMetadata": {},
62 | "inputWidgets": {},
63 | "nuid": "4de366d1-b620-4eb3-8014-943802d1b67a",
64 | "showTitle": false,
65 | "tableResultSettingsMap": {},
66 | "title": ""
67 | }
68 | },
69 | "source": [
70 | "### Writing the data in Builtin - different file formats & different targets (all targets in this world we can write the data also...)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {
76 | "application/vnd.databricks.v1+cell": {
77 | "cellMetadata": {},
78 | "inputWidgets": {},
79 | "nuid": "265d5989-ca7e-4f2f-8e6a-93e75c23948d",
80 | "showTitle": false,
81 | "tableResultSettingsMap": {},
82 | "title": ""
83 | }
84 | },
85 | "source": [
86 | "####1. Writing in csv format with few basic options listed below\n",
87 | "- header\n",
88 | "- sep\n",
89 | "- mode"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 0,
95 | "metadata": {
96 | "application/vnd.databricks.v1+cell": {
97 | "cellMetadata": {
98 | "byteLimit": 2048000,
99 | "rowLimit": 10000
100 | },
101 | "inputWidgets": {},
102 | "nuid": "a0ae6f6b-aae1-4749-ad83-da37c28e41bc",
103 | "showTitle": false,
104 | "tableResultSettingsMap": {},
105 | "title": ""
106 | }
107 | },
108 | "outputs": [],
109 | "source": [
110 | "\n",
111 | "#We did a schema migration from comma to tilde delimiter\n",
112 | "df1.write.csv(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/csv_targetdata\",header=True,sep='~',mode='overwrite')"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {
118 | "application/vnd.databricks.v1+cell": {
119 | "cellMetadata": {},
120 | "inputWidgets": {},
121 | "nuid": "a29a4365-30ff-49c1-af16-5ddbbaa9b3ca",
122 | "showTitle": false,
123 | "tableResultSettingsMap": {},
124 | "title": ""
125 | }
126 | },
127 | "source": [
128 | "####2. Writing in json format with few basic options listed below\n",
129 | "path
\n",
130 | "mode\n",
131 | "- We did a schema migration and data conversion from csv to json format (ie structued to semi structured format)\n",
132 | "- json - we learn a lot subsequently, \n",
133 | "- what is json - fundamentally it is a dictionary of dictionaries\n",
134 | "- json - java script object notation\n",
135 | "- format - {\"k1\":v1,\"k2\":v2,\"k3\":v2} where key has to be unique & enclosed in double quotes and value can be anything\n",
136 | "- **when to go with json or benifits** - \n",
137 | "- a. If we have data in a semistructure format(variable data format with dynamic schema)\n",
138 | "- b. columns and the types and the order can be different\n",
139 | "- c. json will be provided by the sources if the data is dynamic in nature or if the data is api response in nature.\n",
140 | "- d. json is a efficient data format (serialized/encoded) for performing data exchange between applications via network & good for parsing also.\n",
141 | "- e. json can be used to group or create hierarchy of data in a complex or in a nested format."
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 0,
147 | "metadata": {
148 | "application/vnd.databricks.v1+cell": {
149 | "cellMetadata": {
150 | "byteLimit": 2048000,
151 | "rowLimit": 10000
152 | },
153 | "inputWidgets": {},
154 | "nuid": "f04f4317-79d6-4fe5-98e8-f41727c31739",
155 | "showTitle": false,
156 | "tableResultSettingsMap": {},
157 | "title": ""
158 | }
159 | },
160 | "outputs": [],
161 | "source": [
162 | "df1.write.json(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/json_targetdata\",mode='append')"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {
168 | "application/vnd.databricks.v1+cell": {
169 | "cellMetadata": {},
170 | "inputWidgets": {},
171 | "nuid": "3d9b6ca7-9aa4-4d18-bb18-0d62829dddd2",
172 | "showTitle": false,
173 | "tableResultSettingsMap": {},
174 | "title": ""
175 | }
176 | },
177 | "source": [
178 | "####3.Serialization & Deserialization File formats (Brainy File formats)\n",
179 | "What are the (builtin) serialized file formats we are going to learn?\n",
180 | "orc\n",
181 | "parquet\n",
182 | "delta(databricks properatory)\n",
183 | "\n",
184 | "- We did a schema migration and data conversion from csv/json to serialized data format (ie structued to sturctured(internall binary unstructured) format)\n",
185 | "- We learn/use a lot/heavily subsequently, \n",
186 | "- what is serialized - fundamentally they are intelligent/encoded/serialized/binary data formats applied with lot of optimization & space reduction strategies..\n",
187 | "- orc - optimized row column format\n",
188 | "- parquet - tiled data format\n",
189 | "- delta(databricks properatory) enriched parquet format - Delta (modified) operations can be performed\n",
190 | "- format - serialized/encoded , we can't see with mere eyes, only some library is used deserialized/decoded data can be accessed as structured data\n",
191 | "- **when to go with serialized or benifits** - \n",
192 | "- a. For storage benifits for eg. orc will save 65+% of space for eg. if i store 1gb data it occupy 350 space, with compression it can improved more...\n",
193 | "- b. For processing optimization. Orc/parquet/delta will provide the required data alone if you query using Pushdown optimization .\n",
194 | "- c. Interoperability feature - this data format can be understandable in multiple environments for eg. bigquery can parse this data.\n",
195 | "- d. Secured\n",
196 | "- **In the projects/environments when to use what fileformats - we learn in detail later..."
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 0,
202 | "metadata": {
203 | "application/vnd.databricks.v1+cell": {
204 | "cellMetadata": {
205 | "byteLimit": 2048000,
206 | "rowLimit": 10000
207 | },
208 | "inputWidgets": {},
209 | "nuid": "f4fb6848-a995-4977-a1d0-ff547c686cd5",
210 | "showTitle": false,
211 | "tableResultSettingsMap": {},
212 | "title": ""
213 | }
214 | },
215 | "outputs": [],
216 | "source": [
217 | "df1.write.orc(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/orc_targetdata\",mode='ignore')#serialization\n",
218 | "spark.read.orc(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/orc_targetdata\").show(2)#deserialization"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 0,
224 | "metadata": {
225 | "application/vnd.databricks.v1+cell": {
226 | "cellMetadata": {
227 | "byteLimit": 2048000,
228 | "rowLimit": 10000
229 | },
230 | "inputWidgets": {},
231 | "nuid": "aed8a769-b528-44ee-879f-b8c145e72c80",
232 | "showTitle": false,
233 | "tableResultSettingsMap": {},
234 | "title": ""
235 | }
236 | },
237 | "outputs": [],
238 | "source": [
239 | "df1.write.option(\"maxRecordsPerFile\",1).parquet(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/parquet_targetdata2\",mode='error',compression='gzip')"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 0,
245 | "metadata": {
246 | "application/vnd.databricks.v1+cell": {
247 | "cellMetadata": {
248 | "byteLimit": 2048000,
249 | "rowLimit": 10000
250 | },
251 | "inputWidgets": {},
252 | "nuid": "27ff59ea-6793-4d25-9896-a14460d241a0",
253 | "showTitle": false,
254 | "tableResultSettingsMap": {},
255 | "title": ""
256 | }
257 | },
258 | "outputs": [],
259 | "source": [
260 | "#df1.write.delta(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/delta_targetdata\")\n",
261 | "df1.write.format(\"delta\").save(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/delta_targetdata\",mode='overwrite')\n",
262 | "spark.read.format(\"delta\").load(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/delta_targetdata\").show(2)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 0,
268 | "metadata": {
269 | "application/vnd.databricks.v1+cell": {
270 | "cellMetadata": {
271 | "byteLimit": 2048000,
272 | "rowLimit": 10000
273 | },
274 | "inputWidgets": {},
275 | "nuid": "2b1d83f7-b903-42f8-aa39-3094dba9b94d",
276 | "showTitle": false,
277 | "tableResultSettingsMap": {},
278 | "title": ""
279 | }
280 | },
281 | "outputs": [],
282 | "source": [
283 | "#What is the default format of file will be generated with, when we don't mention the format explicitly?\n",
284 | "#It is Parquet(Delta)\n",
285 | "df1.write.save(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/what_targetdata\",mode='overwrite')"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {
291 | "application/vnd.databricks.v1+cell": {
292 | "cellMetadata": {},
293 | "inputWidgets": {},
294 | "nuid": "5bb4ab26-481f-4b00-a5cb-675b105863d2",
295 | "showTitle": false,
296 | "tableResultSettingsMap": {},
297 | "title": ""
298 | }
299 | },
300 | "source": [
301 | "####4.Table Load Operations - Building LAKEHOUSE ON TOP OF DATALAKE\n",
302 | "Can we do SQL operations directly on the tables like a database or datawarehouse? or Can we build a Lakehouse in Databricks?\n",
303 | "- We learn/use a lot/heavily subsequently, \n",
304 | "- what is Lakehouse - A SQL/Datawarehouse/Query layer on top of the Datalake is called Lakehouse\n",
305 | "- We have different lakehouses which we are going to learn further - \n",
306 | "1. delta tables (lakehouse) in databricks\n",
307 | "2. hive in onprem\n",
308 | "3. bigquery in GCP\n",
309 | "4. synapse in azure\n",
310 | "5. athena in aws\n",
311 | "- **when to go with lakehouse** - \n",
312 | "- a. Transformation\n",
313 | "- b. Analysis/Analytics\n",
314 | "- c. AI/BI\n",
315 | "- d. Literally we are going to learn SQL & Advanced SQL"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 0,
321 | "metadata": {
322 | "application/vnd.databricks.v1+cell": {
323 | "cellMetadata": {
324 | "byteLimit": 2048000,
325 | "rowLimit": 10000
326 | },
327 | "inputWidgets": {},
328 | "nuid": "ad625815-1e9d-4917-b87a-8e8d756bee72",
329 | "showTitle": false,
330 | "tableResultSettingsMap": {},
331 | "title": ""
332 | }
333 | },
334 | "outputs": [],
335 | "source": [
336 | "#Out of 18 write.functions, we know 9 functions, lets go with few more basic functions (xml, saveAsTable,InsertInto)\n",
337 | "df1.write.saveAsTable(\"default.customertbl\",mode='overwrite')#default delta format\n",
338 | "spark.read.table(\"default.customertbl\").show(2)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 0,
344 | "metadata": {
345 | "application/vnd.databricks.v1+cell": {
346 | "cellMetadata": {
347 | "byteLimit": 2048000,
348 | "rowLimit": 10000
349 | },
350 | "inputWidgets": {},
351 | "nuid": "9f323284-afde-43f8-9a7c-c0838afa3391",
352 | "showTitle": false,
353 | "tableResultSettingsMap": {},
354 | "title": ""
355 | }
356 | },
357 | "outputs": [],
358 | "source": [
359 | "#Notes Unlike :meth:`DataFrameWriter.saveAsTable`, :meth:`DataFrameWriter.insertInto` ignores the column names and just uses position-based resolution.\n",
360 | "# table has to be present already\n",
361 | "# this will be used for some minimal data write operation hence preferred function is saveAsTable()\n",
362 | "df1.write.insertInto(\"customertbl\",overwrite=True)"
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {
368 | "application/vnd.databricks.v1+cell": {
369 | "cellMetadata": {},
370 | "inputWidgets": {},
371 | "nuid": "7586d9b9-5766-44e9-9c4a-51c1805f316c",
372 | "showTitle": false,
373 | "tableResultSettingsMap": {},
374 | "title": ""
375 | }
376 | },
377 | "source": [
378 | "####5. XML Format - Semi structured data format (most of the json features can be applied in xml also, but in DE world not so famous like json)\n",
379 | "- Used rarely on demand (by certain target/source systems eg. mainframes)\n",
380 | "- Can be related with json, but not so much efficient like json\n",
381 | "- Databricks provides xml as a inbuild function"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 0,
387 | "metadata": {
388 | "application/vnd.databricks.v1+cell": {
389 | "cellMetadata": {
390 | "byteLimit": 2048000,
391 | "rowLimit": 10000
392 | },
393 | "inputWidgets": {},
394 | "nuid": "82e3bc79-afad-4cbb-a84a-1f98f0f06c1f",
395 | "showTitle": false,
396 | "tableResultSettingsMap": {},
397 | "title": ""
398 | }
399 | },
400 | "outputs": [],
401 | "source": [
402 | "df1.write.xml(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/xml_targetdata\",rowTag='customer',mode='overwrite')"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 0,
408 | "metadata": {
409 | "application/vnd.databricks.v1+cell": {
410 | "cellMetadata": {
411 | "byteLimit": 2048000,
412 | "rowLimit": 10000
413 | },
414 | "inputWidgets": {},
415 | "nuid": "a1c88733-687d-4f52-b864-1eec9a7eb87e",
416 | "showTitle": false,
417 | "tableResultSettingsMap": {},
418 | "title": ""
419 | }
420 | },
421 | "outputs": [],
422 | "source": [
423 | "spark.read.xml(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/xml_targetdata\",rowTag='customer').show(2)"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {
429 | "application/vnd.databricks.v1+cell": {
430 | "cellMetadata": {},
431 | "inputWidgets": {},
432 | "nuid": "7abb6500-0b7e-4339-b814-3e356c78d7ce",
433 | "showTitle": false,
434 | "tableResultSettingsMap": {},
435 | "title": ""
436 | }
437 | },
438 | "source": [
439 | "### Modes in Writing\n",
440 | "1. **Append** - Adds the new data to the existing data. It does not overwrite anything.\n",
441 | "2. **Overwrite** - Replaces the existing data entirely at the destination.\n",
442 | "3. **ErrorIfexist**(default) - Throws an error if data already exists at the destination.\n",
443 | "4. **Ignore** - Skips the write operation if data already exists at the destination."
444 | ]
445 | }
446 | ],
447 | "metadata": {
448 | "application/vnd.databricks.v1+notebook": {
449 | "computePreferences": null,
450 | "dashboards": [],
451 | "environmentMetadata": {
452 | "base_environment": "",
453 | "environment_version": "3"
454 | },
455 | "inputWidgetPreferences": null,
456 | "language": "python",
457 | "notebookMetadata": {
458 | "mostRecentlyExecutedCommandWithImplicitDF": {
459 | "commandId": 7347217471020383,
460 | "dataframes": [
461 | "_sqldf"
462 | ]
463 | },
464 | "pythonIndentUnit": 4
465 | },
466 | "notebookName": "3-Basic-WriteOps",
467 | "widgets": {}
468 | },
469 | "language_info": {
470 | "name": "python"
471 | }
472 | },
473 | "nbformat": 4,
474 | "nbformat_minor": 0
475 | }
476 |
--------------------------------------------------------------------------------
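A compact sketch tying together the write modes and formats covered in 3-Basic-WriteOps above; df1 is the customer DataFrame read at the top of that notebook, the base volume path matches the one it uses, and the individual target folder names are illustrative.

```python
# Hedged sketch of the write modes and formats covered above; df1 is the customer DataFrame
# read at the top of the notebook, and the individual target folder names are illustrative.

base = "/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources"

# Modes: append adds files, overwrite replaces the target,
# error/errorifexists (the default) fails if the target exists, ignore silently skips the write.
df1.write.mode("overwrite").option("header", True).option("sep", "~").csv(f"{base}/csv_demo")
df1.write.mode("append").json(f"{base}/json_demo")
df1.write.mode("ignore").orc(f"{base}/orc_demo")
df1.write.mode("error").option("compression", "gzip").parquet(f"{base}/parquet_demo")

# Delta = the same Parquet data files plus a _delta_log transaction log directory
df1.write.format("delta").mode("overwrite").save(f"{base}/delta_demo")
print([f.name for f in dbutils.fs.ls(f"{base}/delta_demo")])  # expect part-*.parquet and _delta_log/

# Lakehouse table loads: saveAsTable resolves columns by name, insertInto by position
df1.write.mode("overwrite").saveAsTable("default.customertbl")
df1.write.insertInto("default.customertbl", overwrite=True)
```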
/databricks_workouts_2025/2_Spark_DataFrame_Read_Write_Operations/read_write_usecases.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "8ba86a20-5a3a-4130-86f5-e312f4a7901b",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 | "#Telecom Domain Read & Write Ops Assignment - Building Datalake & Lakehouse\n",
17 | "This notebook contains assignments to practice Spark read options and Databricks volumes.
\n",
18 | "Sections: Sample data creation, Catalog & Volume creation, Copying data into Volumes, Path glob/recursive reads, toDF() column renaming variants, inferSchema/header/separator experiments, and exercises.
"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "application/vnd.databricks.v1+cell": {
25 | "cellMetadata": {},
26 | "inputWidgets": {},
27 | "nuid": "841c7ed8-ef18-486a-8187-07685e499b84",
28 | "showTitle": false,
29 | "tableResultSettingsMap": {},
30 | "title": ""
31 | }
32 | },
33 | "source": [
34 | "\n",
35 | ""
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {
41 | "application/vnd.databricks.v1+cell": {
42 | "cellMetadata": {},
43 | "inputWidgets": {},
44 | "nuid": "d4aa0a44-8cd6-41cf-921d-abb5ff67615b",
45 | "showTitle": false,
46 | "tableResultSettingsMap": {},
47 | "title": ""
48 | }
49 | },
50 | "source": [
51 | "##First Import all required libraries & Create spark session object"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {
57 | "application/vnd.databricks.v1+cell": {
58 | "cellMetadata": {},
59 | "inputWidgets": {},
60 | "nuid": "d0b67823-2e4e-45e2-aa25-80550a3ac580",
61 | "showTitle": false,
62 | "tableResultSettingsMap": {},
63 | "title": ""
64 | }
65 | },
66 | "source": [
67 | "##1. Write SQL statements to create:\n",
68 | "1. A catalog named telecom_catalog_assign\n",
69 | "2. A schema landing_zone\n",
70 | "3. A volume landing_vol\n",
71 | "4. Using dbutils.fs.mkdirs, create folders:
\n",
72 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/\n",
73 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/\n",
74 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/\n",
75 | "5. Explain the difference between (Just google and understand why we are going for volume concept for prod ready systems):
\n",
76 | "a. Volume vs DBFS/FileStore
\n",
77 | "b. Why production teams prefer Volumes for regulated data
"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {
83 | "application/vnd.databricks.v1+cell": {
84 | "cellMetadata": {},
85 | "inputWidgets": {},
86 | "nuid": "26d8bd3d-b575-448b-ae22-8173d15ca671",
87 | "showTitle": false,
88 | "tableResultSettingsMap": {},
89 | "title": ""
90 | }
91 | },
92 | "source": [
93 | "##Data files to use in this usecase:\n",
94 | "customer_csv = '''\n",
95 | "101,Arun,31,Chennai,PREPAID\n",
96 | "102,Meera,45,Bangalore,POSTPAID\n",
97 | "103,Irfan,29,Hyderabad,PREPAID\n",
98 | "104,Raj,52,Mumbai,POSTPAID\n",
99 | "105,,27,Delhi,PREPAID\n",
100 | "106,Sneha,abc,Pune,PREPAID\n",
101 | "'''\n",
102 | "\n",
103 | "usage_tsv = '''customer_id\\tvoice_mins\\tdata_mb\\tsms_count\n",
104 | "101\\t320\\t1500\\t20\n",
105 | "102\\t120\\t4000\\t5\n",
106 | "103\\t540\\t600\\t52\n",
107 | "104\\t45\\t200\\t2\n",
108 | "105\\t0\\t0\\t0\n",
109 | "'''\n",
110 | "\n",
111 | "tower_logs_region1 = '''event_id|customer_id|tower_id|signal_strength|timestamp\n",
112 | "5001|101|TWR01|-80|2025-01-10 10:21:54\n",
113 | "5004|104|TWR05|-75|2025-01-10 11:01:12\n",
114 | "'''"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "application/vnd.databricks.v1+cell": {
121 | "cellMetadata": {},
122 | "inputWidgets": {},
123 | "nuid": "9540d2e2-2562-4be7-897f-0a7d57adaa72",
124 | "showTitle": false,
125 | "tableResultSettingsMap": {},
126 | "title": ""
127 | }
128 | },
129 | "source": [
130 | "##2. Filesystem operations\n",
131 | "1. Write dbutils.fs code to copy the above datasets into your created Volume folders:\n",
132 | "Customer → /Volumes/.../customer/\n",
133 | "Usage → /Volumes/.../usage/\n",
134 | "Tower (region-based) → /Volumes/.../tower/region1/ and /Volumes/.../tower/region2/\n",
135 | "\n",
136 | "2. Write a command to validate whether files were successfully copied"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {
142 | "application/vnd.databricks.v1+cell": {
143 | "cellMetadata": {},
144 | "inputWidgets": {},
145 | "nuid": "8767735b-24d3-428a-ad12-ae821903e2ce",
146 | "showTitle": false,
147 | "tableResultSettingsMap": {},
148 | "title": ""
149 | }
150 | },
151 | "source": [
152 | "##3. Spark Directory Read Use Cases\n",
153 | "1. Read all tower logs using:\n",
154 | "Path glob filter (example: *.csv)\n",
155 | "Multiple paths input\n",
156 | "Recursive lookup\n",
157 | "\n",
158 | "2. Demonstrate these 3 reads separately:\n",
159 | "Using pathGlobFilter\n",
160 | "Using list of paths in spark.read.csv([path1, path2])\n",
161 | "Using .option(\"recursiveFileLookup\",\"true\")\n",
162 | "\n",
163 | "3. Compare the outputs and understand when each should be used."
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {
169 | "application/vnd.databricks.v1+cell": {
170 | "cellMetadata": {},
171 | "inputWidgets": {},
172 | "nuid": "9f7147c1-5d58-47e1-84fe-7ebd26a217b9",
173 | "showTitle": false,
174 | "tableResultSettingsMap": {},
175 | "title": ""
176 | }
177 | },
178 | "source": [
179 | "##4. Schema Inference, Header, and Separator\n",
180 | "1. Try the Customer, Usage files with the option and options using read.csv and format function:
\n",
181 | "header=false, inferSchema=false
\n",
182 | "or
\n",
183 | "header=true, inferSchema=true
\n",
184 | "2. Write a note on What changed when we use header or inferSchema with true/false?
\n",
185 | "3. How schema inference handled “abc” in age?
"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "application/vnd.databricks.v1+cell": {
192 | "cellMetadata": {},
193 | "inputWidgets": {},
194 | "nuid": "15d8dad0-bc63-47f1-9a90-72837cba6c4f",
195 | "showTitle": false,
196 | "tableResultSettingsMap": {},
197 | "title": ""
198 | }
199 | },
200 | "source": [
201 | "##5. Column Renaming Usecases\n",
202 | "1. Apply column names using string using toDF function for customer data\n",
203 | "2. Apply column names and datatype using the schema function for usage data\n",
204 | "3. Apply column names and datatype using the StructType with IntegerType, StringType, TimestampType and other classes for towers data "
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {
210 | "application/vnd.databricks.v1+cell": {
211 | "cellMetadata": {},
212 | "inputWidgets": {},
213 | "nuid": "6e1d6d88-7bcc-4548-a0d1-15d37f6fc0be",
214 | "showTitle": false,
215 | "tableResultSettingsMap": {},
216 | "title": ""
217 | }
218 | },
219 | "source": [
220 | "## Spark Write Operations using \n",
221 | "- csv, json, orc, parquet, delta, saveAsTable, insertInto, xml with different write mode, header and sep options"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {
227 | "application/vnd.databricks.v1+cell": {
228 | "cellMetadata": {},
229 | "inputWidgets": {},
230 | "nuid": "8e34c3bc-962d-438d-a1b6-ac27d2da6608",
231 | "showTitle": false,
232 | "tableResultSettingsMap": {},
233 | "title": ""
234 | }
235 | },
236 | "source": [
237 | "##6. Write Operations (Data Conversion/Schema migration) – CSV Format Usecases\n",
238 | "1. Write customer data into CSV format using overwrite mode\n",
239 | "2. Write usage data into CSV format using append mode\n",
240 | "3. Write tower data into CSV format with header enabled and custom separator (|)\n",
241 | "4. Read the tower data in a dataframe and show only 5 rows.\n",
242 | "5. Download the file into local from the catalog volume location and see the data of any of the above files opening in a notepad++."
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {
248 | "application/vnd.databricks.v1+cell": {
249 | "cellMetadata": {},
250 | "inputWidgets": {},
251 | "nuid": "34158cf6-dd7f-40d6-9969-ed76710540a4",
252 | "showTitle": false,
253 | "tableResultSettingsMap": {},
254 | "title": ""
255 | }
256 | },
257 | "source": [
258 | "##7. Write Operations (Data Conversion/Schema migration)– JSON Format Usecases\n",
259 | "1. Write customer data into JSON format using overwrite mode\n",
260 | "2. Write usage data into JSON format using append mode and snappy compression format\n",
261 | "3. Write tower data into JSON format using ignore mode and observe the behavior of this mode\n",
262 | "4. Read the tower data in a dataframe and show only 5 rows.\n",
263 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++."
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {
269 | "application/vnd.databricks.v1+cell": {
270 | "cellMetadata": {},
271 | "inputWidgets": {},
272 | "nuid": "26f2ba69-3cde-4ec6-8945-e4ef9f7bb109",
273 | "showTitle": false,
274 | "tableResultSettingsMap": {},
275 | "title": ""
276 | }
277 | },
278 | "source": [
279 | "##8. Write Operations (Data Conversion/Schema migration) – Parquet Format Usecases\n",
280 | "1. Write customer data into Parquet format using overwrite mode and in a gzip format\n",
281 | "2. Write usage data into Parquet format using error mode\n",
282 | "3. Write tower data into Parquet format with gzip compression option\n",
283 | "4. Read the usage data in a dataframe and show only 5 rows.\n",
284 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++."
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {
290 | "application/vnd.databricks.v1+cell": {
291 | "cellMetadata": {},
292 | "inputWidgets": {},
293 | "nuid": "b41c794f-5cfc-4aeb-a599-e6d4a47a0f3f",
294 | "showTitle": false,
295 | "tableResultSettingsMap": {},
296 | "title": ""
297 | }
298 | },
299 | "source": [
300 | "##9. Write Operations (Data Conversion/Schema migration) – Orc Format Usecases\n",
301 | "1. Write customer data into ORC format using overwrite mode\n",
302 | "2. Write usage data into ORC format using append mode\n",
303 | "3. Write tower data into ORC format and see the output file structure\n",
304 | "4. Read the usage data in a dataframe and show only 5 rows.\n",
305 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++."
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "application/vnd.databricks.v1+cell": {
312 | "cellMetadata": {},
313 | "inputWidgets": {},
314 | "nuid": "35761315-0b0f-46ff-9c3d-c0405bce7b62",
315 | "showTitle": false,
316 | "tableResultSettingsMap": {},
317 | "title": ""
318 | }
319 | },
320 | "source": [
321 | "##10. Write Operations (Data Conversion/Schema migration) – Delta Format Usecases\n",
322 | "1. Write customer data into Delta format using overwrite mode\n",
323 | "2. Write usage data into Delta format using append mode\n",
324 | "3. Write tower data into Delta format and see the output file structure\n",
325 | "4. Read the usage data in a dataframe and show only 5 rows.\n",
326 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++.\n",
327 | "6. Compare the parquet location and delta location and try to understand what is the differentiating factor, as both are parquet files only."
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {
333 | "application/vnd.databricks.v1+cell": {
334 | "cellMetadata": {},
335 | "inputWidgets": {},
336 | "nuid": "e6dd0890-02bd-4acd-b837-daceb256c706",
337 | "showTitle": false,
338 | "tableResultSettingsMap": {},
339 | "title": ""
340 | }
341 | },
342 | "source": [
343 | "##11. Write Operations (Lakehouse Usecases) – Delta table Usecases\n",
344 | "1. Write customer data using saveAsTable() as a managed table\n",
345 | "2. Write usage data using saveAsTable() with overwrite mode\n",
346 | "3. Drop the managed table and verify data removal\n",
347 | "4. Go and check the table overview and realize it is in delta format in the Catalog.\n",
348 | "5. Use spark.read.sql to write some simple queries on the above tables created.\n"
349 | ]
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "metadata": {
354 | "application/vnd.databricks.v1+cell": {
355 | "cellMetadata": {},
356 | "inputWidgets": {},
357 | "nuid": "1aac447b-690b-4562-99dd-0ce096e9ad55",
358 | "showTitle": false,
359 | "tableResultSettingsMap": {},
360 | "title": ""
361 | }
362 | },
363 | "source": [
364 | "##12. Write Operations (Lakehouse Usecases) – Delta table Usecases\n",
365 | "1. Write customer data using insertInto() in a new table and find the behavior\n",
366 | "2. Write usage data using insertTable() with overwrite mode"
367 | ]
368 | },
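{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, hypothetical sketch for the insertInto() use case above; the path and table names are assumptions.\n",
"# insertInto() requires the target table to already exist and maps columns by position, not by name.\n",
"usage_df = spark.read.csv(\"/Volumes/workspace/default/volume_datalake/usage\", header=True, inferSchema=True)\n",
"usage_df.write.saveAsTable(\"workspace.default.lh_usage\", mode=\"ignore\")   # create the target table first (skipped if it already exists)\n",
"usage_df.write.insertInto(\"workspace.default.lh_usage\", overwrite=True)   # positional load with overwrite"
]
},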
369 | {
370 | "cell_type": "markdown",
371 | "metadata": {
372 | "application/vnd.databricks.v1+cell": {
373 | "cellMetadata": {},
374 | "inputWidgets": {},
375 | "nuid": "e3c4bce3-4bd3-4db6-a074-02bb24c5f91a",
376 | "showTitle": false,
377 | "tableResultSettingsMap": {},
378 | "title": ""
379 | }
380 | },
381 | "source": [
382 |     "##13. Write Operations (Data Conversion/Schema migration) – XML Format Usecases\n",
383 | "1. Write customer data into XML format using rowTag as cust\n",
384 | "2. Write usage data into XML format using overwrite mode with the rowTag as usage\n",
385 |     "3. Download the XML data, open the file in Notepad++ and see what the XML file looks like (a minimal sketch follows in the next cell)."
386 | ]
387 | },
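{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, hypothetical sketch for the XML use case above; the dataframe and paths are assumptions.\n",
"# rowTag controls the XML element name that wraps each row.\n",
"cust_df = spark.read.csv(\"/Volumes/workspace/default/volume_datalake/custs_header\", header=True, inferSchema=True)\n",
"cust_df.write.xml(\"/Volumes/workspace/default/volume_datalake/target/xmlout\", mode=\"overwrite\", rowTag=\"cust\")"
]
},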
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {
391 | "application/vnd.databricks.v1+cell": {
392 | "cellMetadata": {},
393 | "inputWidgets": {},
394 | "nuid": "83e2fe69-9352-4ec9-bf70-15d760c89aa3",
395 | "showTitle": false,
396 | "tableResultSettingsMap": {},
397 | "title": ""
398 | }
399 | },
400 | "source": [
401 | "##14. Compare all the downloaded files (csv, json, orc, parquet, delta and xml) \n",
402 |     "1. Capture the size occupied by each of these file formats and list the formats in order of size, from smallest to largest (a minimal sketch for measuring sizes follows in the next cell)."
403 | ]
404 | },
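{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, hypothetical sketch for measuring the size of each output folder; the paths are assumptions.\n",
"# dbutils.fs.ls() returns FileInfo objects whose size attribute is in bytes.\n",
"for fmt in [\"csvout\", \"jsonout\", \"orcout\", \"parquetout\", \"deltaout\", \"xmlout\"]:\n",
"    files = dbutils.fs.ls(f\"/Volumes/workspace/default/volume_datalake/target/{fmt}\")\n",
"    print(fmt, sum(f.size for f in files), \"bytes\")"
]
},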
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {
408 | "application/vnd.databricks.v1+cell": {
409 | "cellMetadata": {},
410 | "inputWidgets": {},
411 | "nuid": "7d4d5ddc-359f-48fe-bac4-c40271b48163",
412 | "showTitle": false,
413 | "tableResultSettingsMap": {},
414 | "title": ""
415 | }
416 | },
417 | "source": [
418 |     "###15. Try permutations and combinations of Schema Migration & Data Conversion operations, such as...\n",
419 |     "1. Read any one of the above ORC outputs into a dataframe and write it to DBFS in Parquet format\n",
420 |     "2. Read any one of the above Parquet outputs into a dataframe and write it to DBFS in Delta format\n",
421 |     "3. Read any one of the above Delta outputs into a dataframe and write it to DBFS in XML format\n",
422 |     "4. Read any one of the above Delta tables into a dataframe and write it to DBFS in JSON format\n",
423 |     "5. Read any one of the above Delta tables into a dataframe and write it to another table (a minimal sketch follows in the next cell)"
424 | ]
425 | },
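{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, hypothetical sketch of one such conversion (ORC -> Parquet); the paths are assumptions.\n",
"orc_df = spark.read.orc(\"/Volumes/workspace/default/volume_datalake/target/orcout\")\n",
"orc_df.write.parquet(\"/Volumes/workspace/default/volume_datalake/target/orc_to_parquet\", mode=\"overwrite\")\n",
"# The same pattern applies for parquet -> delta, delta -> xml/json, or table -> table, e.g.:\n",
"# spark.read.parquet(src_path).write.format(\"delta\").save(tgt_path)\n",
"# spark.read.table(\"workspace.default.some_table\").write.saveAsTable(\"workspace.default.some_table_copy\")"
]
},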
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {
429 | "application/vnd.databricks.v1+cell": {
430 | "cellMetadata": {},
431 | "inputWidgets": {},
432 | "nuid": "3d6e39ec-752d-4183-9656-2b6d7938922d",
433 | "showTitle": false,
434 | "tableResultSettingsMap": {},
435 | "title": ""
436 | }
437 | },
438 | "source": [
439 |     "##16. As a final exercise, write a one/two-liner on...\n",
440 |     "1. When to use CSV / its benefits\n",
441 |     "2. When to use JSON / its benefits\n",
442 |     "3. When to use ORC / its benefits\n",
443 |     "4. When to use Parquet / its benefits\n",
444 |     "5. When to use Delta / its benefits\n",
445 |     "6. When to use XML / its benefits\n",
446 |     "7. When to use Delta tables / their benefits\n"
447 | ]
448 | }
449 | ],
450 | "metadata": {
451 | "application/vnd.databricks.v1+notebook": {
452 | "computePreferences": null,
453 | "dashboards": [],
454 | "environmentMetadata": {
455 | "base_environment": "",
456 | "environment_version": "4"
457 | },
458 | "inputWidgetPreferences": null,
459 | "language": "python",
460 | "notebookMetadata": {
461 | "mostRecentlyExecutedCommandWithImplicitDF": {
462 | "commandId": -1,
463 | "dataframes": [
464 | "_sqldf"
465 | ]
466 | },
467 | "pythonIndentUnit": 4
468 | },
469 | "notebookName": "read_write_usecases",
470 | "widgets": {}
471 | },
472 | "language_info": {
473 | "name": "python"
474 | }
475 | },
476 | "nbformat": 4,
477 | "nbformat_minor": 0
478 | }
479 |
--------------------------------------------------------------------------------
/databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/1_Explore_Notebooks_magic_commands.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {
8 | "byteLimit": 2048000,
9 | "rowLimit": 10000
10 | },
11 | "inputWidgets": {},
12 | "nuid": "2c477351-9470-4b26-8f73-6f967c37729e",
13 | "showTitle": false,
14 | "tableResultSettingsMap": {},
15 | "title": ""
16 | }
17 | },
18 | "source": [
19 | "#Welcome to Inceptez Technologies\n",
20 |     "Let us understand creating notebooks & magic commands\n",
21 | "https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png\n",
22 | ""
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {
28 | "application/vnd.databricks.v1+cell": {
29 | "cellMetadata": {
30 | "byteLimit": 2048000,
31 | "rowLimit": 10000
32 | },
33 | "inputWidgets": {},
34 | "nuid": "3ae38262-0e89-4a3b-9130-322375328fe4",
35 | "showTitle": false,
36 | "tableResultSettingsMap": {},
37 | "title": ""
38 | }
39 | },
40 | "source": [
41 |     "##Let us first learn about Magic Commands\n",
42 | "**Important Magic Commands**\n",
43 | "- %md: allows you to write markdown text to design the notebook.\n",
44 | "- %run: runs a Python file or a notebook.\n",
45 | "- %sh: executes shell commands on the cluster nodes.\n",
46 | "- %fs: allows you to interact with the Databricks file system (Datalake command)\n",
47 | "- %sql: allows you to run Spark SQL/HQL queries.\n",
48 | "- %python: switches the notebook context to Python.\n",
49 | "- %pip: allows you to install Python packages.\n",
50 | "\n",
51 |     "**Less Important Magic Commands (we learn a few of these only where there is a Cloud (Azure) dependency)**\n",
52 | "- %scala: switches the notebook context to Scala.\n",
53 | "- %r: switches the notebook context to R.\n",
54 | "- %lsmagic: lists all the available magic commands.\n",
55 | "- %config: allows you to set configuration options for the notebook.\n",
56 | "- %load: loads the contents of a file into a cell.\n",
57 | "- %who: lists all the variables in the current scope."
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {
63 | "application/vnd.databricks.v1+cell": {
64 | "cellMetadata": {
65 | "byteLimit": 2048000,
66 | "rowLimit": 10000
67 | },
68 | "inputWidgets": {},
69 | "nuid": "b35163da-292b-4c60-b6a9-a62521e22343",
70 | "showTitle": false,
71 | "tableResultSettingsMap": {},
72 | "title": ""
73 | }
74 | },
75 | "source": [
76 | "####How to call a notebook from the current notebook using %run magic command"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 0,
82 | "metadata": {
83 | "application/vnd.databricks.v1+cell": {
84 | "cellMetadata": {
85 | "byteLimit": 2048000,
86 | "rowLimit": 10000
87 | },
88 | "inputWidgets": {},
89 | "nuid": "cb391d1b-fb52-49ab-a463-2849a7f60fea",
90 | "showTitle": false,
91 | "tableResultSettingsMap": {},
92 | "title": ""
93 | }
94 | },
95 | "outputs": [],
96 | "source": [
97 | "%run \"/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook\""
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {
103 | "application/vnd.databricks.v1+cell": {
104 | "cellMetadata": {
105 | "byteLimit": 2048000,
106 | "rowLimit": 10000
107 | },
108 | "inputWidgets": {},
109 | "nuid": "7b7260e3-4612-4e86-ac30-51e7c91fe669",
110 | "showTitle": false,
111 | "tableResultSettingsMap": {},
112 | "title": ""
113 | }
114 | },
115 | "source": [
116 |     "####How to run Linux commands inside a notebook using the %sh magic command"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 0,
122 | "metadata": {
123 | "application/vnd.databricks.v1+cell": {
124 | "cellMetadata": {
125 | "byteLimit": 2048000,
126 | "rowLimit": 10000
127 | },
128 | "inputWidgets": {},
129 | "nuid": "7f274612-5fb2-4589-86c6-d236abeb9aba",
130 | "showTitle": false,
131 | "tableResultSettingsMap": {},
132 | "title": ""
133 | }
134 | },
135 | "outputs": [],
136 | "source": [
137 | "%sh\n",
138 | "ls -l /databricks-datasets/airlines\n",
139 | "head -1 /databricks-datasets/airlines/part-01902\n",
140 | "echo \"head completed\"\n",
141 | "tail -1 /databricks-datasets/airlines/part-01902"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {
147 | "application/vnd.databricks.v1+cell": {
148 | "cellMetadata": {},
149 | "inputWidgets": {},
150 | "nuid": "85ffa4e3-dc14-4ea4-bab4-94151bc22f9d",
151 | "showTitle": false,
152 | "tableResultSettingsMap": {},
153 | "title": ""
154 | }
155 | },
156 | "source": [
157 |     "We are going to use Databricks Unity Catalog (we have not learned about it yet)\n",
158 |     "to create tables and files under a volume (catalog/schema/volume/folder/files)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 0,
164 | "metadata": {
165 | "application/vnd.databricks.v1+cell": {
166 | "cellMetadata": {
167 | "byteLimit": 2048000,
168 | "implicitDf": true,
169 | "rowLimit": 10000
170 | },
171 | "inputWidgets": {},
172 | "nuid": "85719c48-7508-45d6-b879-d5ab1e956bcc",
173 | "showTitle": false,
174 | "tableResultSettingsMap": {},
175 | "title": ""
176 | }
177 | },
178 | "outputs": [],
179 | "source": [
180 | "%sql\n",
181 | "CREATE VOLUME IF NOT EXISTS workspace.default.volumewe47_datalake;"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {
187 | "application/vnd.databricks.v1+cell": {
188 | "cellMetadata": {
189 | "byteLimit": 2048000,
190 | "rowLimit": 10000
191 | },
192 | "inputWidgets": {},
193 | "nuid": "97a1ed02-dc2f-4eac-9b34-ed9237d3cd20",
194 | "showTitle": false,
195 | "tableResultSettingsMap": {},
196 | "title": ""
197 | }
198 | },
199 | "source": [
200 |     "####Upload some sample data into the volume (Catalog -> My Organization -> Workspace -> Default -> Volumes)\n####How to run DBFS (Hadoop-like) FS commands inside a notebook using the %fs magic command, copying the uploaded data from the upload volume into another volume"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 0,
206 | "metadata": {
207 | "application/vnd.databricks.v1+cell": {
208 | "cellMetadata": {
209 | "byteLimit": 2048000,
210 | "rowLimit": 10000
211 | },
212 | "inputWidgets": {},
213 | "nuid": "6947aa56-af9f-42de-8262-b7bc680203f7",
214 | "showTitle": false,
215 | "tableResultSettingsMap": {},
216 | "title": ""
217 | }
218 | },
219 | "outputs": [],
220 | "source": [
221 | "%fs ls \"dbfs:///Volumes/workspace/default/volumewe47_datalake\""
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 0,
227 | "metadata": {
228 | "application/vnd.databricks.v1+cell": {
229 | "cellMetadata": {
230 | "byteLimit": 2048000,
231 | "rowLimit": 10000
232 | },
233 | "inputWidgets": {},
234 | "nuid": "e68958aa-a396-4a19-b4e5-3541c3d5d756",
235 | "showTitle": false,
236 | "tableResultSettingsMap": {},
237 | "title": ""
238 | }
239 | },
240 | "outputs": [],
241 | "source": [
242 | "%fs cp \"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients.csv\" \"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy.csv\""
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {
248 | "application/vnd.databricks.v1+cell": {
249 | "cellMetadata": {},
250 | "inputWidgets": {},
251 | "nuid": "0a8e80b8-4546-468b-87ed-7bb6012c1a7b",
252 | "showTitle": false,
253 | "tableResultSettingsMap": {},
254 | "title": ""
255 | }
256 | },
257 | "source": [
258 |     "We are learning dbutils here for the first time; we will learn it in detail later.\n",
259 |     "Rather than the %fs command, we can use the Databricks utilities (dbutils, which are more comprehensive) to copy data or perform any other filesystem operation on DBFS"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 0,
265 | "metadata": {
266 | "application/vnd.databricks.v1+cell": {
267 | "cellMetadata": {
268 | "byteLimit": 2048000,
269 | "rowLimit": 10000
270 | },
271 | "inputWidgets": {},
272 | "nuid": "f29a64d1-087b-46f7-8e97-63c38991fd40",
273 | "showTitle": false,
274 | "tableResultSettingsMap": {},
275 | "title": ""
276 | }
277 | },
278 | "outputs": [],
279 | "source": [
280 | "%python\n",
281 | "dbutils.fs.cp(\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients.csv\",\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy2.csv\")\n",
282 | "dbutils.fs.rm(\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy.csv\")"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {
288 | "application/vnd.databricks.v1+cell": {
289 | "cellMetadata": {
290 | "byteLimit": 2048000,
291 | "rowLimit": 10000
292 | },
293 | "inputWidgets": {},
294 | "nuid": "a43dd280-49bc-4f22-97fb-08a6b23218cc",
295 | "showTitle": false,
296 | "tableResultSettingsMap": {},
297 | "title": ""
298 | }
299 | },
300 | "source": [
301 |     "####How to run Spark SQL/HQL queries inside a notebook using the %sql magic command"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 0,
307 | "metadata": {
308 | "application/vnd.databricks.v1+cell": {
309 | "cellMetadata": {
310 | "byteLimit": 2048000,
311 | "implicitDf": true,
312 | "rowLimit": 10000
313 | },
314 | "inputWidgets": {},
315 | "nuid": "0421bbb7-38d3-4c49-bc4a-28f0c30072e1",
316 | "showTitle": false,
317 | "tableResultSettingsMap": {},
318 | "title": ""
319 | }
320 | },
321 | "outputs": [],
322 | "source": [
323 | "%sql\n",
324 | "create table if not exists default.cities2(id int,city string);\n",
325 | "insert into default.cities2 values(3,'Mumbai'),(4,'Lucknow');\n",
326 | "select * from cities2;"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 0,
332 | "metadata": {
333 | "application/vnd.databricks.v1+cell": {
334 | "cellMetadata": {
335 | "byteLimit": 2048000,
336 | "rowLimit": 10000
337 | },
338 | "inputWidgets": {},
339 | "nuid": "610c68dd-c905-4abc-9f33-ad5623ba4dcb",
340 | "showTitle": false,
341 | "tableResultSettingsMap": {},
342 | "title": ""
343 | }
344 | },
345 | "outputs": [],
346 | "source": [
347 | "%python\n",
348 | "#OOPS, FBP & Declarative (SQL)\n",
349 | "spark.sql(\"select * from cities2\").explain(True)"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 0,
355 | "metadata": {
356 | "application/vnd.databricks.v1+cell": {
357 | "cellMetadata": {
358 | "byteLimit": 2048000,
359 | "implicitDf": true,
360 | "rowLimit": 10000
361 | },
362 | "inputWidgets": {},
363 | "nuid": "d1fe763e-6058-4ad3-980e-ae595fc32499",
364 | "showTitle": false,
365 | "tableResultSettingsMap": {},
366 | "title": ""
367 | }
368 | },
369 | "outputs": [],
370 | "source": [
371 | "%sql\n",
372 | "update cities1 set city='Kolkata' where id=4;"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 0,
378 | "metadata": {
379 | "application/vnd.databricks.v1+cell": {
380 | "cellMetadata": {
381 | "byteLimit": 2048000,
382 | "implicitDf": true,
383 | "rowLimit": 10000
384 | },
385 | "inputWidgets": {},
386 | "nuid": "2cdb43f7-2838-42d6-a19b-60d2377d1812",
387 | "showTitle": false,
388 | "tableResultSettingsMap": {},
389 | "title": ""
390 | }
391 | },
392 | "outputs": [],
393 | "source": [
394 | "%sql\n",
395 | "show create table cities1;"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 0,
401 | "metadata": {
402 | "application/vnd.databricks.v1+cell": {
403 | "cellMetadata": {
404 | "byteLimit": 2048000,
405 | "implicitDf": true,
406 | "rowLimit": 10000
407 | },
408 | "inputWidgets": {},
409 | "nuid": "3485856e-b35c-415f-a588-b74aadcbeafd",
410 | "showTitle": false,
411 | "tableResultSettingsMap": {},
412 | "title": ""
413 | }
414 | },
415 | "outputs": [],
416 | "source": [
417 | "%sql\n",
418 | "from cities1 select *;"
419 | ]
420 | },
421 | {
422 | "cell_type": "markdown",
423 | "metadata": {
424 | "application/vnd.databricks.v1+cell": {
425 | "cellMetadata": {
426 | "byteLimit": 2048000,
427 | "rowLimit": 10000
428 | },
429 | "inputWidgets": {},
430 | "nuid": "0cb845bd-3334-4f02-8794-28797af494a0",
431 | "showTitle": false,
432 | "tableResultSettingsMap": {},
433 | "title": ""
434 | }
435 | },
436 | "source": [
437 |     "####How to run a Python program inside a notebook using the %python magic command (by default a cell already runs on the Python interpreter)"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 0,
443 | "metadata": {
444 | "application/vnd.databricks.v1+cell": {
445 | "cellMetadata": {
446 | "byteLimit": 2048000,
447 | "rowLimit": 10000
448 | },
449 | "inputWidgets": {},
450 | "nuid": "4745b1ba-c25c-4c6f-9a4b-6ab4ad27bd98",
451 | "showTitle": false,
452 | "tableResultSettingsMap": {},
453 | "title": ""
454 | }
455 | },
456 | "outputs": [],
457 | "source": [
458 |     "def square(a):\n",
459 |     "    return a*a  # returns the square of a"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 0,
465 | "metadata": {
466 | "application/vnd.databricks.v1+cell": {
467 | "cellMetadata": {
468 | "byteLimit": 2048000,
469 | "rowLimit": 10000
470 | },
471 | "inputWidgets": {},
472 | "nuid": "86607897-fad2-454f-86ab-d1f639889cca",
473 | "showTitle": false,
474 | "tableResultSettingsMap": {},
475 | "title": ""
476 | }
477 | },
478 | "outputs": [],
479 | "source": [
480 |     "print(\"square function call \", square(10))"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {
486 | "application/vnd.databricks.v1+cell": {
487 | "cellMetadata": {},
488 | "inputWidgets": {},
489 | "nuid": "bb77dbe6-2686-4f10-9781-6e37357e121e",
490 | "showTitle": false,
491 | "tableResultSettingsMap": {},
492 | "title": ""
493 | }
494 | },
495 | "source": [
496 |     "In the Python magic cell itself, we already have the Spark session object instantiated,\n",
497 |     "so we can freely write Spark programs"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 0,
503 | "metadata": {
504 | "application/vnd.databricks.v1+cell": {
505 | "cellMetadata": {
506 | "byteLimit": 2048000,
507 | "rowLimit": 10000
508 | },
509 | "inputWidgets": {},
510 | "nuid": "5f63c0dd-51cb-4c14-8dc3-f55ab0ec9009",
511 | "showTitle": false,
512 | "tableResultSettingsMap": {
513 | "0": {
514 | "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1765095105596}",
515 | "filterBlob": "{\"version\":1,\"filterGroups\":[],\"syncTimestamp\":1765600528360}",
516 | "queryPlanFiltersBlob": "[]",
517 | "tableResultIndex": 0
518 | }
519 | },
520 | "title": ""
521 | }
522 | },
523 | "outputs": [],
524 | "source": [
525 | "%python\n",
526 | "from pyspark.sql.session import SparkSession\n",
527 | "#import pyspark.sql.session as sprk\n",
528 | "print(spark)\n",
529 | "spark1 = SparkSession.builder.appName(\"Spark DataFrames\").getOrCreate()\n",
530 | "print(spark1)\n",
531 | "df1 = spark1.read.csv(\"/Volumes/workspace/default/volumewe47_datalake/patients.csv\",header=True)\n",
532 | "display(df1)"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": 0,
538 | "metadata": {
539 | "application/vnd.databricks.v1+cell": {
540 | "cellMetadata": {
541 | "byteLimit": 2048000,
542 | "rowLimit": 10000
543 | },
544 | "inputWidgets": {},
545 | "nuid": "852e8016-9d82-465f-80df-f8cc0c4dab02",
546 | "showTitle": false,
547 | "tableResultSettingsMap": {},
548 | "title": ""
549 | }
550 | },
551 | "outputs": [],
552 | "source": [
553 | "#DSL - Domain specific language\n",
554 | "df1.where(\"married='Yes'\").write.saveAsTable(\"default.we47_patients\")"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 0,
560 | "metadata": {
561 | "application/vnd.databricks.v1+cell": {
562 | "cellMetadata": {
563 | "byteLimit": 2048000,
564 | "implicitDf": true,
565 | "rowLimit": 10000
566 | },
567 | "inputWidgets": {},
568 | "nuid": "1307e5ac-e5fe-4578-9396-299a79f794ef",
569 | "showTitle": false,
570 | "tableResultSettingsMap": {},
571 | "title": ""
572 | }
573 | },
574 | "outputs": [],
575 | "source": [
576 | "spark.sql(\"select count(1),InPatient from default.we47_patients group by 2\").explain(True)"
577 | ]
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": 0,
582 | "metadata": {
583 | "application/vnd.databricks.v1+cell": {
584 | "cellMetadata": {
585 | "byteLimit": 2048000,
586 | "rowLimit": 10000
587 | },
588 | "inputWidgets": {},
589 | "nuid": "f5e2cf2a-3140-4aac-b8e6-babab71ed9bc",
590 | "showTitle": false,
591 | "tableResultSettingsMap": {},
592 | "title": ""
593 | }
594 | },
595 | "outputs": [],
596 | "source": [
597 | "spark.read.table(\"default.we47_patients\").show(2)"
598 | ]
599 | },
600 | {
601 | "cell_type": "markdown",
602 | "metadata": {
603 | "application/vnd.databricks.v1+cell": {
604 | "cellMetadata": {
605 | "byteLimit": 2048000,
606 | "rowLimit": 10000
607 | },
608 | "inputWidgets": {},
609 | "nuid": "1c3c9245-15f8-488a-99f1-c4ea979f607e",
610 | "showTitle": false,
611 | "tableResultSettingsMap": {},
612 | "title": ""
613 | }
614 | },
615 | "source": [
616 | "####How to install additional libraries in this current Python Interpreter using %pip magic command"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": 0,
622 | "metadata": {
623 | "application/vnd.databricks.v1+cell": {
624 | "cellMetadata": {
625 | "byteLimit": 2048000,
626 | "rowLimit": 10000
627 | },
628 | "inputWidgets": {},
629 | "nuid": "11ac3360-e8f1-4454-a37d-9ca2f965fcdd",
630 | "showTitle": false,
631 | "tableResultSettingsMap": {},
632 | "title": ""
633 | }
634 | },
635 | "outputs": [],
636 | "source": [
637 | "%pip install pypi"
638 | ]
639 | }
640 | ],
641 | "metadata": {
642 | "application/vnd.databricks.v1+notebook": {
643 | "computePreferences": {
644 | "hardware": {
645 | "accelerator": null,
646 | "gpuPoolId": null,
647 | "memory": null
648 | }
649 | },
650 | "dashboards": [],
651 | "environmentMetadata": null,
652 | "inputWidgetPreferences": null,
653 | "language": "python",
654 | "notebookMetadata": {
655 | "mostRecentlyExecutedCommandWithImplicitDF": {
656 | "commandId": -1,
657 | "dataframes": [
658 | "_sqldf"
659 | ]
660 | },
661 | "pythonIndentUnit": 4
662 | },
663 | "notebookName": "1_Explore_Notebooks_magic_commands",
664 | "widgets": {}
665 | },
666 | "language_info": {
667 | "name": "python"
668 | }
669 | },
670 | "nbformat": 4,
671 | "nbformat_minor": 0
672 | }
673 |
--------------------------------------------------------------------------------
/databricks_workouts_2025/2_Spark_DataFrame_Read_Write_Operations/3-Basic-WriteOps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "e3756d01-4aa7-45d1-bffa-b7e3afd67e3c",
10 | "showTitle": false,
11 | "tableResultSettingsMap": {},
12 | "title": ""
13 | }
14 | },
15 | "source": [
16 |     "#By knowing this notebook, we can become an eligible \"Data Egress Developer/Engineer\"\n",
17 |     "###We are writing data in Structured (csv), Semi-Structured (JSON/XML) and Serialized (orc/parquet/delta) file formats (Datalake),\n",
18 |     "###and in Table (delta/hive) format (Lakehouse)"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "application/vnd.databricks.v1+cell": {
25 | "cellMetadata": {},
26 | "inputWidgets": {},
27 | "nuid": "71713fcb-b659-4e62-bbee-1d3092f13683",
28 | "showTitle": false,
29 | "tableResultSettingsMap": {},
30 | "title": ""
31 | }
32 | },
33 | "source": [
34 | "### Let's get some data we have already..."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 0,
40 | "metadata": {
41 | "application/vnd.databricks.v1+cell": {
42 | "cellMetadata": {},
43 | "inputWidgets": {},
44 | "nuid": "88e5b5fc-2d6c-4a71-983a-79f140d24e51",
45 | "showTitle": false,
46 | "tableResultSettingsMap": {},
47 | "title": ""
48 | }
49 | },
50 | "outputs": [],
51 | "source": [
52 |     "from pyspark.sql.session import SparkSession\n",
53 | "spark=SparkSession.builder.appName(\"Spark DataFrames\").getOrCreate()"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 0,
59 | "metadata": {
60 | "application/vnd.databricks.v1+cell": {
61 | "cellMetadata": {
62 | "byteLimit": 2048000,
63 | "rowLimit": 10000
64 | },
65 | "inputWidgets": {},
66 | "nuid": "3f1f4ad3-4c7f-4f15-9011-731aa08e1d82",
67 | "showTitle": false,
68 | "tableResultSettingsMap": {},
69 | "title": ""
70 | }
71 | },
72 | "outputs": [],
73 | "source": [
74 | "#Extract\n",
75 | "ingest_df1=spark.read.csv(\"/Volumes/workspace/wd36schema/ingestion_volume/source/custs_header\",header=True,sep=',',inferSchema=True,samplingRatio=0.10)\n",
76 | "\n",
77 | "#ingest_df1.write.format(\"delta\").save(\"/Volumes/workspace/wd36schema/ingestion_volume/deltadata\")"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {
83 | "application/vnd.databricks.v1+cell": {
84 | "cellMetadata": {},
85 | "inputWidgets": {},
86 | "nuid": "4de366d1-b620-4eb3-8014-943802d1b67a",
87 | "showTitle": false,
88 | "tableResultSettingsMap": {},
89 | "title": ""
90 | }
91 | },
92 | "source": [
93 |     "### Writing the data in built-in formats - different file formats & different targets (we can also write the data to practically any target in the world...)"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {
99 | "application/vnd.databricks.v1+cell": {
100 | "cellMetadata": {},
101 | "inputWidgets": {},
102 | "nuid": "265d5989-ca7e-4f2f-8e6a-93e75c23948d",
103 | "showTitle": false,
104 | "tableResultSettingsMap": {},
105 | "title": ""
106 | }
107 | },
108 | "source": [
109 |     "####1. Writing in csv (structured data (2D tables/frames with rows and columns)) format with a few basic options listed below (Schema (structure) Migration)\n",
110 | "custid,fname,lname,age,profession -> custid~fname~lname~prof~age\n",
111 | "- header\n",
112 | "- sep\n",
113 | "- mode"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 0,
119 | "metadata": {
120 | "application/vnd.databricks.v1+cell": {
121 | "cellMetadata": {
122 | "byteLimit": 2048000,
123 | "rowLimit": 10000
124 | },
125 | "inputWidgets": {},
126 | "nuid": "c6e0bf03-6816-4d82-b3a6-6b1103ee2dee",
127 | "showTitle": false,
128 | "tableResultSettingsMap": {},
129 | "title": ""
130 | }
131 | },
132 | "outputs": [],
133 | "source": [
134 | "#We are performing schema migration from comma to tilde delimiter\n",
135 | "ingest_df1.write.csv(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/csvout\",sep='~',header=True,mode='overwrite')\n",
136 | "#4 modes of writing - append,overwrite,ignore,error"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 0,
142 | "metadata": {
143 | "application/vnd.databricks.v1+cell": {
144 | "cellMetadata": {
145 | "byteLimit": 2048000,
146 | "rowLimit": 10000
147 | },
148 | "inputWidgets": {},
149 | "nuid": "2bb81a50-8f23-40fb-a1e2-0b0e97e269fd",
150 | "showTitle": false,
151 | "tableResultSettingsMap": {
152 | "0": {
153 | "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1765767551187}",
154 | "filterBlob": null,
155 | "queryPlanFiltersBlob": null,
156 | "tableResultIndex": 0
157 | }
158 | },
159 | "title": ""
160 | }
161 | },
162 | "outputs": [],
163 | "source": [
164 | "#We are performing schema migration by applying some transformations (this is our bread and butter that we learn exclusively further)\n",
165 | "#Transform\n",
166 | "transformed_df=ingest_df1.select(\"custid\",\"fname\",\"lname\",\"profession\",\"age\").withColumnRenamed(\"profession\",\"prof\")#DSL transformation (not for now...)\n",
167 | "#Load\n",
168 | "transformed_df.write.csv(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/csvout\",sep='~',header=True,mode='overwrite',compression='gzip')"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {
174 | "application/vnd.databricks.v1+cell": {
175 | "cellMetadata": {},
176 | "inputWidgets": {},
177 | "nuid": "a29a4365-30ff-49c1-af16-5ddbbaa9b3ca",
178 | "showTitle": false,
179 | "tableResultSettingsMap": {},
180 | "title": ""
181 | }
182 | },
183 | "source": [
184 | "####2. Writing in json format with few basic options listed below\n",
185 |     "path\n",
186 | "mode\n",
187 |     "- We did a schema migration and data conversion from csv to json format (i.e. structured to semi-structured format)\n",
188 | "- json - we learn a lot subsequently (nested/hierarchical/complex/multiline...), \n",
189 | "- what is json - fundamentally it is a dictionary of dictionaries\n",
190 | "- json - java script object notation\n",
191 | "- Standard json format (can't be changed) - {\"k1\":\"string value\",\"k2\":numbervalue,\"k3\":v2} where key has to be unique & enclosed in double quotes and value can be anything\n",
192 |     "- **when to go with json or its benefits** - \n",
193 | "- a. If we have data in a semistructure format (with variable data format with dynamic schema)\n",
194 | "- eg. {\"custid\":4000001,\"profession\":\"Pilot\",\"age\":55,\"city\":\"NY\"}\n",
195 | "- {\"custid\":4000001,\"fname\":\"Kristina\",\"lname\":\"Chung\",\"prof\":\"Pilot\",\"age\":\"55\"}\n",
196 | "- b. columns/column names or the types or the order can be different\n",
197 | "- c. json will be provided by the sources if the data is dynamic in nature (not sure about number or order of columns) or if the data is api response in nature.\n",
198 |     "- d. json is an efficient data format (serialized/encoded) for exchanging data between applications over the network; it is also good for parsing and for object-by-object operations (row-by-row operations in a realtime fashion, e.g. Amazon clickstream operations)\n",
199 | "- e. json can be used to group or create hierarchy of data in a complex or in a nested format eg. https://randomuser.me/api/"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 0,
205 | "metadata": {
206 | "application/vnd.databricks.v1+cell": {
207 | "cellMetadata": {
208 | "byteLimit": 2048000,
209 | "rowLimit": 10000
210 | },
211 | "inputWidgets": {},
212 | "nuid": "19a05420-114a-4e15-a75b-a8bc3c5c20eb",
213 | "showTitle": false,
214 | "tableResultSettingsMap": {},
215 | "title": ""
216 | }
217 | },
218 | "outputs": [],
219 | "source": [
220 | "#Data Conversion/Schema Migration from Structured to SemiStructured format..\n",
221 | "ingest_df1.write.json(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/jsonout\",mode='append')\n",
222 | "#Structured -> SemiStruct...\n",
223 | "#custid,fname,lname,age,profession -> {\"custid\":4000001,\"fname\":\"Kristina\",\"lname\":\"Chung\",\"prof\":\"Pilot\",\"age\":55}"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {
229 | "application/vnd.databricks.v1+cell": {
230 | "cellMetadata": {},
231 | "inputWidgets": {},
232 | "nuid": "3d9b6ca7-9aa4-4d18-bb18-0d62829dddd2",
233 | "showTitle": false,
234 | "tableResultSettingsMap": {},
235 | "title": ""
236 | }
237 | },
238 | "source": [
239 | "####3.Serialization (encoding in a more optimized fashion) & Deserialization File formats (Binary/Brainy File formats)\n",
240 | "**Data Mechanics:**\n",
241 | "1. encoding/decoding(machine format) - converting the data from human readable format to machine understandable format for performant data transfer (eg. Network transfer of data will be encoded)\n",
242 | "2. *compression/uncompression(encoding+space+time) - shrinking the data in some format using some libraries (tradeoff between time and size) (eg. Compress before store or transfer) - snappy is a good compression tech used in bigdata platform\n",
243 |     "3. encryption (encoding+security) - In addition to encoding, encryption adds security, hence the data is (performant+secured) (using algorithms such as SHA/MD5/AES/DES/RSA/DSA..)\n",
244 | "4. *Serialization (applicable more for bigdata) - Serialization is encoding + performant by saving space + processing intelligent bigdata format - Fast, Compact, Interoperable, Extensible (additional configs), Scalable (cluster compute operations), Secured (binary format)..\n",
245 |     "5. *masking - Encoding of data (in some other format, not meant to be a machine format) which should not be possible to decode (used for security purposes)\n",
246 | "\n",
247 | "What are the (builtin) serialized file formats we are going to learn?\n",
248 | "orc\n",
249 | "parquet\n",
250 |     "delta (Databricks proprietary)\n",
251 | "\n",
252 |     "- We did a schema migration and data conversion from csv/json to a serialized data format (i.e. structured to structured format, internally binary/unstructured)\n",
253 | "- We learn/use a lot/heavily subsequently\n",
254 |     "- what is serialized - fundamentally they are intelligent/encoded/serialized/binary data formats applying a lot of optimization & space-reduction strategies.. (encoded/compressed/intelligent)\n",
255 |     "- orc - optimized row columnar format (Columnar format)\n",
256 | "- parquet - tiled data format (Columnar formats)\n",
257 |     "- delta (Databricks proprietary) - an enriched parquet format; Delta (modified/changed) operations can be performed (ACID properties (DML))\n",
258 |     "- format - serialized/encoded; we can't read it with the naked eye, only after a library deserializes/decodes it can the data be accessed as structured data\n",
259 |     "- **when to go with serialized formats or their benefits** - \n",
260 |     "- a. For storage benefits, e.g. orc saves 65+% of space: if I store 1gb of data it occupies about 350mb, and with compression (snappy) it can be improved further...\n",
261 |     "- b. For processing optimization: orc/parquet/delta return only the required data when you query, using pushdown optimization.\n",
262 |     "- c. Interoperability - this data format can be understood in multiple environments, e.g. bigquery can parse this data.\n",
263 | "- d. Secured\n",
264 |     "- **In the projects/environments, when to use which file formats - we learn in detail later...**\n",
265 | "| Format | Schema Type | Storage Efficiency | Analytics/Transformation Performance | Updates Supported |\n",
266 | "|--------|--------------------------|--------------------|-----------------------|------------------|\n",
267 | "| CSV | Structured | Low | Slow | No |\n",
268 | "| JSON | Semi-structured | Low | Slow | No |\n",
269 | "| ORC | Structured / Striped | High | Fast | Limited |\n",
270 | "| Parquet| Structured / Nested | High | Very Fast | Limited |\n",
271 | "| Delta | Structured / Evolving | High | Very Fast | Highly |\n",
272 | "| XML | Semi-structured | Low | Slow | No |"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 0,
278 | "metadata": {
279 | "application/vnd.databricks.v1+cell": {
280 | "cellMetadata": {
281 | "byteLimit": 2048000,
282 | "rowLimit": 10000
283 | },
284 | "inputWidgets": {},
285 | "nuid": "22b04626-5ab2-4dba-b60e-058374748690",
286 | "showTitle": false,
287 | "tableResultSettingsMap": {},
288 | "title": ""
289 | }
290 | },
291 | "outputs": [],
292 | "source": [
293 | "ingest_df1.write.orc(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/orcout\",mode='overwrite',compression='zlib')#by default orc/parquet uses snappy compression\n",
294 | "spark.read.orc(\"/Volumes/workspace/wd36schema/ingestion_volume/target/orcout\").show(2)#uncompression + deserialization"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 0,
300 | "metadata": {
301 | "application/vnd.databricks.v1+cell": {
302 | "cellMetadata": {
303 | "byteLimit": 2048000,
304 | "rowLimit": 10000
305 | },
306 | "inputWidgets": {},
307 | "nuid": "e128ce1d-d358-479e-9590-42f0da8a4cb9",
308 | "showTitle": false,
309 | "tableResultSettingsMap": {},
310 | "title": ""
311 | }
312 | },
313 | "outputs": [],
314 | "source": [
315 | "#Orc/Parquet follows WORM feature (Write Once Read Many)\n",
316 |     "ingest_df1.write.mode(\"overwrite\").option(\"compression\",\"gzip\").option(\"compression\",\"snappy\").parquet(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/parquetout\")#when the same option is set twice the later value wins, so snappy is used; orc/parquet default to snappy compression anyway\n",
317 | "spark.read.parquet(\"/Volumes/workspace/wd36schema/ingestion_volume/target/parquetout\").show(2)#uncompression + deserialization"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 0,
323 | "metadata": {
324 | "application/vnd.databricks.v1+cell": {
325 | "cellMetadata": {
326 | "byteLimit": 2048000,
327 | "rowLimit": 10000
328 | },
329 | "inputWidgets": {},
330 | "nuid": "f52bbbf8-7dbb-4639-9a1e-572114fe9838",
331 | "showTitle": false,
332 | "tableResultSettingsMap": {},
333 | "title": ""
334 | }
335 | },
336 | "outputs": [],
337 | "source": [
338 | "#Delta follows WMRM feature (Write Many Read Many) - We did Delta Lake creation (Datalake + Delta file format)\n",
339 | "ingest_df1.write.format(\"delta\").save(\"/Volumes/workspace/wd36schema/ingestion_volume/target/deltaout\",mode='overwrite')\n",
340 | "spark.read.format(\"delta\").load(\"/Volumes/workspace/wd36schema/ingestion_volume/target/deltaout\").show(2)"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {
346 | "application/vnd.databricks.v1+cell": {
347 | "cellMetadata": {},
348 | "inputWidgets": {},
349 | "nuid": "5bb4ab26-481f-4b00-a5cb-675b105863d2",
350 | "showTitle": false,
351 | "tableResultSettingsMap": {},
352 | "title": ""
353 | }
354 | },
355 | "source": [
356 | "####4.Table Load Operations - Building LAKEHOUSE ON TOP OF DATALAKE\n",
357 | "Can we do SQL operations directly on the tables like a database or datawarehouse? or Can we build a Lakehouse in Databricks?\n",
358 | "- We learn/use a lot/heavily subsequently, \n",
359 | "- what is Lakehouse - A SQL/Datawarehouse/Query layer on top of the Datalake is called Lakehouse\n",
360 | "- We have different lakehouses which we are going to learn further - \n",
361 | "1. delta tables (lakehouse) in databricks\n",
362 | "2. hive in onprem\n",
363 | "3. bigquery in GCP\n",
364 | "4. synapse in azure\n",
365 | "5. athena in aws\n",
366 | "- **when to go with lakehouse** - \n",
367 | "- a. Transformation\n",
368 | "- b. Analysis/Analytics\n",
369 | "- c. AI/BI\n",
370 | "- d. Literally we are going to learn SQL & Advanced SQL"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 0,
376 | "metadata": {
377 | "application/vnd.databricks.v1+cell": {
378 | "cellMetadata": {
379 | "byteLimit": 2048000,
380 | "rowLimit": 10000
381 | },
382 | "inputWidgets": {},
383 | "nuid": "25875988-b00e-4823-95cb-1ff0b6362d44",
384 | "showTitle": false,
385 | "tableResultSettingsMap": {},
386 | "title": ""
387 | }
388 | },
389 | "outputs": [],
390 | "source": [
391 | "#We are building delta tables in databricks (we are building hive tables in onprem/we are building bq tables in gcp...)\n",
392 | "#saveastable (named notation/named arguments)\n",
393 | "#Table\n",
394 | "#cid,prof,age,fname,lname\n",
395 | "#mapping\n",
396 | "#cid,prof,age,fname,lname\n",
397 | "ingest_df1.write.saveAsTable(\"workspace.wd36schema.lh_custtbl\",mode='overwrite')\n",
398 | "#display(spark.sql(\"show create table workspace.wd36schema.lh_custtbl\"))"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 0,
404 | "metadata": {
405 | "application/vnd.databricks.v1+cell": {
406 | "cellMetadata": {
407 | "byteLimit": 2048000,
408 | "rowLimit": 10000
409 | },
410 | "inputWidgets": {},
411 | "nuid": "fcfa70eb-dec9-409d-a5ab-0aa7c01eb5fb",
412 | "showTitle": false,
413 | "tableResultSettingsMap": {},
414 | "title": ""
415 | }
416 | },
417 | "outputs": [],
418 | "source": [
419 |     "#1. The insertInto function can be used like saveAsTable, with a few differences\n",
420 |     "#a. it works only if the target table already exists\n",
421 |     "#b. it works by generating insert statements behind the scenes (not a bulk load), hence it is slower; use it for small datasets (and safely only if the table exists)\n",
422 |     "#c. it loads the data from the dataframe by column position, not by column name..\n",
423 | "#insertInto (positional notation/positional arguments)\n",
424 | "#Table\n",
425 | "#cid,prof,age,fname,lname\n",
426 | "#mapping.\n",
427 | "#cid,fname,lname,age,prof\n",
428 | "ingest_df1.write.insertInto(\"workspace.wd36schema.lh_custtbl\",overwrite=True)"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": 0,
434 | "metadata": {
435 | "application/vnd.databricks.v1+cell": {
436 | "cellMetadata": {},
437 | "inputWidgets": {},
438 | "nuid": "06834a14-c6ec-420d-830d-88b88aa5520c",
439 | "showTitle": false,
440 | "tableResultSettingsMap": {},
441 | "title": ""
442 | }
443 | },
444 | "outputs": [],
445 | "source": [
446 | "ingest_df1.write.format(\"delta\").save(\"location\")"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 0,
452 | "metadata": {
453 | "application/vnd.databricks.v1+cell": {
454 | "cellMetadata": {
455 | "byteLimit": 2048000,
456 | "rowLimit": 10000
457 | },
458 | "inputWidgets": {},
459 | "nuid": "1f3a1c4f-aabb-4db1-96d7-f3c2836b61e3",
460 | "showTitle": false,
461 | "tableResultSettingsMap": {},
462 | "title": ""
463 | }
464 | },
465 | "outputs": [],
466 | "source": [
467 | "#I am using spark engine to pull the data from the lakehouse table backed by dbfs (s3) (datalake) where data in delta format(deltalake) \n",
468 | "display(spark.sql(\"select * from workspace.wd36schema.lh_custtbl\"))#sparkengine+lakehouse+datalake(deltalake)"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {
474 | "application/vnd.databricks.v1+cell": {
475 | "cellMetadata": {},
476 | "inputWidgets": {},
477 | "nuid": "7586d9b9-5766-44e9-9c4a-51c1805f316c",
478 | "showTitle": false,
479 | "tableResultSettingsMap": {},
480 | "title": ""
481 | }
482 | },
483 | "source": [
484 |     "####5. XML Format - Semi-structured data format (most of the JSON features can be applied to XML as well, but in the DE world it is not as popular as JSON)\n",
485 |     "- Used rarely, on demand (by certain target/source systems, e.g. mainframes)\n",
486 |     "- Comparable to JSON, but not as efficient as JSON\n",
487 |     "- Databricks provides XML as a built-in format"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": 0,
493 | "metadata": {
494 | "application/vnd.databricks.v1+cell": {
495 | "cellMetadata": {
496 | "byteLimit": 2048000,
497 | "rowLimit": 10000
498 | },
499 | "inputWidgets": {},
500 | "nuid": "637512e0-627e-40d7-8fec-245916796b5b",
501 | "showTitle": false,
502 | "tableResultSettingsMap": {},
503 | "title": ""
504 | }
505 | },
506 | "outputs": [],
507 | "source": [
508 | "ingest_df1.write.xml(\"/Volumes/workspace/wd36schema/ingestion_volume/target/xmlout\",mode=\"ignore\",rowTag=\"cust\")"
509 | ]
510 | },
511 | {
512 | "cell_type": "markdown",
513 | "metadata": {
514 | "application/vnd.databricks.v1+cell": {
515 | "cellMetadata": {},
516 | "inputWidgets": {},
517 | "nuid": "7abb6500-0b7e-4339-b814-3e356c78d7ce",
518 | "showTitle": false,
519 | "tableResultSettingsMap": {},
520 | "title": ""
521 | }
522 | },
523 | "source": [
524 | "### Modes in Writing\n",
525 | "1. **Append** - Adds the new data to the existing data. It does not overwrite anything.\n",
526 | "2. **Overwrite** - Replaces the existing data entirely at the destination.\n",
527 |     "3. **errorIfExists** (default) - Throws an error if data already exists at the destination.\n",
528 |     "4. **Ignore** - Skips the write operation if data already exists at the destination. (A short sketch of all four modes follows in the next cell.)"
529 | ]
530 | },
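{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# A short sketch of the four write modes using the ingest_df1 dataframe from above;\n",
"# the target path is an assumption.\n",
"target = \"/Volumes/workspace/wd36schema/ingestion_volume/target/modes_demo\"\n",
"ingest_df1.write.csv(target, header=True, mode=\"overwrite\")   # replaces any existing data\n",
"ingest_df1.write.csv(target, header=True, mode=\"append\")      # adds new files alongside the old ones\n",
"ingest_df1.write.csv(target, header=True, mode=\"ignore\")      # silently skips, since data already exists\n",
"# ingest_df1.write.csv(target, header=True, mode=\"error\")     # default: raises an error because data exists"
]
},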
531 | {
532 | "cell_type": "markdown",
533 | "metadata": {
534 | "application/vnd.databricks.v1+cell": {
535 | "cellMetadata": {},
536 | "inputWidgets": {},
537 | "nuid": "c8463feb-5c73-4fea-b9ca-9d09d92e1a95",
538 | "showTitle": false,
539 | "tableResultSettingsMap": {},
540 | "title": ""
541 | }
542 | },
543 | "source": [
544 | "####What are all the overall functions/options we used in this notebook, for learning fundamental spark dataframe WRITE operations in different formats and targets?\n",
545 |     "1. We learned a dozen functions (out of 18 functions) in the write module, with minimal options...\n",
546 | "2. Functions we learned are (Datalake functions - csv/json/xml/orc/parquet+delta), (Lakehouse functions - saveAsTable/insertInto), (additional options - format/save/option/options/mode).\n",
547 |     "3. We have a few more performance-optimization/advanced options available (jdbc (we learn this soon under the name of foreign catalog), partitionBy, clusterBy, bucketBy, sortBy, text)\n",
548 |     "4. A few of the important write options for csv, such as header, sep, mode (append/overwrite/error/ignore), toDF.\n",
549 |     "5. A few additional options, such as compression and the different file formats..."
550 | ]
551 | }
552 | ],
553 | "metadata": {
554 | "application/vnd.databricks.v1+notebook": {
555 | "computePreferences": null,
556 | "dashboards": [],
557 | "environmentMetadata": {
558 | "base_environment": "",
559 | "environment_version": "3"
560 | },
561 | "inputWidgetPreferences": null,
562 | "language": "python",
563 | "notebookMetadata": {
564 | "mostRecentlyExecutedCommandWithImplicitDF": {
565 | "commandId": 1137934375475219,
566 | "dataframes": [
567 | "_sqldf"
568 | ]
569 | },
570 | "pythonIndentUnit": 4
571 | },
572 | "notebookName": "3-Basic-WriteOps",
573 | "widgets": {}
574 | },
575 | "language_info": {
576 | "name": "python"
577 | }
578 | },
579 | "nbformat": 4,
580 | "nbformat_minor": 0
581 | }
582 |
--------------------------------------------------------------------------------