├── README.md ├── databricks_workouts_2025_WE47 ├── 1_DATABRICKS_NOTEBOOK_FUNDAMENTALS │ ├── 4_child_notebook.py │ ├── 3_Notebook_workflow_utils_notebooks_widgets_invoking_passing_params.py │ ├── 2_Explore_Notebook_Markdowns.ipynb │ └── 1_Explore_Notebooks_magic_commands.ipynb ├── 1_USECASES_NB_FUNDAMENTALS │ ├── 1_Usecase_Explore_Notebooks_magic_commands.ipynb │ ├── 4_child_nb_dataload.ipynb │ └── 2_Usecase_md_dbutils_widgets.ipynb └── 2_Spark_DataFrame_Read_Write_Operations │ ├── read_write_usecases.ipynb │ └── 3-Basic-WriteOps.ipynb ├── databricks_workouts_2025 ├── 2_Spark_DataFrame_Read_Write_Operations │ ├── 4-Advanced-WriteOps.ipynb │ ├── 2-Advanced-Readops.ipynb │ ├── read_write_usecases.ipynb │ └── 3-Basic-WriteOps.ipynb ├── 1_DATABRICKS_NOTEBOOK_FUNDAMENTALS │ ├── 4_child_notebook.py │ ├── 3_Notebook_workflow_utils_notebooks_widgets_invoking_passing_params.py │ ├── 2_Explore_Notebook_Markdowns.ipynb │ └── 1_Explore_Notebooks_magic_commands.ipynb └── 1_USECASES_NB_FUNDAMENTALS │ ├── 1_Usecase_Explore_Notebooks_magic_commands.ipynb │ ├── 4_child_nb_dataload.ipynb │ └── 2_Usecase_md_dbutils_widgets.ipynb ├── oops_fundamentals_4.py ├── we47_local_notebooks └── my_first_notebook.ipynb └── LICENSE /README.md: -------------------------------------------------------------------------------- 1 | # databricks-code-repo 2 | Repo to maintain the Databricks notebook and other objects 3 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC #Creating this child notebook for the demo of calling child notebook from the parent notebook 4 | 5 | # COMMAND ---------- 6 | 7 | dbutils.widgets.text("table_name", "cities") 8 | table_name = dbutils.widgets.get("table_name") 9 | print(f"parameter passed is {table_name}") 10 | spark.sql(f"select * from {table_name}").show(2) 11 | 12 | # COMMAND ---------- 13 | 14 | dbutils.notebook.exit("success") 15 | -------------------------------------------------------------------------------- /databricks_workouts_2025/2_Spark_DataFrame_Read_Write_Operations/4-Advanced-WriteOps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": { 4 | "application/vnd.databricks.v1+notebook": { 5 | "computePreferences": null, 6 | "dashboards": [], 7 | "environmentMetadata": { 8 | "base_environment": "", 9 | "environment_version": "3" 10 | }, 11 | "inputWidgetPreferences": null, 12 | "language": "python", 13 | "notebookMetadata": { 14 | "pythonIndentUnit": 4 15 | }, 16 | "notebookName": "4-Advanced-WriteOps", 17 | "widgets": {} 18 | }, 19 | "language_info": { 20 | "name": "python" 21 | } 22 | }, 23 | "nbformat": 4, 24 | "nbformat_minor": 0 25 | } 26 | -------------------------------------------------------------------------------- /databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC #Creating this child notebook for the demo of calling child notebook from the parent notebook 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %sql 8 | # MAGIC select current_timestamp() 9 | 10 | # COMMAND ---------- 11 | 12 | #dbutils.notebook.exit(0) 13 | 14 | # COMMAND ---------- 15 | 16 | 
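# This child notebook receives its input through the widget defined below: on an
# interactive run the widget default ("cust") is used, and when a parent notebook
# invokes it with dbutils.notebook.run() the passed parameter overrides that default.
# A minimal sketch of such a parent call, kept as a comment here so the child does
# not invoke itself (the 90-second timeout and the "cust" value are illustrative):
#
# status = dbutils.notebook.run(
#     "/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",
#     90,                       # timeout in seconds
#     {"table_name": "cust"},   # delivered to dbutils.widgets.get("table_name") below
# )
# print(status)                 # prints whatever this notebook passes to dbutils.notebook.exit()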
dbutils.widgets.text("table_name", "cust") 17 | 18 | # COMMAND ---------- 19 | 20 | text_box_value=dbutils.widgets.get("table_name") 21 | print(text_box_value) 22 | 23 | # COMMAND ---------- 24 | 25 | #Spark SQL 26 | spark.read.table(text_box_value).display()#domain specific lang(FBP) 27 | spark.sql(f"select * from {text_box_value}").display()#Declarative lang 28 | 29 | # COMMAND ---------- 30 | 31 | dbutils.notebook.exit("notebook completed successfully") 32 | -------------------------------------------------------------------------------- /oops_fundamentals_4.py: -------------------------------------------------------------------------------- 1 | #user -> cc agent -> cost installation_util of product 2 | #siva added something 3 | #from pyspark.sql.session import SparkSession 4 | #pkg/subpkg/module/class/obj/const 5 | #functions (75%) fbp- pkg.subpkg.module.functions 6 | #class/functions (25%) oops+fbp-pkg.subpkg.module.class.functions 7 | 8 | #oops minimum concepts (class,members,self,obj,constructor) 9 | #class - is a template or blueprint program contains related members func/variables/subclasses 10 | #member - any var/funct/classes inside the class is a member 11 | #self - Used to identify the given program as a member (it is the reserved first parameter) 12 | #object - Memory Instance/Copy of a class 13 | #Constructor () - Memory in which we construct/instantiate a class is constructor 14 | #Types of Constructor - Non Parameterized, Parameterized, Default 15 | 16 | #class is a main program that holds the subprograms 17 | #xls class (template/blueprint) holds subprograms (functions) (tabs) 18 | print("hello team - in Git") 19 | class xls: 20 | def tab1(self): 21 | pass 22 | def tab2(self): 23 | pass 24 | 25 | class prod_cost: 26 | def installation_cost(self): 27 | installation_cost=100+10 28 | return installation_cost 29 | def total_cost(self): 30 | total_cost=self.installation_cost() 31 | return total_cost 32 | 33 | mohan_open=xls() 34 | print(mohan_open.tab1()) 35 | karthick_open=xls() 36 | print(karthick_open.tab1()) 37 | -------------------------------------------------------------------------------- /databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/1_Usecase_Explore_Notebooks_magic_commands.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "74cb71bf-99b6-4b2a-bf2f-3259b376fd41", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "######Task1: Document the Notebook Using mark down %md
\n", 17 | "A good Title
\n", 18 | "Description of the task
\n", 19 | "Your name in some color
\n", 20 | "Bring our Team photo from the given url \"https://fpimages.withfloats.com/actual/6929d1ac956d0a744b5c9822.jpeg\"
\n", 21 | "Use headings, bold, italics appropriately.
\n", 22 | "\n", 23 | "Task2: Create a volume namely usage_metrics using sql magic command %sql\n", 24 | "\n", 25 | "Task3: \n", 26 | "Create a child notebook \"4_child_nb_dataload\" and write code to load data, Using the requests library, perform api call to pull data from \"https://public.tableau.com/app/sample-data/mobile_os_usage.csv\" into a python variable using the magic command %py and write the data into the created volume \"/Volumes/workspace/default/usage_metrics/mobile_os_usage.csv\" using the above variable.text using the magic command dbutils.fs.put(\"volume\",variable.text,overwrite=True)\n", 27 | "\n", 28 | "Task4: Call the notebook 4_child_nb_dataload using the magic command %run\n", 29 | "\n", 30 | "Task5: list the file is created in the given volume or not and do the head of this file using fs magic command %fs \n", 31 | "\n", 32 | "Task6: Create a pyspark dataframe df1 reading the data from the above file using pyspark magic command %python\n", 33 | "\n", 34 | "Task7: Write the above dataframe df1 data into a databricks table called 'default.mobile_os_usage' using pyspark magic command %python\n", 35 | "\n", 36 | "Task8: Write sql query to display the data loaded into the table 'default.mobile_os_usage' using the pyspark magic command %python \n", 37 | "\n", 38 | "Task9: Create a python function to convert the given input to upper case\n", 39 | "\n", 40 | "Task10: Install pandas library using the pip python magic command %pip\n", 41 | "\n", 42 | "Task11: Import pandas, using pandas read_csv and display the output using the magic command %python\n", 43 | "\n", 44 | "Task12: echo \"Magic commands tasks completed\" using the linux shell magic command %sh " 45 | ] 46 | } 47 | ], 48 | "metadata": { 49 | "application/vnd.databricks.v1+notebook": { 50 | "computePreferences": { 51 | "hardware": { 52 | "accelerator": null, 53 | "gpuPoolId": null, 54 | "memory": null 55 | } 56 | }, 57 | "dashboards": [], 58 | "environmentMetadata": { 59 | "base_environment": "", 60 | "environment_version": "4" 61 | }, 62 | "inputWidgetPreferences": null, 63 | "language": "python", 64 | "notebookMetadata": { 65 | "mostRecentlyExecutedCommandWithImplicitDF": { 66 | "commandId": 7900721791748489, 67 | "dataframes": [ 68 | "_sqldf" 69 | ] 70 | }, 71 | "pythonIndentUnit": 4 72 | }, 73 | "notebookName": "1_Usecase_Explore_Notebooks_magic_commands", 74 | "widgets": {} 75 | }, 76 | "language_info": { 77 | "name": "python" 78 | } 79 | }, 80 | "nbformat": 4, 81 | "nbformat_minor": 0 82 | } 83 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/1_USECASES_NB_FUNDAMENTALS/1_Usecase_Explore_Notebooks_magic_commands.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "74cb71bf-99b6-4b2a-bf2f-3259b376fd41", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "######Task1: Document the Notebook Using mark down %md
\n", 17 | "A good Title
\n", 18 | "Description of the task
\n", 19 | "Your name in some color
\n", 20 | "Bring our Team photo from the given url \"https://fpimages.withfloats.com/actual/6936e213e40c3ddda3969dd0.jpeg\"
\n", 21 | "Use headings, bold, italics appropriately.
\n", 22 | "\n", 23 | "Task2: Create a volume namely usage_metrics using sql magic command %sql\n", 24 | "\n", 25 | "Task3: \n", 26 | "Create a child notebook \"4_child_nb_dataload\" and write code to load data, Using the requests library, perform api call to pull data from \"https://public.tableau.com/app/sample-data/mobile_os_usage.csv\" into a python variable using the magic command %py and write the data into the created volume \"/Volumes/workspace/default/usage_metrics/mobile_os_usage.csv\" using the above variable.text using the magic command dbutils.fs.put(\"volume\",variable.text,overwrite=True)\n", 27 | "\n", 28 | "Task4: Call the notebook 4_child_nb_dataload using the magic command %run\n", 29 | "\n", 30 | "Task5: list the file is created in the given volume or not and do the head of this file using fs magic command %fs \n", 31 | "\n", 32 | "Task6: Create a pyspark dataframe df1 reading the data from the above file using pyspark magic command %python\n", 33 | "\n", 34 | "Task7: Write the above dataframe df1 data into a databricks table called 'default.mobile_os_usage' using pyspark magic command %python\n", 35 | "\n", 36 | "Task8: Write sql query to display the data loaded into the table 'default.mobile_os_usage' using the pyspark magic command %python \n", 37 | "\n", 38 | "Task9: Create a python function to convert the given input to upper case\n", 39 | "\n", 40 | "Task10: Install pandas library using the pip python magic command %pip\n", 41 | "\n", 42 | "Task11: Import pandas, using pandas read_csv and display the output using the magic command %python\n", 43 | "\n", 44 | "Task12: echo \"Magic commands tasks completed\" using the linux shell magic command %sh " 45 | ] 46 | } 47 | ], 48 | "metadata": { 49 | "application/vnd.databricks.v1+notebook": { 50 | "computePreferences": { 51 | "hardware": { 52 | "accelerator": null, 53 | "gpuPoolId": null, 54 | "memory": null 55 | } 56 | }, 57 | "dashboards": [], 58 | "environmentMetadata": { 59 | "base_environment": "", 60 | "environment_version": "4" 61 | }, 62 | "inputWidgetPreferences": null, 63 | "language": "python", 64 | "notebookMetadata": { 65 | "mostRecentlyExecutedCommandWithImplicitDF": { 66 | "commandId": 7900721791748489, 67 | "dataframes": [ 68 | "_sqldf" 69 | ] 70 | }, 71 | "pythonIndentUnit": 4 72 | }, 73 | "notebookName": "1_Usecase_Explore_Notebooks_magic_commands", 74 | "widgets": {} 75 | }, 76 | "language_info": { 77 | "name": "python" 78 | } 79 | }, 80 | "nbformat": 4, 81 | "nbformat_minor": 0 82 | } 83 | -------------------------------------------------------------------------------- /we47_local_notebooks/my_first_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "7fa1dbe6-20db-4359-8df7-7866e701f032", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "![](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "application/vnd.databricks.v1+cell": { 23 | "cellMetadata": {}, 24 | "inputWidgets": {}, 25 | "nuid": "fbd8ec32-720f-42d9-bd51-45b397b9d381", 26 | "showTitle": false, 27 | "tableResultSettingsMap": {}, 28 | "title": "" 29 | } 30 | }, 31 | "source": [ 32 | "#Lets learn about how to work in notebooks" 33 | ] 34 | }, 35 | { 36 | 
"cell_type": "markdown", 37 | "metadata": { 38 | "application/vnd.databricks.v1+cell": { 39 | "cellMetadata": {}, 40 | "inputWidgets": {}, 41 | "nuid": "29059821-bbb4-4ad0-bfab-8861666501b5", 42 | "showTitle": false, 43 | "tableResultSettingsMap": {}, 44 | "title": "" 45 | } 46 | }, 47 | "source": [ 48 | "## Lets create markdown designs" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "application/vnd.databricks.v1+cell": { 55 | "cellMetadata": {}, 56 | "inputWidgets": {}, 57 | "nuid": "3950f092-165c-44b7-b6de-d847624b9dc7", 58 | "showTitle": false, 59 | "tableResultSettingsMap": {}, 60 | "title": "" 61 | } 62 | }, 63 | "source": [ 64 | "##Lets learn magic commands" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "application/vnd.databricks.v1+cell": { 71 | "cellMetadata": {}, 72 | "inputWidgets": {}, 73 | "nuid": "0ebbd436-b0ec-42b4-99dd-84ef6371cb04", 74 | "showTitle": false, 75 | "tableResultSettingsMap": {}, 76 | "title": "" 77 | } 78 | }, 79 | "source": [ 80 | "###Lets learn %sh magic command" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 0, 86 | "metadata": { 87 | "application/vnd.databricks.v1+cell": { 88 | "cellMetadata": { 89 | "byteLimit": 2048000, 90 | "rowLimit": 10000 91 | }, 92 | "inputWidgets": {}, 93 | "nuid": "d2c29bd7-d881-4d82-8f48-e103887cbd9f", 94 | "showTitle": false, 95 | "tableResultSettingsMap": {}, 96 | "title": "" 97 | } 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "%sh ls -l /home/spark-c799a53d-6b9f-4442-ba04-aa" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "application/vnd.databricks.v1+cell": { 108 | "cellMetadata": {}, 109 | "inputWidgets": {}, 110 | "nuid": "0defa698-799c-4112-a1a5-dd77e86cbff6", 111 | "showTitle": false, 112 | "tableResultSettingsMap": {}, 113 | "title": "" 114 | } 115 | }, 116 | "source": [ 117 | "###Lets learn %fs magic command" 118 | ] 119 | } 120 | ], 121 | "metadata": { 122 | "application/vnd.databricks.v1+notebook": { 123 | "computePreferences": null, 124 | "dashboards": [], 125 | "environmentMetadata": { 126 | "base_environment": "", 127 | "environment_version": "4" 128 | }, 129 | "inputWidgetPreferences": null, 130 | "language": "python", 131 | "notebookMetadata": { 132 | "mostRecentlyExecutedCommandWithImplicitDF": { 133 | "commandId": 7789511576367464, 134 | "dataframes": [ 135 | "_sqldf" 136 | ] 137 | }, 138 | "pythonIndentUnit": 4 139 | }, 140 | "notebookName": "my_first_notebook", 141 | "widgets": {} 142 | }, 143 | "language_info": { 144 | "name": "python" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 0 149 | } 150 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "d6e0adc4-d54e-4015-ac37-1ed3c539799c", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "### Notebook to create and load data into databricks volume" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 0, 22 | "metadata": { 23 | "application/vnd.databricks.v1+cell": { 24 | "cellMetadata": { 25 | "byteLimit": 2048000, 26 | "implicitDf": true, 27 | "rowLimit": 10000 28 | }, 
29 | "inputWidgets": {}, 30 | "nuid": "19d46640-79da-457b-87c6-1235f27cfbac", 31 | "showTitle": false, 32 | "tableResultSettingsMap": {}, 33 | "title": "" 34 | } 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "%sql\n", 39 | "CREATE VOLUME IF NOT EXISTS workspace.default.mobile_metrics;" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 0, 45 | "metadata": { 46 | "application/vnd.databricks.v1+cell": { 47 | "cellMetadata": { 48 | "byteLimit": 2048000, 49 | "rowLimit": 10000 50 | }, 51 | "inputWidgets": {}, 52 | "nuid": "07ad79a4-1066-4181-9f5f-0121d09dce83", 53 | "showTitle": false, 54 | "tableResultSettingsMap": {}, 55 | "title": "" 56 | } 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import requests\n", 61 | "response = requests.get(\"https://public.tableau.com/app/sample-data/mobile_os_usage.csv\")\n", 62 | "dbutils.fs.put(\"/Volumes/workspace/default/mobile_metrics/mobile_os_usage.csv\", response.text, overwrite=True)\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 0, 68 | "metadata": { 69 | "application/vnd.databricks.v1+cell": { 70 | "cellMetadata": { 71 | "byteLimit": 2048000, 72 | "rowLimit": 10000 73 | }, 74 | "inputWidgets": {}, 75 | "nuid": "96dec063-50b9-489d-a43b-eed5ec938643", 76 | "showTitle": false, 77 | "tableResultSettingsMap": {}, 78 | "title": "" 79 | } 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "%fs\n", 84 | "ls /Volumes/workspace/default/volume1/mobile_os_usage.csv" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 0, 90 | "metadata": { 91 | "application/vnd.databricks.v1+cell": { 92 | "cellMetadata": { 93 | "byteLimit": 2048000, 94 | "rowLimit": 10000 95 | }, 96 | "inputWidgets": {}, 97 | "nuid": "9f4f28f6-62e3-434e-bfc5-b23c5675a6dc", 98 | "showTitle": false, 99 | "tableResultSettingsMap": {}, 100 | "title": "" 101 | } 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "%fs head /Volumes/workspace/default/volume1/mobile_os_usage.csv" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 0, 111 | "metadata": { 112 | "application/vnd.databricks.v1+cell": { 113 | "cellMetadata": { 114 | "byteLimit": 2048000, 115 | "rowLimit": 10000 116 | }, 117 | "inputWidgets": {}, 118 | "nuid": "ce526cca-6cd6-423d-b37d-841576bd5c25", 119 | "showTitle": false, 120 | "tableResultSettingsMap": {}, 121 | "title": "" 122 | } 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "spark.read.csv(\"/Volumes/workspace/default/volume1/mobile_os_usage.csv\").write.saveAsTable(\"mobile_os_usage\")" 127 | ] 128 | } 129 | ], 130 | "metadata": { 131 | "application/vnd.databricks.v1+notebook": { 132 | "computePreferences": { 133 | "hardware": { 134 | "accelerator": null, 135 | "gpuPoolId": null, 136 | "memory": null 137 | } 138 | }, 139 | "dashboards": [], 140 | "environmentMetadata": { 141 | "base_environment": "", 142 | "environment_version": "4" 143 | }, 144 | "inputWidgetPreferences": null, 145 | "language": "python", 146 | "notebookMetadata": { 147 | "mostRecentlyExecutedCommandWithImplicitDF": { 148 | "commandId": 7900721791748484, 149 | "dataframes": [ 150 | "_sqldf" 151 | ] 152 | }, 153 | "pythonIndentUnit": 4 154 | }, 155 | "notebookName": "4_child_nb_dataload", 156 | "widgets": {} 157 | }, 158 | "language_info": { 159 | "name": "python" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 0 164 | } 165 | -------------------------------------------------------------------------------- /databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "d6e0adc4-d54e-4015-ac37-1ed3c539799c", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "### Notebook to create and load data into databricks volume" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 0, 22 | "metadata": { 23 | "application/vnd.databricks.v1+cell": { 24 | "cellMetadata": { 25 | "byteLimit": 2048000, 26 | "implicitDf": true, 27 | "rowLimit": 10000 28 | }, 29 | "inputWidgets": {}, 30 | "nuid": "19d46640-79da-457b-87c6-1235f27cfbac", 31 | "showTitle": false, 32 | "tableResultSettingsMap": {}, 33 | "title": "" 34 | } 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "%sql\n", 39 | "CREATE VOLUME IF NOT EXISTS workspace.default.mobile_metrics;" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 0, 45 | "metadata": { 46 | "application/vnd.databricks.v1+cell": { 47 | "cellMetadata": { 48 | "byteLimit": 2048000, 49 | "rowLimit": 10000 50 | }, 51 | "inputWidgets": {}, 52 | "nuid": "07ad79a4-1066-4181-9f5f-0121d09dce83", 53 | "showTitle": false, 54 | "tableResultSettingsMap": {}, 55 | "title": "" 56 | } 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import requests\n", 61 | "response = requests.get(\"https://public.tableau.com/app/sample-data/mobile_os_usage.csv\")\n", 62 | "dbutils.fs.put(\"/Volumes/workspace/default/mobile_metrics/mobile_os_usage.csv\", response.text, overwrite=True)\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 0, 68 | "metadata": { 69 | "application/vnd.databricks.v1+cell": { 70 | "cellMetadata": { 71 | "byteLimit": 2048000, 72 | "rowLimit": 10000 73 | }, 74 | "inputWidgets": {}, 75 | "nuid": "96dec063-50b9-489d-a43b-eed5ec938643", 76 | "showTitle": false, 77 | "tableResultSettingsMap": {}, 78 | "title": "" 79 | } 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "%fs\n", 84 | "ls /Volumes/workspace/default/volume1/mobile_os_usage.csv" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 0, 90 | "metadata": { 91 | "application/vnd.databricks.v1+cell": { 92 | "cellMetadata": { 93 | "byteLimit": 2048000, 94 | "rowLimit": 10000 95 | }, 96 | "inputWidgets": {}, 97 | "nuid": "9f4f28f6-62e3-434e-bfc5-b23c5675a6dc", 98 | "showTitle": false, 99 | "tableResultSettingsMap": {}, 100 | "title": "" 101 | } 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "%fs head /Volumes/workspace/default/volume1/mobile_os_usage.csv" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 0, 111 | "metadata": { 112 | "application/vnd.databricks.v1+cell": { 113 | "cellMetadata": { 114 | "byteLimit": 2048000, 115 | "rowLimit": 10000 116 | }, 117 | "inputWidgets": {}, 118 | "nuid": "5e796d50-f0ba-427f-9485-63bd3ef375bc", 119 | "showTitle": false, 120 | "tableResultSettingsMap": {}, 121 | "title": "" 122 | } 123 | }, 124 | "outputs": [], 125 | "source": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 0, 130 | "metadata": { 131 | "application/vnd.databricks.v1+cell": { 132 | "cellMetadata": { 133 | "byteLimit": 2048000, 134 | "rowLimit": 10000 135 | }, 136 | "inputWidgets": {}, 137 | "nuid": "ce526cca-6cd6-423d-b37d-841576bd5c25", 138 | "showTitle": false, 139 | "tableResultSettingsMap": {}, 140 | "title": "" 141 | } 142 | }, 143 | 
"outputs": [], 144 | "source": [ 145 | "spark.read.csv(\"/Volumes/workspace/default/volume1/mobile_os_usage.csv\").write.saveAsTable(\"mobile_os_usage\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 0, 151 | "metadata": { 152 | "application/vnd.databricks.v1+cell": { 153 | "cellMetadata": { 154 | "byteLimit": 2048000, 155 | "rowLimit": 10000 156 | }, 157 | "inputWidgets": {}, 158 | "nuid": "ec47633a-0cdc-4180-8cf5-a795a51137b9", 159 | "showTitle": false, 160 | "tableResultSettingsMap": {}, 161 | "title": "" 162 | } 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "%matplotlib inline\n", 167 | "import pandas as pd\n", 168 | "df = pd.read_csv(\"/Volumes/workspace/default/volume1/mobile_os_usage.csv\")\n", 169 | "df.plot(kind=\"bar\", x=df.columns[0])\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": { 175 | "application/vnd.databricks.v1+cell": { 176 | "cellMetadata": {}, 177 | "inputWidgets": {}, 178 | "nuid": "67c2a369-637e-457e-9e61-1193ed81182c", 179 | "showTitle": false, 180 | "tableResultSettingsMap": {}, 181 | "title": "" 182 | } 183 | }, 184 | "source": [ 185 | "https://web.s-cdn.boostkit.dev/webaction-files/5ac62a0f22728e050851fc87_our_faculty/face-67f16daa4404199c78d2e38b.jpg\n", 186 | "![](https://fpimages.withfloats.com/actual/6929d1ac956d0a744b5c9822.jpeg)" 187 | ] 188 | } 189 | ], 190 | "metadata": { 191 | "application/vnd.databricks.v1+notebook": { 192 | "computePreferences": { 193 | "hardware": { 194 | "accelerator": null, 195 | "gpuPoolId": null, 196 | "memory": null 197 | } 198 | }, 199 | "dashboards": [], 200 | "environmentMetadata": { 201 | "base_environment": "", 202 | "environment_version": "4" 203 | }, 204 | "inputWidgetPreferences": null, 205 | "language": "python", 206 | "notebookMetadata": { 207 | "mostRecentlyExecutedCommandWithImplicitDF": { 208 | "commandId": 7900721791748484, 209 | "dataframes": [ 210 | "_sqldf" 211 | ] 212 | }, 213 | "pythonIndentUnit": 4 214 | }, 215 | "notebookName": "4_child_nb_dataload", 216 | "widgets": {} 217 | }, 218 | "language_info": { 219 | "name": "python" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 0 224 | } 225 | -------------------------------------------------------------------------------- /databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/3_Notebook_workflow_utils_notebooks_widgets_invoking_passing_params.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC #####1. Display the list databricks utils 4 | 5 | # COMMAND ---------- 6 | 7 | dbutils.widgets.removeAll() 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md 12 | # MAGIC ######Below dbutils is the comprehensive one, out of which we are going to concentrate currently on notebook, widgets and fs for now 13 | 14 | # COMMAND ---------- 15 | 16 | dbutils.help() 17 | #important utils are 18 | #fs, jobs, notebook, widgets 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %md 23 | # MAGIC #####2. Notebook utils help 24 | 25 | # COMMAND ---------- 26 | 27 | dbutils.help() 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ###3. 
FS Commands 33 | 34 | # COMMAND ---------- 35 | 36 | dbutils.fs.help() 37 | 38 | # COMMAND ---------- 39 | 40 | print("lets learn all fs commands options...") 41 | print("copying") 42 | dbutils.fs.cp("/Volumes/workspace/default/volumewd36/sample_healthcare_patients.csv","/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv") 43 | print("head of 10 rows") 44 | print(dbutils.fs.head("/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv")) 45 | print("listing") 46 | dbutils.fs.ls("/Volumes/workspace/default/volumewd36/") 47 | print("make directory") 48 | dbutils.fs.mkdirs("/Volumes/workspace/default/volumewd36/healthcare/") 49 | print("move") 50 | dbutils.fs.mv("/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv","/Volumes/workspace/default/volumewd36/healthcare/sample_healthcare_patients1.csv") 51 | dbutils.fs.ls("/Volumes/workspace/default/volumewd36/healthcare/") 52 | dbutils.fs.cp("/Volumes/workspace/default/volumewd36/sample_healthcare_patients.csv","/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv") 53 | print("put to write some data into a file") 54 | 55 | # COMMAND ---------- 56 | 57 | print("try below command without the 3rd argument of true, you will find the dbfs-> hadoop -> spark -> s3 bucket") 58 | #dbutils.fs.put("dbfs:///Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv","put something",False) 59 | print(dbutils.fs.head("/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv")) 60 | dbutils.fs.put("dbfs:///Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv","put something",True) 61 | print("see the data in the file") 62 | print(dbutils.fs.head("/Volumes/workspace/default/volumewd36/sample_healthcare_patients1.csv")) 63 | dbutils.fs.rm("/Volumes/workspace/default/volumewd36/healthcare/sample_healthcare_patients1.csv") 64 | 65 | # COMMAND ---------- 66 | 67 | # MAGIC %md 68 | # MAGIC #####4. 
Widgets utils help 69 | 70 | # COMMAND ---------- 71 | 72 | dbutils.widgets.help() 73 | 74 | # COMMAND ---------- 75 | 76 | # MAGIC %md 77 | # MAGIC ###Widgets utility used for adding the components/widgets into our notebook for creating 78 | # MAGIC dynamic/parameterized approaches 79 | 80 | # COMMAND ---------- 81 | 82 | print("can you create a textbox widget") 83 | dbutils.widgets.text("tablename","cities","enter the tablename to query") 84 | 85 | # COMMAND ---------- 86 | 87 | print("can you get the value of the widget using dbutils.widgets.get and store into a local python variable tblname") 88 | tblname=dbutils.widgets.get("tablename") 89 | print("user passed the value of ?",tblname) 90 | 91 | # COMMAND ---------- 92 | 93 | #Implemented dynamic SQL usecase in Databricks 94 | display(spark.sql(f"select * from default.{tblname} limit 10")) 95 | 96 | # COMMAND ---------- 97 | 98 | dbutils.widgets.removeAll() 99 | 100 | # COMMAND ---------- 101 | 102 | dbutils.widgets.help() 103 | 104 | # COMMAND ---------- 105 | 106 | dbutils.widgets.dropdown("dropdown_widget","Senthil",["Senthil","Balaji","Arun"],"Select your name") 107 | aspirant_name_chosen=dbutils.widgets.get("dropdown_widget") 108 | print("Good morning",aspirant_name_chosen) 109 | 110 | # COMMAND ---------- 111 | 112 | dbutils.widgets.multiselect("multiselect_widget","wd36",["wd32","we43","we45","wd36"],"Select your team name") 113 | all_batches=dbutils.widgets.get("multiselect_widget") 114 | all_batches_lst=all_batches.split(",") 115 | for i in all_batches_lst: 116 | print(f"hello team {i}") 117 | #print("You have chosen the team name as",all_batches) 118 | 119 | # COMMAND ---------- 120 | 121 | #Interview question- how to access some value from the given string 122 | fullname="mohamed kader irfan" 123 | fname=fullname.split(" ")[0] 124 | lname=fullname.split(" ")[-1] 125 | print(fname, 'and', lname) 126 | 127 | # COMMAND ---------- 128 | 129 | dbutils.widgets.combobox("combobox_widget","wd36",["wd32","we43","we45","wd36"],"Select your team name") 130 | combobox_value=dbutils.widgets.get("combobox_widget") 131 | print("Good morning",combobox_value) 132 | 133 | # COMMAND ---------- 134 | 135 | dbutils.widgets.text("team_name","WD36","This is to represent our team name") 136 | 137 | # COMMAND ---------- 138 | 139 | text_box_value1=dbutils.widgets.get("team_name") 140 | print("Good Morning ",text_box_value1) 141 | 142 | # COMMAND ---------- 143 | 144 | dbutils.widgets.dropdown("listbox","wd36",["wd32","we43","we45","wd36"],"Team names drop down") 145 | listbox_value2=dbutils.widgets.get("listbox") 146 | print("Good morning",listbox_value2) 147 | 148 | # COMMAND ---------- 149 | 150 | dbutils.widgets.combobox("combobox","we47",["wd32","we43","we45","we47"],"Team names combo box") 151 | 152 | # COMMAND ---------- 153 | 154 | dbutils.widgets.multiselect("multiselect","wd36",["wd32","we43","we45","wd36"],"Team names multiselect") 155 | 156 | # COMMAND ---------- 157 | 158 | dict_all_widgets=dbutils.widgets.getAll() 159 | print(dict_all_widgets) 160 | 161 | # COMMAND ---------- 162 | 163 | # MAGIC %md 164 | # MAGIC #####4. 
Calling a child notebook (example_child_notebook.ipynb) from this parent notebook with parameters 165 | # MAGIC dbutils.widgets.text("param1", "default_value", "Your input parameter") 166 | # MAGIC param_value = dbutils.widgets.get("param1") 167 | # MAGIC print("printing the parameters",param_value) 168 | 169 | # COMMAND ---------- 170 | 171 | child_return_value=dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook", 180,{"table_name":"cities1"}) 172 | 173 | # COMMAND ---------- 174 | 175 | if True: 176 | dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",600) 177 | else: 178 | dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",300) 179 | 180 | # COMMAND ---------- 181 | 182 | import time 183 | for i in range(13): 184 | dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",300) 185 | time.sleep(10) 186 | 187 | # COMMAND ---------- 188 | 189 | dbutils.widgets.removeAll() 190 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/3_Notebook_workflow_utils_notebooks_widgets_invoking_passing_params.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | dbutils.widgets.removeAll() 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC #We are going to learn usage of dbutils (DB Utilities...) + widgets (interesting dbutil) 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %fs ls 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md 16 | # MAGIC #####1. Display the list databricks utils 17 | # MAGIC ######Below dbutils is the comprehensive one, out of which we are going to concentrate currently on notebook, widgets and fs for now 18 | 19 | # COMMAND ---------- 20 | 21 | dbutils.help() 22 | #Some of the important utils... 23 | #fs, notebook, widgets, secrets (security management) 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md 28 | # MAGIC #####2. Notebook's particular utils help 29 | 30 | # COMMAND ---------- 31 | 32 | dbutils.fs.help() 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %md 37 | # MAGIC #####3. Widgets utils help 38 | 39 | # COMMAND ---------- 40 | 41 | dbutils.widgets.help() 42 | #4 Important widgets 43 | #combobox, dropdown, text, multiselect 44 | 45 | # COMMAND ---------- 46 | 47 | # MAGIC %md 48 | # MAGIC #####4. Let's create all those widgets/plugins/components, attach to this notebook, capture the widget content and make use of it... 49 | 50 | # COMMAND ---------- 51 | 52 | dbutils.widgets.removeAll() 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %md 57 | # MAGIC ######A. Text widget 58 | 59 | # COMMAND ---------- 60 | 61 | #creating and attaching a widget (simple and important widget) 62 | dbutils.widgets.text("aspirant_name","Thilaga","enter our aspirant name to wish") 63 | 64 | # COMMAND ---------- 65 | 66 | #capture the widget input in a variable 67 | name_of_aspirant=dbutils.widgets.get("aspirant_name") 68 | #use that variable for some purpose 69 | print(f"Congratulations!!! {name_of_aspirant}") 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC ######B. 
Dropdown widget 75 | 76 | # COMMAND ---------- 77 | 78 | dbutils.widgets.dropdown("aspirant_gender","Female",["Male","Female"]) 79 | gender=dbutils.widgets.get("aspirant_gender") 80 | print(f"Gender of aspirant is {gender}") 81 | 82 | # COMMAND ---------- 83 | 84 | # MAGIC %md 85 | # MAGIC ######C. Combobox widget - Used to choose only one value from the dropdown by searching 86 | 87 | # COMMAND ---------- 88 | 89 | dbutils.widgets.combobox("aspirant_country_combo","India",["India","USA","UK","Canada","Australia"]) 90 | country=dbutils.widgets.get("aspirant_country_combo") 91 | print(f"Country of aspirant is {country}") 92 | 93 | # COMMAND ---------- 94 | 95 | # MAGIC %md 96 | # MAGIC ######D. Multiselect widget - Used to choose multiple values from the dropdown by searching 97 | 98 | # COMMAND ---------- 99 | 100 | dbutils.widgets.multiselect("aspirant_hobbies_multiselect","Dance",["Dance","Music","Sports","Reading","Writing"]) 101 | hobbies=dbutils.widgets.get("aspirant_hobbies_multiselect") 102 | print(f"Hobbies of aspirant are {hobbies}",type(hobbies)) 103 | print("Top and Least hobbies ?", hobbies.split(",")[0],hobbies.split(",")[-1]) 104 | 105 | # COMMAND ---------- 106 | 107 | all_widgets=dbutils.widgets.getAll() 108 | print(all_widgets) 109 | 110 | # COMMAND ---------- 111 | 112 | # MAGIC %md 113 | # MAGIC #####5. Dynamic SQL usecase to try on dropdown widget? 114 | # MAGIC 1. Collect the list of tables present in the catalog/schema/tables 115 | # MAGIC 2. substitute in the dropdown widgets 116 | # MAGIC 3. allow user to choose the respective table and execute the query to return the total number of rows in that table chosen.
117 | # MAGIC 4. How you can explain this in the interview? 118 | # MAGIC .............................................................................................................. 119 | # MAGIC 120 | 121 | # COMMAND ---------- 122 | 123 | # MAGIC %md 124 | # MAGIC #####6. DBUtils FS Commands - For doing DBFS operations 125 | 126 | # COMMAND ---------- 127 | 128 | dbutils.fs.help() 129 | #cp(from: String, to: String, recurse: boolean = false): boolean -> Copies a file or directory, possibly across FileSystems 130 | #head(file: String, maxBytes: int = 65536): String -> Returns up to the first 'maxBytes' bytes of the given file as a String encoded in UTF-8 131 | #ls(dir: String): Seq -> Lists the contents of a directory 132 | #mkdirs(dir: String): boolean -> Creates the given directory if it does not exist, also creating any necessary parent directories 133 | #mv(from: String, to: String, recurse: boolean = false): boolean -> Moves a file or directory, possibly across FileSystems 134 | #put(file: String, contents: String, overwrite: boolean = false): boolean -> Writes the given String out to a file, encoded in UTF-8 135 | #rm(dir: String, recurse: boolean = false): boolean -> Removes a file or directory 136 | 137 | 138 | # COMMAND ---------- 139 | 140 | dbutils.fs.mkdirs("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1") 141 | data="hello team" 142 | dbutils.fs.put("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt",data,True) 143 | dbutils.fs.ls("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1") 144 | print(dbutils.fs.head("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt",5))#Want to see the top 5 bytes of data 145 | dbutils.fs.cp("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt","dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt_copy2.csv") 146 | dbutils.fs.mv("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt_copy2.csv","dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt_moved.csv") 147 | dbutils.fs.rm("dbfs:/Volumes/workspace/default/volumewe47_datalake/directory1/sample.txt") 148 | 149 | # COMMAND ---------- 150 | 151 | # MAGIC %md 152 | # MAGIC #####7. Calling a child notebook (example_child_notebook.ipynb) from this parent notebook with parameters 153 | # MAGIC dbutils.widgets.text("param1", "default_value", "Your input parameter") 154 | # MAGIC param_value = dbutils.widgets.get("param1") 155 | # MAGIC print("printing the parameters",param_value) 156 | 157 | # COMMAND ---------- 158 | 159 | # MAGIC %md 160 | # MAGIC #####Imporantant interview question: Difference between run magic and dbutils command? 161 | # MAGIC A. The %run magic command will run some other notebook inline in this current notebook itself, so we don't have write some other notebook code, rather we can just run it...
162 | # MAGIC B. The dbutils.notebook.run() command will trigger some other notebook in the respective notebook environment itself, and we can add additional parameters such as timeout seconds and custom parameters to the widgets we can pass... 163 | 164 | # COMMAND ---------- 165 | 166 | # MAGIC %run "/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook" 167 | 168 | # COMMAND ---------- 169 | 170 | return_status=dbutils.notebook.run("/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook",90,{"table_name":"cust"}) 171 | print("child notebook ",return_status) 172 | 173 | # COMMAND ---------- 174 | 175 | #####Interview question... 176 | fullname="inceptez technologies" 177 | print(fullname) 178 | fullname_lst=fullname.split(" ") 179 | print(fullname_lst) 180 | fname=fullname.split(" ")[0] 181 | lname=fullname.split(" ")[-1] 182 | print(fname,lname) 183 | -------------------------------------------------------------------------------- /databricks_workouts_2025/2_Spark_DataFrame_Read_Write_Operations/2-Advanced-Readops.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "4a070bc4-17e0-4059-97ef-0e29d09c8cf3", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "### 1. Options for handling quotes & Escape\n", 17 | "\n", 18 | "id,name,remarks\n", 19 | "1,'Ramesh, K.P','Good performer'\n", 20 | "2,'Manoj','Needs ~'special~' attention'" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "application/vnd.databricks.v1+cell": { 27 | "cellMetadata": {}, 28 | "inputWidgets": {}, 29 | "nuid": "2878e608-057e-4b0d-a33d-3ad862025f23", 30 | "showTitle": false, 31 | "tableResultSettingsMap": {}, 32 | "title": "" 33 | } 34 | }, 35 | "source": [ 36 | "### 2. Comments, Multi line, leading and trailing whitespace handling, null and nan handling" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "application/vnd.databricks.v1+cell": { 43 | "cellMetadata": {}, 44 | "inputWidgets": {}, 45 | "nuid": "7a395acd-7233-49e8-806d-800d56de7195", 46 | "showTitle": false, 47 | "tableResultSettingsMap": {}, 48 | "title": "" 49 | } 50 | }, 51 | "source": [ 52 | "### 3. Read modes in csv" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "application/vnd.databricks.v1+cell": { 59 | "cellMetadata": {}, 60 | "inputWidgets": {}, 61 | "nuid": "63c96d50-7254-47d0-9b35-76c8c44bb49d", 62 | "showTitle": false, 63 | "tableResultSettingsMap": {}, 64 | "title": "" 65 | } 66 | }, 67 | "source": [ 68 | "### There are 3 typical read modes and the default read mode is permissive.\n", 69 | "##### 1. permissive — All fields are set to null and corrupted records are placed in a string column called _corrupt_record\n", 70 | "##### \t2. dropMalformed — Drops all rows containing corrupt records.\n", 71 | "##### 3. failFast — Fails when corrupt records are encountered." 
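A minimal PySpark sketch of the three read modes listed above, to make the behaviour concrete (the volume path below is an assumption for illustration; the mode values are the standard Spark option names):

src = "/Volumes/workspace/default/ingestion_volume/employees.csv"   # illustrative path

# PERMISSIVE (default): keep every row; fields that cannot be parsed become null.
# To also capture the raw bad line, include a _corrupt_record string column in an explicit schema.
df_ok = spark.read.option("header", True).option("mode", "PERMISSIVE").csv(src)

# DROPMALFORMED: silently drop any row that does not match the expected structure.
df_drop = spark.read.option("header", True).option("mode", "DROPMALFORMED").csv(src)

# FAILFAST: raise an exception as soon as the first corrupt record is encountered.
df_fail = spark.read.option("header", True).option("mode", "FAILFAST").csv(src)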
72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "application/vnd.databricks.v1+cell": { 78 | "cellMetadata": {}, 79 | "inputWidgets": {}, 80 | "nuid": "b570db0d-c137-41f4-a6a6-43a82eb56d06", 81 | "showTitle": false, 82 | "tableResultSettingsMap": {}, 83 | "title": "" 84 | } 85 | }, 86 | "source": [ 87 | "####4. Max advanced features used...\n", 88 | "\\#This is a commented line and should be ignored\n", 89 | "\"ID\",\"Name\",\"Age\",\"Salary\",\"JoinDate\",\"LastLogin\",\"Notes\"\n", 90 | "1,\"John Doe\",28,45000.50,01-2025-25,2024-01-25 10:15:45,\"New employee\"\n", 91 | "2,\"Jane, Smith\",32,55000.00,2023-12-30,2024-01-25 14:05:10\n", 92 | "3,\"Ravi Kumar\",-1,67000.75,2023-11-05,2024-02-01 08:30:00,\"Null age\",\"addon cols\"\n", 93 | "4,\"李小龍\",45,88000.00,2022-05-18,2024-01-19 13:45:22,\"UTF-8 Chinese name\"\n", 94 | "5,\"Carlos \\\"The Boss\\\" Pérez\",38,72000.30,2023-02-11,2024-01-28 09:55:05,\"Contains quotes\"\n", 95 | "6,\"Manoj\",29,50000,2024-02-10,2024-02-10 17:25:55,\"Line\n", 96 | "break\n", 97 | "inside notes\"\n", 98 | "7,\"Anita\",41,na,2023-10-08,2024-02-02 11:11:11,\"Salary is NaN\"\n", 99 | "8,\"Robert\",34,47000.20,2023-06-22,2024-01-27 18:40:40, \"Leading and trailing spaces\" \n", 100 | "9,\"\",30,39000.00,2023-09-19,2024-01-26 16:20:20,\"Empty name field\"\n", 101 | "10,\"#NotAComment\",37,51000.10,02-2025-25,2024-02-03 12:55:30,\"Starts with # but not a comment\"" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "application/vnd.databricks.v1+cell": { 108 | "cellMetadata": {}, 109 | "inputWidgets": {}, 110 | "nuid": "c793a1ba-d061-45d9-852d-88a5598cd125", 111 | "showTitle": false, 112 | "tableResultSettingsMap": {}, 113 | "title": "" 114 | } 115 | }, 116 | "source": [ 117 | "####5. Reading data from other formats (Try the below usecases after completing the 3-Basic-WriteOps)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "application/vnd.databricks.v1+cell": { 124 | "cellMetadata": {}, 125 | "inputWidgets": {}, 126 | "nuid": "1c69af82-91aa-4854-b6ce-b1eb57c70629", 127 | "showTitle": false, 128 | "tableResultSettingsMap": {}, 129 | "title": "" 130 | } 131 | }, 132 | "source": [ 133 | "####1. Reading csv data" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 0, 139 | "metadata": { 140 | "application/vnd.databricks.v1+cell": { 141 | "cellMetadata": { 142 | "byteLimit": 2048000, 143 | "rowLimit": 10000 144 | }, 145 | "inputWidgets": {}, 146 | "nuid": "6f4039ed-9dec-418d-b54b-a2fcfbd0000a", 147 | "showTitle": false, 148 | "tableResultSettingsMap": {}, 149 | "title": "" 150 | } 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "spark.read.csv(\"/Volumes/workspace/wd36schema/ingestion_volume/target/csvout\").show(2)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "application/vnd.databricks.v1+cell": { 161 | "cellMetadata": {}, 162 | "inputWidgets": {}, 163 | "nuid": "748f2729-c766-4363-ab86-b4d73205c76c", 164 | "showTitle": false, 165 | "tableResultSettingsMap": {}, 166 | "title": "" 167 | } 168 | }, 169 | "source": [ 170 | "####2. 
Reading json data" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 0, 176 | "metadata": { 177 | "application/vnd.databricks.v1+cell": { 178 | "cellMetadata": { 179 | "byteLimit": 2048000, 180 | "rowLimit": 10000 181 | }, 182 | "inputWidgets": {}, 183 | "nuid": "a9f1957d-69c5-4d4e-b452-99bb5683cc24", 184 | "showTitle": false, 185 | "tableResultSettingsMap": {}, 186 | "title": "" 187 | } 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "spark.read.json(\"/Volumes/workspace/wd36schema/ingestion_volume/target/jsonout\").show(2)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "application/vnd.databricks.v1+cell": { 198 | "cellMetadata": {}, 199 | "inputWidgets": {}, 200 | "nuid": "e5df82fe-767f-4ab1-a316-15e9a3e57f38", 201 | "showTitle": false, 202 | "tableResultSettingsMap": {}, 203 | "title": "" 204 | } 205 | }, 206 | "source": [ 207 | "####3. Reading xml data" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 0, 213 | "metadata": { 214 | "application/vnd.databricks.v1+cell": { 215 | "cellMetadata": { 216 | "byteLimit": 2048000, 217 | "rowLimit": 10000 218 | }, 219 | "inputWidgets": {}, 220 | "nuid": "b35a6186-f61a-4c61-9f5f-c5f303fe6b7c", 221 | "showTitle": false, 222 | "tableResultSettingsMap": {}, 223 | "title": "" 224 | } 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "spark.read.xml(\"/Volumes/workspace/wd36schema/ingestion_volume/target/xmlout\",rowTag=\"cust\").show(2)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "application/vnd.databricks.v1+cell": { 235 | "cellMetadata": {}, 236 | "inputWidgets": {}, 237 | "nuid": "e26a3f08-51ab-4b48-89e8-0d077a3becd9", 238 | "showTitle": false, 239 | "tableResultSettingsMap": {}, 240 | "title": "" 241 | } 242 | }, 243 | "source": [ 244 | "####4. Reading serialized data (orc/parquet/delta)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": { 250 | "application/vnd.databricks.v1+cell": { 251 | "cellMetadata": {}, 252 | "inputWidgets": {}, 253 | "nuid": "2d1d6b33-8169-4811-8acd-c94d1e463da8", 254 | "showTitle": false, 255 | "tableResultSettingsMap": {}, 256 | "title": "" 257 | } 258 | }, 259 | "source": [ 260 | "####5. 
Reading delta/hive table data" 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "application/vnd.databricks.v1+notebook": { 266 | "computePreferences": null, 267 | "dashboards": [], 268 | "environmentMetadata": { 269 | "base_environment": "", 270 | "environment_version": "3" 271 | }, 272 | "inputWidgetPreferences": null, 273 | "language": "python", 274 | "notebookMetadata": { 275 | "pythonIndentUnit": 4 276 | }, 277 | "notebookName": "2-Advanced-Readops", 278 | "widgets": {} 279 | }, 280 | "language_info": { 281 | "name": "python" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 0 286 | } 287 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/2_Explore_Notebook_Markdowns.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ea376574-7777-4592-936b-38eb25b6e1d9", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "![Copyright!!](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "application/vnd.databricks.v1+cell": { 23 | "cellMetadata": {}, 24 | "inputWidgets": {}, 25 | "nuid": "96aa1a34-733e-4d2f-b705-1e076af575eb", 26 | "showTitle": false, 27 | "tableResultSettingsMap": {}, 28 | "title": "" 29 | } 30 | }, 31 | "source": [ 32 | "#1. Basics of Python Programing" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "application/vnd.databricks.v1+cell": { 39 | "cellMetadata": {}, 40 | "inputWidgets": {}, 41 | "nuid": "523792e4-d5c5-4831-9694-7243855d6ead", 42 | "showTitle": false, 43 | "tableResultSettingsMap": {}, 44 | "title": "" 45 | } 46 | }, 47 | "source": [ 48 | "##A. Python is an indent based programming language\n", 49 | "Why Python uses indend based programing ->\n", 50 | "1. Managing the program more efficiently\n", 51 | "2. Better Readablility of the code\n", 52 | "3. For creating the hierarchy of programming.\n", 53 | "4. By default 4 spaces we will give for indends, but more/less spaces or tabs also can be used..." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 0, 59 | "metadata": { 60 | "application/vnd.databricks.v1+cell": { 61 | "cellMetadata": {}, 62 | "inputWidgets": {}, 63 | "nuid": "289c8a44-dee8-4046-9f80-a34ba4707f9b", 64 | "showTitle": false, 65 | "tableResultSettingsMap": {}, 66 | "title": "" 67 | } 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "%python\n", 72 | "aspirants_list=['Jeeva','Bharathi','Vaanmathy','Nag']\n", 73 | "for aspirants in aspirants_list:\n", 74 | " print(\"good afternoon \",aspirants)\n", 75 | "print(\"good after all aspirants\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "application/vnd.databricks.v1+cell": { 82 | "cellMetadata": {}, 83 | "inputWidgets": {}, 84 | "nuid": "320756df-0857-43ea-817d-a382622f54b2", 85 | "showTitle": false, 86 | "tableResultSettingsMap": {}, 87 | "title": "" 88 | } 89 | }, 90 | "source": [ 91 | "##B. 
This is a commented line in Python" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 0, 97 | "metadata": { 98 | "application/vnd.databricks.v1+cell": { 99 | "cellMetadata": {}, 100 | "inputWidgets": {}, 101 | "nuid": "01f6ba35-cb48-45e2-93c7-9d56d31381ef", 102 | "showTitle": false, 103 | "tableResultSettingsMap": {}, 104 | "title": "" 105 | } 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "%python\n", 110 | "#1. Single line comment - use # in the starting\n", 111 | "'''2.Multi line comment''' \n", 112 | "# - use ''' comment ''' or \"\"\" comment \"\"\"" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": { 118 | "application/vnd.databricks.v1+cell": { 119 | "cellMetadata": { 120 | "byteLimit": 2048000, 121 | "rowLimit": 10000 122 | }, 123 | "inputWidgets": {}, 124 | "nuid": "a7967bfd-6528-4b42-bd8b-8f8f251fba02", 125 | "showTitle": false, 126 | "tableResultSettingsMap": {}, 127 | "title": "" 128 | } 129 | }, 130 | "source": [ 131 | "#Main Heading1 using #
How to do some markdowns design
using the magic command" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": { 137 | "application/vnd.databricks.v1+cell": { 138 | "cellMetadata": { 139 | "byteLimit": 2048000, 140 | "rowLimit": 10000 141 | }, 142 | "inputWidgets": {}, 143 | "nuid": "e8d1dcea-9fc3-45db-86c5-43c19d75710a", 144 | "showTitle": false, 145 | "tableResultSettingsMap": {}, 146 | "title": "" 147 | } 148 | }, 149 | "source": [ 150 | "## Main Heading2 - prefix with \"2#\"" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "application/vnd.databricks.v1+cell": { 157 | "cellMetadata": {}, 158 | "inputWidgets": {}, 159 | "nuid": "86589943-f7b2-476a-b758-1efffeb2a73e", 160 | "showTitle": false, 161 | "tableResultSettingsMap": {}, 162 | "title": "" 163 | } 164 | }, 165 | "source": [ 166 | "### Main Heading3 - prefix with \"3#\"" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "application/vnd.databricks.v1+cell": { 173 | "cellMetadata": { 174 | "byteLimit": 2048000, 175 | "rowLimit": 10000 176 | }, 177 | "inputWidgets": {}, 178 | "nuid": "7c9fc9b4-2d30-4bb3-8932-01495c8b5786", 179 | "showTitle": false, 180 | "tableResultSettingsMap": {}, 181 | "title": "" 182 | } 183 | }, 184 | "source": [ 185 | "#### Sub Heading1 - prefix with \"max 4#\"" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "application/vnd.databricks.v1+cell": { 192 | "cellMetadata": { 193 | "byteLimit": 2048000, 194 | "rowLimit": 10000 195 | }, 196 | "inputWidgets": {}, 197 | "nuid": "7b8d623b-f654-4e28-9399-6e1acad1ffc0", 198 | "showTitle": false, 199 | "tableResultSettingsMap": {}, 200 | "title": "" 201 | } 202 | }, 203 | "source": [ 204 | "##### Sub Heading2 - prefix with \"max 5#\"" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "application/vnd.databricks.v1+cell": { 211 | "cellMetadata": { 212 | "byteLimit": 2048000, 213 | "rowLimit": 10000 214 | }, 215 | "inputWidgets": {}, 216 | "nuid": "486d7884-73e3-4a60-9fc2-2ef0a49fc2b4", 217 | "showTitle": false, 218 | "tableResultSettingsMap": {}, 219 | "title": "" 220 | } 221 | }, 222 | "source": [ 223 | "###### Sub Heading3 - prefix with \"max 6#\"" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "application/vnd.databricks.v1+cell": { 230 | "cellMetadata": {}, 231 | "inputWidgets": {}, 232 | "nuid": "8b40e4c3-aed9-437a-86dc-0c62ce3d5751", 233 | "showTitle": false, 234 | "tableResultSettingsMap": {}, 235 | "title": "" 236 | } 237 | }, 238 | "source": [ 239 | "####### Sub Heading3 - prefix with \"max 6#\"" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "application/vnd.databricks.v1+cell": { 246 | "cellMetadata": { 247 | "byteLimit": 2048000, 248 | "rowLimit": 10000 249 | }, 250 | "inputWidgets": {}, 251 | "nuid": "d8bf1fd6-2aac-4e22-8e88-27b54eb3de92", 252 | "showTitle": false, 253 | "tableResultSettingsMap": {}, 254 | "title": "" 255 | } 256 | }, 257 | "source": [ 258 | "######Lets learn about bold\n", 259 | "1. Bold - using html tagging \n", 260 | "2. 
**Bold** - prefixed and suffixed with **" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "application/vnd.databricks.v1+cell": { 267 | "cellMetadata": { 268 | "byteLimit": 2048000, 269 | "rowLimit": 10000 270 | }, 271 | "inputWidgets": {}, 272 | "nuid": "2ca1c554-d730-451d-9dce-6ee98bb9d136", 273 | "showTitle": false, 274 | "tableResultSettingsMap": {}, 275 | "title": "" 276 | } 277 | }, 278 | "source": [ 279 | "###### Lets learn about Italics\n", 280 | "*Italics* - prefixed and suffixed with *" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "application/vnd.databricks.v1+cell": { 287 | "cellMetadata": { 288 | "byteLimit": 2048000, 289 | "rowLimit": 10000 290 | }, 291 | "inputWidgets": {}, 292 | "nuid": "ea06f315-b1ab-4e41-b9e0-7b561e633cf9", 293 | "showTitle": false, 294 | "tableResultSettingsMap": {}, 295 | "title": "" 296 | } 297 | }, 298 | "source": [ 299 | "###### Lets learn about bullet points\n", 300 | "\n", 301 | "- bullet points - prefix with -\n", 302 | "- bullet points - prefix with -" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "application/vnd.databricks.v1+cell": { 309 | "cellMetadata": { 310 | "byteLimit": 2048000, 311 | "rowLimit": 10000 312 | }, 313 | "inputWidgets": {}, 314 | "nuid": "6e366d03-811c-4548-9a77-b38ca97d2ac4", 315 | "showTitle": false, 316 | "tableResultSettingsMap": {}, 317 | "title": "" 318 | } 319 | }, 320 | "source": [ 321 | "###### Lets learn about Color codes\n", 322 | "$${\\color{pink}Text}$$\n", 323 | "$${\\color{black}Black-color}$$\n", 324 | "$${\\color{red}Red}$$\n", 325 | "$${\\color{green}Green}$$\n", 326 | "$${\\color{blue}Blue}$$\t" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "application/vnd.databricks.v1+cell": { 333 | "cellMetadata": { 334 | "byteLimit": 2048000, 335 | "rowLimit": 10000 336 | }, 337 | "inputWidgets": {}, 338 | "nuid": "a8a1de18-2d42-447e-a677-210fc56e138e", 339 | "showTitle": false, 340 | "tableResultSettingsMap": {}, 341 | "title": "" 342 | } 343 | }, 344 | "source": [ 345 | "###### Lets learn about Embedding urls\n", 346 | "Click here for [Inceptez Webpage](https://www.inceptez.in/)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": { 352 | "application/vnd.databricks.v1+cell": { 353 | "cellMetadata": { 354 | "byteLimit": 2048000, 355 | "rowLimit": 10000 356 | }, 357 | "inputWidgets": {}, 358 | "nuid": "f88a1ce7-b687-434f-81ed-5dd7ec5c0e6b", 359 | "showTitle": false, 360 | "tableResultSettingsMap": {}, 361 | "title": "" 362 | } 363 | }, 364 | "source": [ 365 | "######To learn markdowns more in detail\n", 366 | "Click here [Microsoft markdown cheatsheet](https://docs.databricks.com/aws/en/notebooks/notebook-media)" 367 | ] 368 | } 369 | ], 370 | "metadata": { 371 | "application/vnd.databricks.v1+notebook": { 372 | "computePreferences": { 373 | "hardware": { 374 | "accelerator": null, 375 | "gpuPoolId": null, 376 | "memory": null 377 | } 378 | }, 379 | "dashboards": [], 380 | "environmentMetadata": { 381 | "base_environment": "", 382 | "environment_version": "4" 383 | }, 384 | "inputWidgetPreferences": null, 385 | "language": "python", 386 | "notebookMetadata": { 387 | "pythonIndentUnit": 4 388 | }, 389 | "notebookName": "2_Explore_Notebook_Markdowns", 390 | "widgets": {} 391 | }, 392 | "language_info": { 393 | "name": "python" 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 0 398 | } 399 | 
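A short recap sketch, not part of the original notebook: the markdown elements demonstrated above combined into a single cell, written in the Databricks ".py source" style used by the other notebooks in this repo (the title, status text and list items are illustrative; the link and image URLs are the ones used above):

# MAGIC %md
# MAGIC #Customer Data Load
# MAGIC ##### Documents the daily load of the customer feed
# MAGIC **Owner:** *Data Engineering team*
# MAGIC - Step 1: land the file in a Unity Catalog volume
# MAGIC - Step 2: read it into a DataFrame and save it as a table
# MAGIC $${\color{green}Status - healthy}$$
# MAGIC Click here for [Inceptez Webpage](https://www.inceptez.in/)
# MAGIC ![](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)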
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/2_Usecase_md_dbutils_widgets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "411d4d08-41b4-4ba2-9976-bc04181fd083", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "![](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "application/vnd.databricks.v1+cell": { 23 | "cellMetadata": {}, 24 | "inputWidgets": {}, 25 | "nuid": "12568f05-eab3-411e-a659-99ab5018dde4", 26 | "showTitle": false, 27 | "tableResultSettingsMap": {}, 28 | "title": "" 29 | } 30 | }, 31 | "source": [ 32 | "# Healthcare Data Utilities Usecase2\n", 33 | "\n", 34 | "## Objective\n", 35 | "This notebook demonstrates how to design Databricks notebook using Markdown\n", 36 | "and how to work with Databricks utilities such as dbutils.fs, dbutils.widgets,\n", 37 | "and dbutils.notebook using Volumes.\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "application/vnd.databricks.v1+cell": { 44 | "cellMetadata": {}, 45 | "inputWidgets": {}, 46 | "nuid": "c156024f-930c-4e29-82c5-5620442bfab3", 47 | "showTitle": false, 48 | "tableResultSettingsMap": {}, 49 | "title": "" 50 | } 51 | }, 52 | "source": [ 53 | "## Project Workflow\n", 54 | "1. Create folder structure using Volumes\n", 55 | "2. Create sample healthcare data\n", 56 | "3. Perform file operations using dbutils.fs\n", 57 | "4. Parameterize execution using widgets\n", 58 | "5. 
Exit notebook with execution status" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "application/vnd.databricks.v1+cell": { 65 | "cellMetadata": {}, 66 | "inputWidgets": {}, 67 | "nuid": "6183ff53-1730-4c06-9a40-10ad0589d66f", 68 | "showTitle": false, 69 | "tableResultSettingsMap": {}, 70 | "title": "" 71 | } 72 | }, 73 | "source": [ 74 | "## Folder Structure\n", 75 | "\n", 76 | "| Folder | Purpose |\n", 77 | "|------|---------|\n", 78 | "| raw | Incoming healthcare files |\n", 79 | "| processed | Validated healthcare data |\n", 80 | "| archive | Historical data |\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "application/vnd.databricks.v1+cell": { 87 | "cellMetadata": {}, 88 | "inputWidgets": {}, 89 | "nuid": "8dbd7c13-06c7-4313-bae5-f889c84da16c", 90 | "showTitle": false, 91 | "tableResultSettingsMap": {}, 92 | "title": "" 93 | } 94 | }, 95 | "source": [ 96 | "## Learning Outcome\n", 97 | "Our Aspirants will understand notebook design, parameterization, and fs, notebook, widgets using Databricks utilities." 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "application/vnd.databricks.v1+cell": { 104 | "cellMetadata": {}, 105 | "inputWidgets": {}, 106 | "nuid": "a7ad7451-2275-4a6c-ab9e-cfe5de48106f", 107 | "showTitle": false, 108 | "tableResultSettingsMap": {}, 109 | "title": "" 110 | } 111 | }, 112 | "source": [ 113 | "1. Define Base Paths using python variable
\n", 114 | "base_path = \"/Volumes/workspace/default/volumewd36\"
\n", 115 | "Create raw_path, processed_path and archive_path as given below...
\n", 116 | "raw_path = f\"{base_path}/raw\"
\n", 117 | "processed_path = f\"{base_path}/processed\"
\n", 118 | "archive_path = f\"{base_path}/archive\"" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 0, 124 | "metadata": { 125 | "application/vnd.databricks.v1+cell": { 126 | "cellMetadata": {}, 127 | "inputWidgets": {}, 128 | "nuid": "6a6fbdfb-0a62-46c7-a639-02474c550929", 129 | "showTitle": false, 130 | "tableResultSettingsMap": {}, 131 | "title": "" 132 | } 133 | }, 134 | "outputs": [], 135 | "source": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "application/vnd.databricks.v1+cell": { 141 | "cellMetadata": {}, 142 | "inputWidgets": {}, 143 | "nuid": "061e8e6c-65be-477e-a831-a333b88ae3b0", 144 | "showTitle": false, 145 | "tableResultSettingsMap": {}, 146 | "title": "" 147 | } 148 | }, 149 | "source": [ 150 | "2. dbutils Usecase – Create Directories using the above path variables.." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 0, 156 | "metadata": { 157 | "application/vnd.databricks.v1+cell": { 158 | "cellMetadata": {}, 159 | "inputWidgets": {}, 160 | "nuid": "0d397356-1c43-4fdc-9190-d00eb33c0338", 161 | "showTitle": false, 162 | "tableResultSettingsMap": {}, 163 | "title": "" 164 | } 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "application/vnd.databricks.v1+cell": { 173 | "cellMetadata": {}, 174 | "inputWidgets": {}, 175 | "nuid": "ab99fabf-7144-4430-aabe-d42f3ffdfec5", 176 | "showTitle": false, 177 | "tableResultSettingsMap": {}, 178 | "title": "" 179 | } 180 | }, 181 | "source": [ 182 | "3. dbutils Usecase – Create Sample Healthcare File
\n", 183 | "sample_data = \"\"\"patient_id,patient_name,age,gender\n", 184 | "1,John Doe,68,M\n", 185 | "2,Jane Smith,54,F\n", 186 | "\"\"\"\n", 187 | "\n", 188 | "TODO: Write this file content into raw folder created earlier... using dbutils.fs......." 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 0, 194 | "metadata": { 195 | "application/vnd.databricks.v1+cell": { 196 | "cellMetadata": {}, 197 | "inputWidgets": {}, 198 | "nuid": "80f89788-6f61-4a6c-ab53-e6a07c323c03", 199 | "showTitle": false, 200 | "tableResultSettingsMap": {}, 201 | "title": "" 202 | } 203 | }, 204 | "outputs": [], 205 | "source": [] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "application/vnd.databricks.v1+cell": { 211 | "cellMetadata": {}, 212 | "inputWidgets": {}, 213 | "nuid": "072284ab-41b9-44d0-b9ea-da3658ab19fe", 214 | "showTitle": false, 215 | "tableResultSettingsMap": {}, 216 | "title": "" 217 | } 218 | }, 219 | "source": [ 220 | "4. dbutils Usecase - list the file created
\n", 221 | "TODO: List all files available in the raw folder using the dbutils command
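# --- Added example (illustrative sketch, not part of the original exercise): one possible way to do steps 2-4 above.
# It assumes raw_path, processed_path and archive_path are defined exactly as shown in step 1,
# and the file name "patients.csv" is only an illustration.
for p in (raw_path, processed_path, archive_path):
    dbutils.fs.mkdirs(p)                                                  # step 2: create the directories

sample_csv = "patient_id,patient_name,age,gender\n1,John Doe,68,M\n2,Jane Smith,54,F\n"
dbutils.fs.put(f"{raw_path}/patients.csv", sample_csv, overwrite=True)    # step 3: write the sample file into raw
display(dbutils.fs.ls(raw_path))                                          # step 4: confirm the file is present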
\n", 222 | "dbutils.fs......" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 0, 228 | "metadata": { 229 | "application/vnd.databricks.v1+cell": { 230 | "cellMetadata": {}, 231 | "inputWidgets": {}, 232 | "nuid": "1fde88db-8cf8-4795-8733-b9e7a64f3751", 233 | "showTitle": false, 234 | "tableResultSettingsMap": {}, 235 | "title": "" 236 | } 237 | }, 238 | "outputs": [], 239 | "source": [] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "application/vnd.databricks.v1+cell": { 245 | "cellMetadata": {}, 246 | "inputWidgets": {}, 247 | "nuid": "da30d1c1-9a8b-4a8c-babf-a44bc10f2415", 248 | "showTitle": false, 249 | "tableResultSettingsMap": {}, 250 | "title": "" 251 | } 252 | }, 253 | "source": [ 254 | "5. dbutils Usecase – Copy File (raw → processed)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 0, 260 | "metadata": { 261 | "application/vnd.databricks.v1+cell": { 262 | "cellMetadata": {}, 263 | "inputWidgets": {}, 264 | "nuid": "db57a210-0d54-4cb1-91b1-445f4bb05101", 265 | "showTitle": false, 266 | "tableResultSettingsMap": {}, 267 | "title": "" 268 | } 269 | }, 270 | "outputs": [], 271 | "source": [] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "application/vnd.databricks.v1+cell": { 277 | "cellMetadata": {}, 278 | "inputWidgets": {}, 279 | "nuid": "7ec73c87-5fcd-40c4-bb12-adf6bb15e1cb", 280 | "showTitle": false, 281 | "tableResultSettingsMap": {}, 282 | "title": "" 283 | } 284 | }, 285 | "source": [ 286 | "6. dbutils widget usecase - Create dropdown and text widgets...
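# --- Added example (illustrative only): one way to create the two widgets requested in the TODO items that follow.
# The widget names ("environment", "owner") and default values are assumptions, not part of the original notebook.
dbutils.widgets.dropdown("environment", "dev", ["dev", "qa", "prod"])   # dropdown widget: name, default, choices
dbutils.widgets.text("owner", "")                                        # free-text widget for the owner name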
\n", 287 | "TODO: Create a dropdown widget for environment (dev, qa, prod) using
\n", 288 | "TODO: Create a text widget for owner name\n", 289 | "\n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 0, 295 | "metadata": { 296 | "application/vnd.databricks.v1+cell": { 297 | "cellMetadata": {}, 298 | "inputWidgets": {}, 299 | "nuid": "ba2d9ddd-c9cf-4ac0-b09b-5b509b3c686e", 300 | "showTitle": false, 301 | "tableResultSettingsMap": {}, 302 | "title": "" 303 | } 304 | }, 305 | "outputs": [], 306 | "source": [] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "application/vnd.databricks.v1+cell": { 312 | "cellMetadata": {}, 313 | "inputWidgets": {}, 314 | "nuid": "530d9f4b-7142-41de-a23b-e0e2ed4b6f4f", 315 | "showTitle": false, 316 | "tableResultSettingsMap": {}, 317 | "title": "" 318 | } 319 | }, 320 | "source": [ 321 | "7. dbutils widget Usecase – Read Widget Values environment and owner and print in the screen" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 0, 327 | "metadata": { 328 | "application/vnd.databricks.v1+cell": { 329 | "cellMetadata": {}, 330 | "inputWidgets": {}, 331 | "nuid": "54aaeb8a-a4c5-47a6-941c-18bd24732d67", 332 | "showTitle": false, 333 | "tableResultSettingsMap": {}, 334 | "title": "" 335 | } 336 | }, 337 | "outputs": [], 338 | "source": [] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": { 343 | "application/vnd.databricks.v1+cell": { 344 | "cellMetadata": {}, 345 | "inputWidgets": {}, 346 | "nuid": "c3bdfb8c-d886-43ca-8cf0-d61bbc1f883f", 347 | "showTitle": false, 348 | "tableResultSettingsMap": {}, 349 | "title": "" 350 | } 351 | }, 352 | "source": [ 353 | "8. dbutils widget Usecase – Move the above processed File to Archive" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 0, 359 | "metadata": { 360 | "application/vnd.databricks.v1+cell": { 361 | "cellMetadata": {}, 362 | "inputWidgets": {}, 363 | "nuid": "2697f7a9-fed9-4e37-abc5-e8f1dc36aa48", 364 | "showTitle": false, 365 | "tableResultSettingsMap": {}, 366 | "title": "" 367 | } 368 | }, 369 | "outputs": [], 370 | "source": [] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "application/vnd.databricks.v1+cell": { 376 | "cellMetadata": {}, 377 | "inputWidgets": {}, 378 | "nuid": "69804d19-6262-43aa-83f8-a71da479aad2", 379 | "showTitle": false, 380 | "tableResultSettingsMap": {}, 381 | "title": "" 382 | } 383 | }, 384 | "source": [ 385 | "9. dbutils notebook usecase - Run the notebook4 using the dbutils command\n", 386 | "/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 0, 392 | "metadata": { 393 | "application/vnd.databricks.v1+cell": { 394 | "cellMetadata": {}, 395 | "inputWidgets": {}, 396 | "nuid": "a54cd602-18c3-4ca4-8311-e4787f69fd25", 397 | "showTitle": false, 398 | "tableResultSettingsMap": {}, 399 | "title": "" 400 | } 401 | }, 402 | "outputs": [], 403 | "source": [] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "application/vnd.databricks.v1+cell": { 409 | "cellMetadata": {}, 410 | "inputWidgets": {}, 411 | "nuid": "f1191fc8-a393-47f3-882c-c24ae95959d9", 412 | "showTitle": false, 413 | "tableResultSettingsMap": {}, 414 | "title": "" 415 | } 416 | }, 417 | "source": [ 418 | "10. 
dbutils notebook usecase - exit this notebook \n", 419 | "TODO: Exit notebook with a success message\n", 420 | "dbutils.notebook._____(\"Pipeline completed successfully\")\n" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 0, 426 | "metadata": { 427 | "application/vnd.databricks.v1+cell": { 428 | "cellMetadata": {}, 429 | "inputWidgets": {}, 430 | "nuid": "6ec32fa5-0791-48f4-a54f-f1294e1146c2", 431 | "showTitle": false, 432 | "tableResultSettingsMap": {}, 433 | "title": "" 434 | } 435 | }, 436 | "outputs": [], 437 | "source": [] 438 | } 439 | ], 440 | "metadata": { 441 | "application/vnd.databricks.v1+notebook": { 442 | "computePreferences": null, 443 | "dashboards": [], 444 | "environmentMetadata": { 445 | "base_environment": "", 446 | "environment_version": "4" 447 | }, 448 | "inputWidgetPreferences": null, 449 | "language": "python", 450 | "notebookMetadata": { 451 | "pythonIndentUnit": 4 452 | }, 453 | "notebookName": "2_Usecase_md_dbutils_widgets", 454 | "widgets": {} 455 | }, 456 | "language_info": { 457 | "name": "python" 458 | } 459 | }, 460 | "nbformat": 4, 461 | "nbformat_minor": 0 462 | } 463 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/1_USECASES_NB_FUNDAMENTALS/2_Usecase_md_dbutils_widgets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "411d4d08-41b4-4ba2-9976-bc04181fd083", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "![](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "application/vnd.databricks.v1+cell": { 23 | "cellMetadata": {}, 24 | "inputWidgets": {}, 25 | "nuid": "12568f05-eab3-411e-a659-99ab5018dde4", 26 | "showTitle": false, 27 | "tableResultSettingsMap": {}, 28 | "title": "" 29 | } 30 | }, 31 | "source": [ 32 | "# Healthcare Data Utilities Usecase2\n", 33 | "\n", 34 | "## Objective\n", 35 | "This notebook demonstrates how to design Databricks notebook using Markdown\n", 36 | "and how to work with Databricks utilities such as dbutils.fs, dbutils.widgets,\n", 37 | "and dbutils.notebook using Volumes.\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "application/vnd.databricks.v1+cell": { 44 | "cellMetadata": {}, 45 | "inputWidgets": {}, 46 | "nuid": "c156024f-930c-4e29-82c5-5620442bfab3", 47 | "showTitle": false, 48 | "tableResultSettingsMap": {}, 49 | "title": "" 50 | } 51 | }, 52 | "source": [ 53 | "## Project Workflow\n", 54 | "1. Create folder structure using Volumes\n", 55 | "2. Create sample healthcare data\n", 56 | "3. Perform file operations using dbutils.fs\n", 57 | "4. Parameterize execution using widgets\n", 58 | "5. 
Exit notebook with execution status" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "application/vnd.databricks.v1+cell": { 65 | "cellMetadata": {}, 66 | "inputWidgets": {}, 67 | "nuid": "6183ff53-1730-4c06-9a40-10ad0589d66f", 68 | "showTitle": false, 69 | "tableResultSettingsMap": {}, 70 | "title": "" 71 | } 72 | }, 73 | "source": [ 74 | "## Folder Structure\n", 75 | "\n", 76 | "| Folder | Purpose |\n", 77 | "|------|---------|\n", 78 | "| raw | Incoming healthcare files |\n", 79 | "| processed | Validated healthcare data |\n", 80 | "| archive | Historical data |\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "application/vnd.databricks.v1+cell": { 87 | "cellMetadata": {}, 88 | "inputWidgets": {}, 89 | "nuid": "8dbd7c13-06c7-4313-bae5-f889c84da16c", 90 | "showTitle": false, 91 | "tableResultSettingsMap": {}, 92 | "title": "" 93 | } 94 | }, 95 | "source": [ 96 | "## Learning Outcome\n", 97 | "Our Aspirants will understand notebook design, parameterization, and fs, notebook, widgets using Databricks utilities." 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "application/vnd.databricks.v1+cell": { 104 | "cellMetadata": {}, 105 | "inputWidgets": {}, 106 | "nuid": "a7ad7451-2275-4a6c-ab9e-cfe5de48106f", 107 | "showTitle": false, 108 | "tableResultSettingsMap": {}, 109 | "title": "" 110 | } 111 | }, 112 | "source": [ 113 | "1. Define Base Paths using python variable
\n", 114 | "base_path = \"/Volumes/workspace/default/volumewd36\"
\n", 115 | "Create raw_path, processed_path and archive_path as given below...
\n", 116 | "raw_path = f\"{base_path}/raw\"
\n", 117 | "processed_path = f\"{base_path}/processed\"
\n", 118 | "archive_path = f\"{base_path}/archive\"" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 0, 124 | "metadata": { 125 | "application/vnd.databricks.v1+cell": { 126 | "cellMetadata": {}, 127 | "inputWidgets": {}, 128 | "nuid": "6a6fbdfb-0a62-46c7-a639-02474c550929", 129 | "showTitle": false, 130 | "tableResultSettingsMap": {}, 131 | "title": "" 132 | } 133 | }, 134 | "outputs": [], 135 | "source": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "application/vnd.databricks.v1+cell": { 141 | "cellMetadata": {}, 142 | "inputWidgets": {}, 143 | "nuid": "061e8e6c-65be-477e-a831-a333b88ae3b0", 144 | "showTitle": false, 145 | "tableResultSettingsMap": {}, 146 | "title": "" 147 | } 148 | }, 149 | "source": [ 150 | "2. dbutils Usecase – Create Directories using the above path variables.." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 0, 156 | "metadata": { 157 | "application/vnd.databricks.v1+cell": { 158 | "cellMetadata": {}, 159 | "inputWidgets": {}, 160 | "nuid": "0d397356-1c43-4fdc-9190-d00eb33c0338", 161 | "showTitle": false, 162 | "tableResultSettingsMap": {}, 163 | "title": "" 164 | } 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "application/vnd.databricks.v1+cell": { 173 | "cellMetadata": {}, 174 | "inputWidgets": {}, 175 | "nuid": "ab99fabf-7144-4430-aabe-d42f3ffdfec5", 176 | "showTitle": false, 177 | "tableResultSettingsMap": {}, 178 | "title": "" 179 | } 180 | }, 181 | "source": [ 182 | "3. dbutils Usecase – Create Sample Healthcare File
\n", 183 | "sample_data = \"\"\"patient_id,patient_name,age,gender\n", 184 | "1,John Doe,68,M\n", 185 | "2,Jane Smith,54,F\n", 186 | "\"\"\"\n", 187 | "\n", 188 | "TODO: Write this file content into raw folder created earlier... using dbutils.fs......." 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 0, 194 | "metadata": { 195 | "application/vnd.databricks.v1+cell": { 196 | "cellMetadata": {}, 197 | "inputWidgets": {}, 198 | "nuid": "80f89788-6f61-4a6c-ab53-e6a07c323c03", 199 | "showTitle": false, 200 | "tableResultSettingsMap": {}, 201 | "title": "" 202 | } 203 | }, 204 | "outputs": [], 205 | "source": [] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "application/vnd.databricks.v1+cell": { 211 | "cellMetadata": {}, 212 | "inputWidgets": {}, 213 | "nuid": "072284ab-41b9-44d0-b9ea-da3658ab19fe", 214 | "showTitle": false, 215 | "tableResultSettingsMap": {}, 216 | "title": "" 217 | } 218 | }, 219 | "source": [ 220 | "4. dbutils Usecase - list the file created
\n", 221 | "TODO: List all files available in the raw folder using the dbutils command
\n", 222 | "dbutils.fs......" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 0, 228 | "metadata": { 229 | "application/vnd.databricks.v1+cell": { 230 | "cellMetadata": {}, 231 | "inputWidgets": {}, 232 | "nuid": "1fde88db-8cf8-4795-8733-b9e7a64f3751", 233 | "showTitle": false, 234 | "tableResultSettingsMap": {}, 235 | "title": "" 236 | } 237 | }, 238 | "outputs": [], 239 | "source": [] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "application/vnd.databricks.v1+cell": { 245 | "cellMetadata": {}, 246 | "inputWidgets": {}, 247 | "nuid": "da30d1c1-9a8b-4a8c-babf-a44bc10f2415", 248 | "showTitle": false, 249 | "tableResultSettingsMap": {}, 250 | "title": "" 251 | } 252 | }, 253 | "source": [ 254 | "5. dbutils Usecase – Copy File (raw → processed)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 0, 260 | "metadata": { 261 | "application/vnd.databricks.v1+cell": { 262 | "cellMetadata": {}, 263 | "inputWidgets": {}, 264 | "nuid": "db57a210-0d54-4cb1-91b1-445f4bb05101", 265 | "showTitle": false, 266 | "tableResultSettingsMap": {}, 267 | "title": "" 268 | } 269 | }, 270 | "outputs": [], 271 | "source": [] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "application/vnd.databricks.v1+cell": { 277 | "cellMetadata": {}, 278 | "inputWidgets": {}, 279 | "nuid": "7ec73c87-5fcd-40c4-bb12-adf6bb15e1cb", 280 | "showTitle": false, 281 | "tableResultSettingsMap": {}, 282 | "title": "" 283 | } 284 | }, 285 | "source": [ 286 | "6. dbutils widget usecase - Create dropdown and text widgets...
\n", 287 | "TODO: Create a dropdown widget for environment (dev, qa, prod) using
\n", 288 | "TODO: Create a text widget for owner name\n", 289 | "\n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 0, 295 | "metadata": { 296 | "application/vnd.databricks.v1+cell": { 297 | "cellMetadata": {}, 298 | "inputWidgets": {}, 299 | "nuid": "ba2d9ddd-c9cf-4ac0-b09b-5b509b3c686e", 300 | "showTitle": false, 301 | "tableResultSettingsMap": {}, 302 | "title": "" 303 | } 304 | }, 305 | "outputs": [], 306 | "source": [] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "application/vnd.databricks.v1+cell": { 312 | "cellMetadata": {}, 313 | "inputWidgets": {}, 314 | "nuid": "530d9f4b-7142-41de-a23b-e0e2ed4b6f4f", 315 | "showTitle": false, 316 | "tableResultSettingsMap": {}, 317 | "title": "" 318 | } 319 | }, 320 | "source": [ 321 | "7. dbutils widget Usecase – Read Widget Values environment and owner and print in the screen" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 0, 327 | "metadata": { 328 | "application/vnd.databricks.v1+cell": { 329 | "cellMetadata": {}, 330 | "inputWidgets": {}, 331 | "nuid": "54aaeb8a-a4c5-47a6-941c-18bd24732d67", 332 | "showTitle": false, 333 | "tableResultSettingsMap": {}, 334 | "title": "" 335 | } 336 | }, 337 | "outputs": [], 338 | "source": [] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": { 343 | "application/vnd.databricks.v1+cell": { 344 | "cellMetadata": {}, 345 | "inputWidgets": {}, 346 | "nuid": "c3bdfb8c-d886-43ca-8cf0-d61bbc1f883f", 347 | "showTitle": false, 348 | "tableResultSettingsMap": {}, 349 | "title": "" 350 | } 351 | }, 352 | "source": [ 353 | "8. dbutils widget Usecase – Move the above processed File to Archive" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 0, 359 | "metadata": { 360 | "application/vnd.databricks.v1+cell": { 361 | "cellMetadata": {}, 362 | "inputWidgets": {}, 363 | "nuid": "2697f7a9-fed9-4e37-abc5-e8f1dc36aa48", 364 | "showTitle": false, 365 | "tableResultSettingsMap": {}, 366 | "title": "" 367 | } 368 | }, 369 | "outputs": [], 370 | "source": [] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "application/vnd.databricks.v1+cell": { 376 | "cellMetadata": {}, 377 | "inputWidgets": {}, 378 | "nuid": "69804d19-6262-43aa-83f8-a71da479aad2", 379 | "showTitle": false, 380 | "tableResultSettingsMap": {}, 381 | "title": "" 382 | } 383 | }, 384 | "source": [ 385 | "9. dbutils notebook usecase - Run the notebook4 using the dbutils command\n", 386 | "/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_USECASES_NB_FUNDAMENTALS/4_child_nb_dataload" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 0, 392 | "metadata": { 393 | "application/vnd.databricks.v1+cell": { 394 | "cellMetadata": {}, 395 | "inputWidgets": {}, 396 | "nuid": "a54cd602-18c3-4ca4-8311-e4787f69fd25", 397 | "showTitle": false, 398 | "tableResultSettingsMap": {}, 399 | "title": "" 400 | } 401 | }, 402 | "outputs": [], 403 | "source": [] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "application/vnd.databricks.v1+cell": { 409 | "cellMetadata": {}, 410 | "inputWidgets": {}, 411 | "nuid": "f1191fc8-a393-47f3-882c-c24ae95959d9", 412 | "showTitle": false, 413 | "tableResultSettingsMap": {}, 414 | "title": "" 415 | } 416 | }, 417 | "source": [ 418 | "10. 
dbutils notebook usecase - exit this notebook \n", 419 | "TODO: Exit notebook with a success message\n", 420 | "dbutils.notebook._____(\"Pipeline completed successfully\")\n" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 0, 426 | "metadata": { 427 | "application/vnd.databricks.v1+cell": { 428 | "cellMetadata": {}, 429 | "inputWidgets": {}, 430 | "nuid": "6ec32fa5-0791-48f4-a54f-f1294e1146c2", 431 | "showTitle": false, 432 | "tableResultSettingsMap": {}, 433 | "title": "" 434 | } 435 | }, 436 | "outputs": [], 437 | "source": [] 438 | } 439 | ], 440 | "metadata": { 441 | "application/vnd.databricks.v1+notebook": { 442 | "computePreferences": null, 443 | "dashboards": [], 444 | "environmentMetadata": { 445 | "base_environment": "", 446 | "environment_version": "4" 447 | }, 448 | "inputWidgetPreferences": null, 449 | "language": "python", 450 | "notebookMetadata": { 451 | "pythonIndentUnit": 4 452 | }, 453 | "notebookName": "2_Usecase_md_dbutils_widgets", 454 | "widgets": {} 455 | }, 456 | "language_info": { 457 | "name": "python" 458 | } 459 | }, 460 | "nbformat": 4, 461 | "nbformat_minor": 0 462 | } 463 | -------------------------------------------------------------------------------- /databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/2_Explore_Notebook_Markdowns.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ea376574-7777-4592-936b-38eb25b6e1d9", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "![Copyright!!](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "application/vnd.databricks.v1+cell": { 23 | "cellMetadata": {}, 24 | "inputWidgets": {}, 25 | "nuid": "96aa1a34-733e-4d2f-b705-1e076af575eb", 26 | "showTitle": false, 27 | "tableResultSettingsMap": {}, 28 | "title": "" 29 | } 30 | }, 31 | "source": [ 32 | "#1. Basics of Python Programing" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "application/vnd.databricks.v1+cell": { 39 | "cellMetadata": {}, 40 | "inputWidgets": {}, 41 | "nuid": "523792e4-d5c5-4831-9694-7243855d6ead", 42 | "showTitle": false, 43 | "tableResultSettingsMap": {}, 44 | "title": "" 45 | } 46 | }, 47 | "source": [ 48 | "##A. Python is an indent based programming language\n", 49 | "Why Python uses indend based programing ->\n", 50 | "1. Managing the program more efficiently\n", 51 | "2. Better Readablility of the code\n", 52 | "3. For creating the hierarchy of programming.\n", 53 | "4. By default 4 spaces we will give for indends, but more/less spaces or tabs also can be used..." 
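# --- Added example (illustrative only) for the indentation notes above: any consistent indent width
# works for a block, but mixing widths inside one block raises an IndentationError.
if True:
  print("two-space indent is accepted")          # narrower than the default 4 spaces, still valid
if True:
        print("eight-space indent is accepted")  # wider indent, also valid because the block is consistent
# The lines below would fail with IndentationError if uncommented, because the indent changes mid-block:
# if True:
#     print("first statement")
#       print("second statement is indented differently")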
54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": { 59 | "application/vnd.databricks.v1+cell": { 60 | "cellMetadata": {}, 61 | "inputWidgets": {}, 62 | "nuid": "b50093e4-02c9-41df-a552-4081040e16f6", 63 | "showTitle": false, 64 | "tableResultSettingsMap": {}, 65 | "title": "" 66 | } 67 | }, 68 | "source": [ 69 | "###How many space for intending" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 0, 75 | "metadata": { 76 | "application/vnd.databricks.v1+cell": { 77 | "cellMetadata": {}, 78 | "inputWidgets": {}, 79 | "nuid": "eccd8947-afed-41d7-a9a8-4aa7cdf267ad", 80 | "showTitle": false, 81 | "tableResultSettingsMap": {}, 82 | "title": "" 83 | } 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "if True:\n", 88 | " print(\"hello\")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "application/vnd.databricks.v1+cell": { 95 | "cellMetadata": {}, 96 | "inputWidgets": {}, 97 | "nuid": "2d9344d7-e471-4ceb-a2ee-039351f00e91", 98 | "showTitle": false, 99 | "tableResultSettingsMap": {}, 100 | "title": "" 101 | } 102 | }, 103 | "source": [ 104 | "###Multiple intents" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 0, 110 | "metadata": { 111 | "application/vnd.databricks.v1+cell": { 112 | "cellMetadata": {}, 113 | "inputWidgets": {}, 114 | "nuid": "289c8a44-dee8-4046-9f80-a34ba4707f9b", 115 | "showTitle": false, 116 | "tableResultSettingsMap": {}, 117 | "title": "" 118 | } 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "%python\n", 123 | "aspirants_list=['Jeeva','Bharathi','Vaanmathy','Nag']\n", 124 | "for aspirants in aspirants_list:\n", 125 | " print(\"good afternoon \",aspirants)\n", 126 | "print(\"good after all aspirants\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": { 132 | "application/vnd.databricks.v1+cell": { 133 | "cellMetadata": {}, 134 | "inputWidgets": {}, 135 | "nuid": "320756df-0857-43ea-817d-a382622f54b2", 136 | "showTitle": false, 137 | "tableResultSettingsMap": {}, 138 | "title": "" 139 | } 140 | }, 141 | "source": [ 142 | "##B. This is a commented line in Python" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 0, 148 | "metadata": { 149 | "application/vnd.databricks.v1+cell": { 150 | "cellMetadata": {}, 151 | "inputWidgets": {}, 152 | "nuid": "01f6ba35-cb48-45e2-93c7-9d56d31381ef", 153 | "showTitle": false, 154 | "tableResultSettingsMap": {}, 155 | "title": "" 156 | } 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "%python\n", 161 | "#1. Single line comment - use # in the starting\n", 162 | "'''2.Multi line comment''' \n", 163 | "# - use ''' comment ''' or \"\"\" comment \"\"\"" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "application/vnd.databricks.v1+cell": { 170 | "cellMetadata": { 171 | "byteLimit": 2048000, 172 | "rowLimit": 10000 173 | }, 174 | "inputWidgets": {}, 175 | "nuid": "a7967bfd-6528-4b42-bd8b-8f8f251fba02", 176 | "showTitle": false, 177 | "tableResultSettingsMap": {}, 178 | "title": "" 179 | } 180 | }, 181 | "source": [ 182 | "#Main Heading1 using #
How to create markdown designs
using the magic command" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": { 188 | "application/vnd.databricks.v1+cell": { 189 | "cellMetadata": { 190 | "byteLimit": 2048000, 191 | "rowLimit": 10000 192 | }, 193 | "inputWidgets": {}, 194 | "nuid": "e8d1dcea-9fc3-45db-86c5-43c19d75710a", 195 | "showTitle": false, 196 | "tableResultSettingsMap": {}, 197 | "title": "" 198 | } 199 | }, 200 | "source": [ 201 | "## Main Heading2 - prefix with \"2#\"" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "application/vnd.databricks.v1+cell": { 208 | "cellMetadata": {}, 209 | "inputWidgets": {}, 210 | "nuid": "86589943-f7b2-476a-b758-1efffeb2a73e", 211 | "showTitle": false, 212 | "tableResultSettingsMap": {}, 213 | "title": "" 214 | } 215 | }, 216 | "source": [ 217 | "### Main Heading3 - prefix with \"3#\"" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "application/vnd.databricks.v1+cell": { 224 | "cellMetadata": { 225 | "byteLimit": 2048000, 226 | "rowLimit": 10000 227 | }, 228 | "inputWidgets": {}, 229 | "nuid": "7c9fc9b4-2d30-4bb3-8932-01495c8b5786", 230 | "showTitle": false, 231 | "tableResultSettingsMap": {}, 232 | "title": "" 233 | } 234 | }, 235 | "source": [ 236 | "#### Sub Heading1 - prefix with \"max 4#\"" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": { 242 | "application/vnd.databricks.v1+cell": { 243 | "cellMetadata": { 244 | "byteLimit": 2048000, 245 | "rowLimit": 10000 246 | }, 247 | "inputWidgets": {}, 248 | "nuid": "7b8d623b-f654-4e28-9399-6e1acad1ffc0", 249 | "showTitle": false, 250 | "tableResultSettingsMap": {}, 251 | "title": "" 252 | } 253 | }, 254 | "source": [ 255 | "##### Sub Heading2 - prefix with \"max 5#\"" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": { 261 | "application/vnd.databricks.v1+cell": { 262 | "cellMetadata": { 263 | "byteLimit": 2048000, 264 | "rowLimit": 10000 265 | }, 266 | "inputWidgets": {}, 267 | "nuid": "486d7884-73e3-4a60-9fc2-2ef0a49fc2b4", 268 | "showTitle": false, 269 | "tableResultSettingsMap": {}, 270 | "title": "" 271 | } 272 | }, 273 | "source": [ 274 | "###### Sub Heading3 - prefix with \"max 6#\"" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "application/vnd.databricks.v1+cell": { 281 | "cellMetadata": {}, 282 | "inputWidgets": {}, 283 | "nuid": "8b40e4c3-aed9-437a-86dc-0c62ce3d5751", 284 | "showTitle": false, 285 | "tableResultSettingsMap": {}, 286 | "title": "" 287 | } 288 | }, 289 | "source": [ 290 | "####### Sub Heading3 - prefix with \"max 6#\"" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": { 296 | "application/vnd.databricks.v1+cell": { 297 | "cellMetadata": { 298 | "byteLimit": 2048000, 299 | "rowLimit": 10000 300 | }, 301 | "inputWidgets": {}, 302 | "nuid": "d8bf1fd6-2aac-4e22-8e88-27b54eb3de92", 303 | "showTitle": false, 304 | "tableResultSettingsMap": {}, 305 | "title": "" 306 | } 307 | }, 308 | "source": [ 309 | "######Lets learn about bold\n", 310 | "1. Bold - using html tagging \n", 311 | "2. 
**Bold** - prefixed and suffixed with **" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "application/vnd.databricks.v1+cell": { 318 | "cellMetadata": { 319 | "byteLimit": 2048000, 320 | "rowLimit": 10000 321 | }, 322 | "inputWidgets": {}, 323 | "nuid": "2ca1c554-d730-451d-9dce-6ee98bb9d136", 324 | "showTitle": false, 325 | "tableResultSettingsMap": {}, 326 | "title": "" 327 | } 328 | }, 329 | "source": [ 330 | "###### Lets learn about Italics\n", 331 | "*Italics* - prefixed and suffixed with *" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "application/vnd.databricks.v1+cell": { 338 | "cellMetadata": { 339 | "byteLimit": 2048000, 340 | "rowLimit": 10000 341 | }, 342 | "inputWidgets": {}, 343 | "nuid": "ea06f315-b1ab-4e41-b9e0-7b561e633cf9", 344 | "showTitle": false, 345 | "tableResultSettingsMap": {}, 346 | "title": "" 347 | } 348 | }, 349 | "source": [ 350 | "###### Lets learn about bullet points\n", 351 | "\n", 352 | "- bullet points - prefix with -\n", 353 | "- bullet points - prefix with -" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": { 359 | "application/vnd.databricks.v1+cell": { 360 | "cellMetadata": { 361 | "byteLimit": 2048000, 362 | "rowLimit": 10000 363 | }, 364 | "inputWidgets": {}, 365 | "nuid": "6e366d03-811c-4548-9a77-b38ca97d2ac4", 366 | "showTitle": false, 367 | "tableResultSettingsMap": {}, 368 | "title": "" 369 | } 370 | }, 371 | "source": [ 372 | "###### Lets learn about Color codes\n", 373 | "$${\\color{pink}text-to-display}$$\n", 374 | "$${\\color{black}Black-color}$$\n", 375 | "$${\\color{red}Red}$$\n", 376 | "$${\\color{green}Green}$$\n", 377 | "$${\\color{blue}Blue}$$\t" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "application/vnd.databricks.v1+cell": { 384 | "cellMetadata": { 385 | "byteLimit": 2048000, 386 | "rowLimit": 10000 387 | }, 388 | "inputWidgets": {}, 389 | "nuid": "a8a1de18-2d42-447e-a677-210fc56e138e", 390 | "showTitle": false, 391 | "tableResultSettingsMap": {}, 392 | "title": "" 393 | } 394 | }, 395 | "source": [ 396 | "###### Lets learn about Embedding urls\n", 397 | "[click here <-](https://www.google.com/search?q=whether+databricks+uses+hive+in+the+behind%3F&sca_esv=d340eac8d7c27e5b&sxsrf=AE3TifM0tbhMSJ32VMGLkFYoRjocGCu6jw%3A1765160969262&ei=CTg2abXhD4PD4-EPsuGjiAo&ved=0ahUKEwj1ia2E-ayRAxWD4TgGHbLwCKEQ4dUDCBE&uact=5&oq=whether+databricks+uses+hive+in+the+behind%3F&gs_lp=Egxnd3Mtd2l6LXNlcnAiK3doZXRoZXIgZGF0YWJyaWNrcyB1c2VzIGhpdmUgaW4gdGhlIGJlaGluZD8yBRAhGKABMgUQIRigATIFECEYoAEyBRAhGKABSM1VUABY7VFwBXgBkAEAmAGRAaABzyKqAQQwLjM2uAEDyAEA-AEBmAIkoAL0HsICBxAAGIAEGA3CAgYQABgHGB7CAggQABgHGAgYHsICCBAAGAgYDRgewgILEAAYgAQYhgMYigXCAgYQABgNGB7CAggQABiABBiiBMICBRAhGJ8FwgIEECEYFcICBxAhGKABGAqYAwCSBwQ1LjMxoAeJ8AGyBwQwLjMxuAfjHsIHBjIuMzIuMsgHRIAIAA&sclient=gws-wiz-serp)\n", 398 | "Click here for [Inceptez Webpage](https://www.inceptez.in/)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": { 404 | "application/vnd.databricks.v1+cell": { 405 | "cellMetadata": { 406 | "byteLimit": 2048000, 407 | "rowLimit": 10000 408 | }, 409 | "inputWidgets": {}, 410 | "nuid": "f88a1ce7-b687-434f-81ed-5dd7ec5c0e6b", 411 | "showTitle": false, 412 | "tableResultSettingsMap": {}, 413 | "title": "" 414 | } 415 | }, 416 | "source": [ 417 | "######To learn markdowns more in detail\n", 418 | "Click here [Microsoft markdown cheatsheet](https://docs.databricks.com/aws/en/notebooks/notebook-media)" 419 | ] 420 | }, 421 | { 422 | 
"cell_type": "markdown", 423 | "metadata": { 424 | "application/vnd.databricks.v1+cell": { 425 | "cellMetadata": {}, 426 | "inputWidgets": {}, 427 | "nuid": "f11e8389-002a-4099-9207-0f7ab9b554fa", 428 | "showTitle": false, 429 | "tableResultSettingsMap": {}, 430 | "title": "" 431 | } 432 | }, 433 | "source": [ 434 | "| col1 | col2 |\n", 435 | "|------|------|\n", 436 | "| a | b |\n", 437 | "| c | d |" 438 | ] 439 | } 440 | ], 441 | "metadata": { 442 | "application/vnd.databricks.v1+notebook": { 443 | "computePreferences": { 444 | "hardware": { 445 | "accelerator": null, 446 | "gpuPoolId": null, 447 | "memory": null 448 | } 449 | }, 450 | "dashboards": [], 451 | "environmentMetadata": { 452 | "base_environment": "", 453 | "environment_version": "4" 454 | }, 455 | "inputWidgetPreferences": null, 456 | "language": "python", 457 | "notebookMetadata": { 458 | "pythonIndentUnit": 4 459 | }, 460 | "notebookName": "2_Explore_Notebook_Markdowns", 461 | "widgets": {} 462 | }, 463 | "language_info": { 464 | "name": "python" 465 | } 466 | }, 467 | "nbformat": 4, 468 | "nbformat_minor": 0 469 | } 470 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/2_Spark_DataFrame_Read_Write_Operations/read_write_usecases.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "8ba86a20-5a3a-4130-86f5-e312f4a7901b", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "#Telecom Domain Read & Write Ops Assignment - Building Datalake & Lakehouse\n", 17 | "This notebook contains assignments to practice Spark read options and Databricks volumes.
\n", 18 | "Sections: Sample data creation, Catalog & Volume creation, Copying data into Volumes, Path glob/recursive reads, toDF() column renaming variants, inferSchema/header/separator experiments, and exercises.
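# --- Added note (illustrative only) for the setup step mentioned below: on Databricks the SparkSession is
# already available as `spark`, so getOrCreate() simply returns the existing session. The appName shown here
# is an assumption, and the type imports cover the classes used later in the column-renaming section.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

spark = SparkSession.builder.appName("telecom_read_write_assignment").getOrCreate()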
" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": {}, 26 | "inputWidgets": {}, 27 | "nuid": "841c7ed8-ef18-486a-8187-07685e499b84", 28 | "showTitle": false, 29 | "tableResultSettingsMap": {}, 30 | "title": "" 31 | } 32 | }, 33 | "source": [ 34 | "![](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)\n", 35 | "![](https://theciotimes.com/wp-content/uploads/2021/03/TELECOM1.jpg)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "application/vnd.databricks.v1+cell": { 42 | "cellMetadata": {}, 43 | "inputWidgets": {}, 44 | "nuid": "d4aa0a44-8cd6-41cf-921d-abb5ff67615b", 45 | "showTitle": false, 46 | "tableResultSettingsMap": {}, 47 | "title": "" 48 | } 49 | }, 50 | "source": [ 51 | "##First Import all required libraries & Create spark session object" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "application/vnd.databricks.v1+cell": { 58 | "cellMetadata": {}, 59 | "inputWidgets": {}, 60 | "nuid": "d0b67823-2e4e-45e2-aa25-80550a3ac580", 61 | "showTitle": false, 62 | "tableResultSettingsMap": {}, 63 | "title": "" 64 | } 65 | }, 66 | "source": [ 67 | "##1. Write SQL statements to create:\n", 68 | "1. A catalog named telecom_catalog_assign\n", 69 | "2. A schema landing_zone\n", 70 | "3. A volume landing_vol\n", 71 | "4. Using dbutils.fs.mkdirs, create folders:
\n", 72 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/\n", 73 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/\n", 74 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/\n", 75 | "5. Explain the difference between the following (research this briefly to understand why production-ready systems use the Volume concept):
\n", 76 | "a. Volume vs DBFS/FileStore
\n", 77 | "b. Why production teams prefer Volumes for regulated data
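# --- Added example (a minimal sketch, assuming Unity Catalog is enabled): one way to complete section 1 above.
# Object names are taken from the assignment text.
spark.sql("CREATE CATALOG IF NOT EXISTS telecom_catalog_assign")
spark.sql("CREATE SCHEMA IF NOT EXISTS telecom_catalog_assign.landing_zone")
spark.sql("CREATE VOLUME IF NOT EXISTS telecom_catalog_assign.landing_zone.landing_vol")

landing = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol"
for folder in ("customer", "usage", "tower"):
    dbutils.fs.mkdirs(f"{landing}/{folder}")    # folders listed in item 4 above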
" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "application/vnd.databricks.v1+cell": { 84 | "cellMetadata": {}, 85 | "inputWidgets": {}, 86 | "nuid": "26d8bd3d-b575-448b-ae22-8173d15ca671", 87 | "showTitle": false, 88 | "tableResultSettingsMap": {}, 89 | "title": "" 90 | } 91 | }, 92 | "source": [ 93 | "##Data files to use in this usecase:\n", 94 | "customer_csv = '''\n", 95 | "101,Arun,31,Chennai,PREPAID\n", 96 | "102,Meera,45,Bangalore,POSTPAID\n", 97 | "103,Irfan,29,Hyderabad,PREPAID\n", 98 | "104,Raj,52,Mumbai,POSTPAID\n", 99 | "105,,27,Delhi,PREPAID\n", 100 | "106,Sneha,abc,Pune,PREPAID\n", 101 | "'''\n", 102 | "\n", 103 | "usage_tsv = '''customer_id\\tvoice_mins\\tdata_mb\\tsms_count\n", 104 | "101\\t320\\t1500\\t20\n", 105 | "102\\t120\\t4000\\t5\n", 106 | "103\\t540\\t600\\t52\n", 107 | "104\\t45\\t200\\t2\n", 108 | "105\\t0\\t0\\t0\n", 109 | "'''\n", 110 | "\n", 111 | "tower_logs_region1 = '''event_id|customer_id|tower_id|signal_strength|timestamp\n", 112 | "5001|101|TWR01|-80|2025-01-10 10:21:54\n", 113 | "5004|104|TWR05|-75|2025-01-10 11:01:12\n", 114 | "'''" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "application/vnd.databricks.v1+cell": { 121 | "cellMetadata": {}, 122 | "inputWidgets": {}, 123 | "nuid": "9540d2e2-2562-4be7-897f-0a7d57adaa72", 124 | "showTitle": false, 125 | "tableResultSettingsMap": {}, 126 | "title": "" 127 | } 128 | }, 129 | "source": [ 130 | "##2. Filesystem operations\n", 131 | "1. Write dbutils.fs code to copy the above datasets into your created Volume folders:\n", 132 | "Customer → /Volumes/.../customer/\n", 133 | "Usage → /Volumes/.../usage/\n", 134 | "Tower (region-based) → /Volumes/.../tower/region1/ and /Volumes/.../tower/region2/\n", 135 | "\n", 136 | "2. Write a command to validate whether files were successfully copied" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "application/vnd.databricks.v1+cell": { 143 | "cellMetadata": {}, 144 | "inputWidgets": {}, 145 | "nuid": "8767735b-24d3-428a-ad12-ae821903e2ce", 146 | "showTitle": false, 147 | "tableResultSettingsMap": {}, 148 | "title": "" 149 | } 150 | }, 151 | "source": [ 152 | "##3. Spark Directory Read Use Cases\n", 153 | "1. Read all tower logs using:\n", 154 | "Path glob filter (example: *.csv)\n", 155 | "Multiple paths input\n", 156 | "Recursive lookup\n", 157 | "\n", 158 | "2. Demonstrate these 3 reads separately:\n", 159 | "Using pathGlobFilter\n", 160 | "Using list of paths in spark.read.csv([path1, path2])\n", 161 | "Using .option(\"recursiveFileLookup\",\"true\")\n", 162 | "\n", 163 | "3. Compare the outputs and understand when each should be used." 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "application/vnd.databricks.v1+cell": { 170 | "cellMetadata": {}, 171 | "inputWidgets": {}, 172 | "nuid": "9f7147c1-5d58-47e1-84fe-7ebd26a217b9", 173 | "showTitle": false, 174 | "tableResultSettingsMap": {}, 175 | "title": "" 176 | } 177 | }, 178 | "source": [ 179 | "##4. Schema Inference, Header, and Separator\n", 180 | "1. Try the Customer, Usage files with the option and options using read.csv and format function:
\n", 181 | "header=false, inferSchema=false
\n", 182 | "or
\n", 183 | "header=true, inferSchema=true
\n", 184 | "2. Write a note on what changes when we set header or inferSchema to true/false.
\n", 185 | "3. How did schema inference handle “abc” in the age column?
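# --- Added example (a minimal sketch): the two read variants asked for in section 4 above.
# customer_path is an assumed helper variable pointing at the customer folder created earlier.
customer_path = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/"

# Variant 1: header=false, inferSchema=false -> every column is read as string, named _c0, _c1, ...
df_plain = spark.read.csv(customer_path, header=False, inferSchema=False)
df_plain.printSchema()

# Variant 2: header=true, inferSchema=true via the format/option API. With inferSchema enabled,
# a non-numeric value such as "abc" in the age column typically forces that column to StringType.
df_inferred = (spark.read.format("csv")
               .option("header", "true")
               .option("inferSchema", "true")
               .load(customer_path))
df_inferred.printSchema()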
" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "application/vnd.databricks.v1+cell": { 192 | "cellMetadata": {}, 193 | "inputWidgets": {}, 194 | "nuid": "15d8dad0-bc63-47f1-9a90-72837cba6c4f", 195 | "showTitle": false, 196 | "tableResultSettingsMap": {}, 197 | "title": "" 198 | } 199 | }, 200 | "source": [ 201 | "##5. Column Renaming Usecases\n", 202 | "1. Apply column names using string using toDF function for customer data\n", 203 | "2. Apply column names and datatype using the schema function for usage data\n", 204 | "3. Apply column names and datatype using the StructType with IntegerType, StringType, TimestampType and other classes for towers data " 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "application/vnd.databricks.v1+cell": { 211 | "cellMetadata": {}, 212 | "inputWidgets": {}, 213 | "nuid": "6e1d6d88-7bcc-4548-a0d1-15d37f6fc0be", 214 | "showTitle": false, 215 | "tableResultSettingsMap": {}, 216 | "title": "" 217 | } 218 | }, 219 | "source": [ 220 | "## Spark Write Operations using \n", 221 | "- csv, json, orc, parquet, delta, saveAsTable, insertInto, xml with different write mode, header and sep options" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": { 227 | "application/vnd.databricks.v1+cell": { 228 | "cellMetadata": {}, 229 | "inputWidgets": {}, 230 | "nuid": "8e34c3bc-962d-438d-a1b6-ac27d2da6608", 231 | "showTitle": false, 232 | "tableResultSettingsMap": {}, 233 | "title": "" 234 | } 235 | }, 236 | "source": [ 237 | "##6. Write Operations (Data Conversion/Schema migration) – CSV Format Usecases\n", 238 | "1. Write customer data into CSV format using overwrite mode\n", 239 | "2. Write usage data into CSV format using append mode\n", 240 | "3. Write tower data into CSV format with header enabled and custom separator (|)\n", 241 | "4. Read the tower data in a dataframe and show only 5 rows.\n", 242 | "5. Download the file into local from the catalog volume location and see the data of any of the above files opening in a notepad++." 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "application/vnd.databricks.v1+cell": { 249 | "cellMetadata": {}, 250 | "inputWidgets": {}, 251 | "nuid": "34158cf6-dd7f-40d6-9969-ed76710540a4", 252 | "showTitle": false, 253 | "tableResultSettingsMap": {}, 254 | "title": "" 255 | } 256 | }, 257 | "source": [ 258 | "##7. Write Operations (Data Conversion/Schema migration)– JSON Format Usecases\n", 259 | "1. Write customer data into JSON format using overwrite mode\n", 260 | "2. Write usage data into JSON format using append mode and snappy compression format\n", 261 | "3. Write tower data into JSON format using ignore mode and observe the behavior of this mode\n", 262 | "4. Read the tower data in a dataframe and show only 5 rows.\n", 263 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++." 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "application/vnd.databricks.v1+cell": { 270 | "cellMetadata": {}, 271 | "inputWidgets": {}, 272 | "nuid": "26f2ba69-3cde-4ec6-8945-e4ef9f7bb109", 273 | "showTitle": false, 274 | "tableResultSettingsMap": {}, 275 | "title": "" 276 | } 277 | }, 278 | "source": [ 279 | "##8. Write Operations (Data Conversion/Schema migration) – Parquet Format Usecases\n", 280 | "1. Write customer data into Parquet format using overwrite mode and in a gzip format\n", 281 | "2. 
Write usage data into Parquet format using error mode\n", 282 | "3. Write tower data into Parquet format with gzip compression option\n", 283 | "4. Read the usage data in a dataframe and show only 5 rows.\n", 284 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++." 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "application/vnd.databricks.v1+cell": { 291 | "cellMetadata": {}, 292 | "inputWidgets": {}, 293 | "nuid": "b41c794f-5cfc-4aeb-a599-e6d4a47a0f3f", 294 | "showTitle": false, 295 | "tableResultSettingsMap": {}, 296 | "title": "" 297 | } 298 | }, 299 | "source": [ 300 | "##9. Write Operations (Data Conversion/Schema migration) – Orc Format Usecases\n", 301 | "1. Write customer data into ORC format using overwrite mode\n", 302 | "2. Write usage data into ORC format using append mode\n", 303 | "3. Write tower data into ORC format and see the output file structure\n", 304 | "4. Read the usage data in a dataframe and show only 5 rows.\n", 305 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++." 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "application/vnd.databricks.v1+cell": { 312 | "cellMetadata": {}, 313 | "inputWidgets": {}, 314 | "nuid": "35761315-0b0f-46ff-9c3d-c0405bce7b62", 315 | "showTitle": false, 316 | "tableResultSettingsMap": {}, 317 | "title": "" 318 | } 319 | }, 320 | "source": [ 321 | "##10. Write Operations (Data Conversion/Schema migration) – Delta Format Usecases\n", 322 | "1. Write customer data into Delta format using overwrite mode\n", 323 | "2. Write usage data into Delta format using append mode\n", 324 | "3. Write tower data into Delta format and see the output file structure\n", 325 | "4. Read the usage data in a dataframe and show only 5 rows.\n", 326 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++.\n", 327 | "6. Compare the parquet location and delta location and try to understand what is the differentiating factor, as both are parquet files only." 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "application/vnd.databricks.v1+cell": { 334 | "cellMetadata": {}, 335 | "inputWidgets": {}, 336 | "nuid": "e6dd0890-02bd-4acd-b837-daceb256c706", 337 | "showTitle": false, 338 | "tableResultSettingsMap": {}, 339 | "title": "" 340 | } 341 | }, 342 | "source": [ 343 | "##11. Write Operations (Lakehouse Usecases) – Delta table Usecases\n", 344 | "1. Write customer data using saveAsTable() as a managed table\n", 345 | "2. Write usage data using saveAsTable() with overwrite mode\n", 346 | "3. Drop the managed table and verify data removal\n", 347 | "4. Go and check the table overview and realize it is in delta format in the Catalog.\n", 348 | "5. Use spark.read.sql to write some simple queries on the above tables created.\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": { 354 | "application/vnd.databricks.v1+cell": { 355 | "cellMetadata": {}, 356 | "inputWidgets": {}, 357 | "nuid": "1aac447b-690b-4562-99dd-0ce096e9ad55", 358 | "showTitle": false, 359 | "tableResultSettingsMap": {}, 360 | "title": "" 361 | } 362 | }, 363 | "source": [ 364 | "##12. Write Operations (Lakehouse Usecases) – Delta table Usecases\n", 365 | "1. 
Write customer data using insertInto() in a new table and find the behavior\n", 366 | "2. Write usage data using insertTable() with overwrite mode" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "application/vnd.databricks.v1+cell": { 373 | "cellMetadata": {}, 374 | "inputWidgets": {}, 375 | "nuid": "e3c4bce3-4bd3-4db6-a074-02bb24c5f91a", 376 | "showTitle": false, 377 | "tableResultSettingsMap": {}, 378 | "title": "" 379 | } 380 | }, 381 | "source": [ 382 | "##13. Write Operations (Lakehouse Usecases) – Delta table Usecases\n", 383 | "1. Write customer data into XML format using rowTag as cust\n", 384 | "2. Write usage data into XML format using overwrite mode with the rowTag as usage\n", 385 | "3. Download the xml data and open the file in notepad++ and see how the xml file looks like." 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "application/vnd.databricks.v1+cell": { 392 | "cellMetadata": {}, 393 | "inputWidgets": {}, 394 | "nuid": "83e2fe69-9352-4ec9-bf70-15d760c89aa3", 395 | "showTitle": false, 396 | "tableResultSettingsMap": {}, 397 | "title": "" 398 | } 399 | }, 400 | "source": [ 401 | "##14. Compare all the downloaded files (csv, json, orc, parquet, delta and xml) \n", 402 | "1. Capture the size occupied between all of these file formats and list the formats below based on the order of size from small to big." 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "application/vnd.databricks.v1+cell": { 409 | "cellMetadata": {}, 410 | "inputWidgets": {}, 411 | "nuid": "3d6e39ec-752d-4183-9656-2b6d7938922d", 412 | "showTitle": false, 413 | "tableResultSettingsMap": {}, 414 | "title": "" 415 | } 416 | }, 417 | "source": [ 418 | "##15. Do a final exercise of defining one/two liner of... \n", 419 | "1. When to use/benifits csv\n", 420 | "2. When to use/benifits json\n", 421 | "3. When to use/benifit orc\n", 422 | "4. When to use/benifit parquet\n", 423 | "5. When to use/benifit delta\n", 424 | "6. When to use/benifit xml\n", 425 | "7. 
When to use/benifit delta tables\n" 426 | ] 427 | } 428 | ], 429 | "metadata": { 430 | "application/vnd.databricks.v1+notebook": { 431 | "computePreferences": null, 432 | "dashboards": [], 433 | "environmentMetadata": { 434 | "base_environment": "", 435 | "environment_version": "4" 436 | }, 437 | "inputWidgetPreferences": null, 438 | "language": "python", 439 | "notebookMetadata": { 440 | "mostRecentlyExecutedCommandWithImplicitDF": { 441 | "commandId": -1, 442 | "dataframes": [ 443 | "_sqldf" 444 | ] 445 | }, 446 | "pythonIndentUnit": 4 447 | }, 448 | "notebookName": "read_write_usecases", 449 | "widgets": {} 450 | }, 451 | "language_info": { 452 | "name": "python" 453 | } 454 | }, 455 | "nbformat": 4, 456 | "nbformat_minor": 0 457 | } 458 | -------------------------------------------------------------------------------- /databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/1_Explore_Notebooks_magic_commands.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": { 8 | "byteLimit": 2048000, 9 | "rowLimit": 10000 10 | }, 11 | "inputWidgets": {}, 12 | "nuid": "2c477351-9470-4b26-8f73-6f967c37729e", 13 | "showTitle": false, 14 | "tableResultSettingsMap": {}, 15 | "title": "" 16 | } 17 | }, 18 | "source": [ 19 | "#Welcome to Inceptez Technologies\n", 20 | "Let us understand about creating notebooks & magical commands\n", 21 | "https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png\n", 22 | "![](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "application/vnd.databricks.v1+cell": { 29 | "cellMetadata": { 30 | "byteLimit": 2048000, 31 | "rowLimit": 10000 32 | }, 33 | "inputWidgets": {}, 34 | "nuid": "3ae38262-0e89-4a3b-9130-322375328fe4", 35 | "showTitle": false, 36 | "tableResultSettingsMap": {}, 37 | "title": "" 38 | } 39 | }, 40 | "source": [ 41 | "##Let us learn first about Magical Commands\n", 42 | "**Important Magic Commands**\n", 43 | "- %md: allows you to write markdown text to design the notebook.\n", 44 | "- %run: runs a Python file or a notebook.\n", 45 | "- %sh: executes shell commands on the cluster edge/client node.\n", 46 | "- %fs: allows you to interact with the Databricks file system (Datalake command (cloud storage s3/adls/gcs))\n", 47 | "- %sql: allows you to run Spark SQL/HQL queries.\n", 48 | "- %python: switches the notebook context to Python.\n", 49 | "- %pip: allows you to install Python packages.\n", 50 | "\n", 51 | "**Not Important Magic Commands or We learn few of these where we have Cloud(Azure) dependency**\n", 52 | "- %scala: switches the notebook context to Scala.\n", 53 | "- %r: switches the notebook context to R.\n", 54 | "- %lsmagic: lists all the available magic commands.\n", 55 | "- %config: allows you to set configuration options for the notebook.\n", 56 | "- %load: loads the contents of a file into a cell.\n", 57 | "- %who: lists all the variables in the current scope." 
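As a small, hedged illustration of the magic commands listed above: most of them have a programmatic Python counterpart, which is useful when the same logic has to live inside a function instead of its own cell (this assumes the ambient `spark` and `dbutils` objects of a Databricks notebook):

```python
# %fs ls <path>  ->  dbutils.fs.ls(<path>)
for f in dbutils.fs.ls("/databricks-datasets")[:5]:
    print(f.path, f.size)

# %sql <query>   ->  spark.sql(<query>)
spark.sql("select current_timestamp()").show()

# %sh, %pip and %run are cell-scoped magics with no direct one-line Python equivalent, so they are
# normally kept as dedicated cells (dbutils.notebook.run is the closest programmatic cousin of %run,
# but it executes the child notebook in its own context).
```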
58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "application/vnd.databricks.v1+cell": { 64 | "cellMetadata": { 65 | "byteLimit": 2048000, 66 | "rowLimit": 10000 67 | }, 68 | "inputWidgets": {}, 69 | "nuid": "b35163da-292b-4c60-b6a9-a62521e22343", 70 | "showTitle": false, 71 | "tableResultSettingsMap": {}, 72 | "title": "" 73 | } 74 | }, 75 | "source": [ 76 | "####How to call a notebook from the current notebook using %run magic command" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 0, 82 | "metadata": { 83 | "application/vnd.databricks.v1+cell": { 84 | "cellMetadata": { 85 | "byteLimit": 2048000, 86 | "rowLimit": 10000 87 | }, 88 | "inputWidgets": {}, 89 | "nuid": "cb391d1b-fb52-49ab-a463-2849a7f60fea", 90 | "showTitle": false, 91 | "tableResultSettingsMap": {}, 92 | "title": "" 93 | } 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "%run \"/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook\"" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "application/vnd.databricks.v1+cell": { 104 | "cellMetadata": { 105 | "byteLimit": 2048000, 106 | "rowLimit": 10000 107 | }, 108 | "inputWidgets": {}, 109 | "nuid": "7b7260e3-4612-4e86-ac30-51e7c91fe669", 110 | "showTitle": false, 111 | "tableResultSettingsMap": {}, 112 | "title": "" 113 | } 114 | }, 115 | "source": [ 116 | "####How to run a linux commands inside a notebook using %sh magic command" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 0, 122 | "metadata": { 123 | "application/vnd.databricks.v1+cell": { 124 | "cellMetadata": { 125 | "byteLimit": 2048000, 126 | "rowLimit": 10000 127 | }, 128 | "inputWidgets": {}, 129 | "nuid": "7f274612-5fb2-4589-86c6-d236abeb9aba", 130 | "showTitle": false, 131 | "tableResultSettingsMap": {}, 132 | "title": "" 133 | } 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "%sh\n", 138 | "ls -l /databricks-datasets/airlines\n", 139 | "head -1 /databricks-datasets/airlines/part-01902" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "application/vnd.databricks.v1+cell": { 146 | "cellMetadata": {}, 147 | "inputWidgets": {}, 148 | "nuid": "85ffa4e3-dc14-4ea4-bab4-94151bc22f9d", 149 | "showTitle": false, 150 | "tableResultSettingsMap": {}, 151 | "title": "" 152 | } 153 | }, 154 | "source": [ 155 | "We are going to use Databricks Unity Catalog (We don't know about it yet)\n", 156 | "to create tables and files under the volume (catalog/schema/volume/folder/files)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 0, 162 | "metadata": { 163 | "application/vnd.databricks.v1+cell": { 164 | "cellMetadata": { 165 | "byteLimit": 2048000, 166 | "implicitDf": true, 167 | "rowLimit": 10000 168 | }, 169 | "inputWidgets": {}, 170 | "nuid": "85719c48-7508-45d6-b879-d5ab1e956bcc", 171 | "showTitle": false, 172 | "tableResultSettingsMap": {}, 173 | "title": "" 174 | } 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "%sql\n", 179 | "CREATE VOLUME IF NOT EXISTS workspace.default.volumewe47_datalake;" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "application/vnd.databricks.v1+cell": { 186 | "cellMetadata": { 187 | "byteLimit": 2048000, 188 | "rowLimit": 10000 189 | }, 190 | "inputWidgets": {}, 191 | "nuid": "97a1ed02-dc2f-4eac-9b34-ed9237d3cd20", 192 | "showTitle": false, 193 | "tableResultSettingsMap": {}, 194 | "title": "" 195 | } 
196 | }, 197 | "source": [ 198 | "####Upload some sample data going into (Catalog -> My Organization -> Workspace -> Default -> Volumes)
How to run a DBFS (like Hadoop) FS commands inside a notebook using %fs magic command to copy the uploaded data into some other volume from the uploaded volume" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 0, 204 | "metadata": { 205 | "application/vnd.databricks.v1+cell": { 206 | "cellMetadata": { 207 | "byteLimit": 2048000, 208 | "rowLimit": 10000 209 | }, 210 | "inputWidgets": {}, 211 | "nuid": "6947aa56-af9f-42de-8262-b7bc680203f7", 212 | "showTitle": false, 213 | "tableResultSettingsMap": {}, 214 | "title": "" 215 | } 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "%fs ls \"dbfs:///Volumes/workspace/default/volumewe47_datalake\"" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 0, 225 | "metadata": { 226 | "application/vnd.databricks.v1+cell": { 227 | "cellMetadata": { 228 | "byteLimit": 2048000, 229 | "rowLimit": 10000 230 | }, 231 | "inputWidgets": {}, 232 | "nuid": "e68958aa-a396-4a19-b4e5-3541c3d5d756", 233 | "showTitle": false, 234 | "tableResultSettingsMap": {}, 235 | "title": "" 236 | } 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "%fs cp \"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients.csv\" \"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy.csv\"" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": { 246 | "application/vnd.databricks.v1+cell": { 247 | "cellMetadata": {}, 248 | "inputWidgets": {}, 249 | "nuid": "0a8e80b8-4546-468b-87ed-7bb6012c1a7b", 250 | "showTitle": false, 251 | "tableResultSettingsMap": {}, 252 | "title": "" 253 | } 254 | }, 255 | "source": [ 256 | "Learning for the first time the dbutils, we learn in detail later\n", 257 | "Rather using fs command, we can use databricks utility command (comprehensive) to copy the data/any other filesystem operations in the DBFS" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 0, 263 | "metadata": { 264 | "application/vnd.databricks.v1+cell": { 265 | "cellMetadata": { 266 | "byteLimit": 2048000, 267 | "rowLimit": 10000 268 | }, 269 | "inputWidgets": {}, 270 | "nuid": "f29a64d1-087b-46f7-8e97-63c38991fd40", 271 | "showTitle": false, 272 | "tableResultSettingsMap": {}, 273 | "title": "" 274 | } 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "%python\n", 279 | "dbutils.fs.cp(\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients.csv\",\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy2.csv\")\n", 280 | "dbutils.fs.rm(\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy.csv\")" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "application/vnd.databricks.v1+cell": { 287 | "cellMetadata": { 288 | "byteLimit": 2048000, 289 | "rowLimit": 10000 290 | }, 291 | "inputWidgets": {}, 292 | "nuid": "a43dd280-49bc-4f22-97fb-08a6b23218cc", 293 | "showTitle": false, 294 | "tableResultSettingsMap": {}, 295 | "title": "" 296 | } 297 | }, 298 | "source": [ 299 | "####How to run a Spark SQL/HQL Queries inside a notebook using %sql magic command" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 0, 305 | "metadata": { 306 | "application/vnd.databricks.v1+cell": { 307 | "cellMetadata": { 308 | "byteLimit": 2048000, 309 | "implicitDf": true, 310 | "rowLimit": 10000 311 | }, 312 | "inputWidgets": {}, 313 | "nuid": "0421bbb7-38d3-4c49-bc4a-28f0c30072e1", 314 | "showTitle": false, 315 | "tableResultSettingsMap": {}, 316 | "title": "" 317 | } 318 | }, 319 | "outputs": [], 320 | "source": [ 
321 | "%sql\n", 322 | "create table if not exists default.cities2(id int,city string);\n", 323 | "insert into default.cities2 values(3,'Mumbai'),(4,'Lucknow');\n", 324 | "select * from cities2;" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 0, 330 | "metadata": { 331 | "application/vnd.databricks.v1+cell": { 332 | "cellMetadata": { 333 | "byteLimit": 2048000, 334 | "rowLimit": 10000 335 | }, 336 | "inputWidgets": {}, 337 | "nuid": "610c68dd-c905-4abc-9f33-ad5623ba4dcb", 338 | "showTitle": false, 339 | "tableResultSettingsMap": {}, 340 | "title": "" 341 | } 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "%python\n", 346 | "spark.sql(\"select * from cities2\").explain(True)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 0, 352 | "metadata": { 353 | "application/vnd.databricks.v1+cell": { 354 | "cellMetadata": { 355 | "byteLimit": 2048000, 356 | "implicitDf": true, 357 | "rowLimit": 10000 358 | }, 359 | "inputWidgets": {}, 360 | "nuid": "d1fe763e-6058-4ad3-980e-ae595fc32499", 361 | "showTitle": false, 362 | "tableResultSettingsMap": {}, 363 | "title": "" 364 | } 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "%sql\n", 369 | "update cities1 set city='Kolkata' where id=4;" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 0, 375 | "metadata": { 376 | "application/vnd.databricks.v1+cell": { 377 | "cellMetadata": { 378 | "byteLimit": 2048000, 379 | "implicitDf": true, 380 | "rowLimit": 10000 381 | }, 382 | "inputWidgets": {}, 383 | "nuid": "2cdb43f7-2838-42d6-a19b-60d2377d1812", 384 | "showTitle": false, 385 | "tableResultSettingsMap": {}, 386 | "title": "" 387 | } 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "%sql\n", 392 | "show create table cities1;" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 0, 398 | "metadata": { 399 | "application/vnd.databricks.v1+cell": { 400 | "cellMetadata": { 401 | "byteLimit": 2048000, 402 | "implicitDf": true, 403 | "rowLimit": 10000 404 | }, 405 | "inputWidgets": {}, 406 | "nuid": "3485856e-b35c-415f-a588-b74aadcbeafd", 407 | "showTitle": false, 408 | "tableResultSettingsMap": {}, 409 | "title": "" 410 | } 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "%sql\n", 415 | "from cities1 select *;" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "application/vnd.databricks.v1+cell": { 422 | "cellMetadata": { 423 | "byteLimit": 2048000, 424 | "rowLimit": 10000 425 | }, 426 | "inputWidgets": {}, 427 | "nuid": "0cb845bd-3334-4f02-8794-28797af494a0", 428 | "showTitle": false, 429 | "tableResultSettingsMap": {}, 430 | "title": "" 431 | } 432 | }, 433 | "source": [ 434 | "####How to run a Python Program inside a notebook using %python magic command or by default the cell will be enabled with python interpretter only" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 0, 440 | "metadata": { 441 | "application/vnd.databricks.v1+cell": { 442 | "cellMetadata": { 443 | "byteLimit": 2048000, 444 | "rowLimit": 10000 445 | }, 446 | "inputWidgets": {}, 447 | "nuid": "4745b1ba-c25c-4c6f-9a4b-6ab4ad27bd98", 448 | "showTitle": false, 449 | "tableResultSettingsMap": {}, 450 | "title": "" 451 | } 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "def sqrt(a):\n", 456 | " return a*a" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 0, 462 | "metadata": { 463 | "application/vnd.databricks.v1+cell": { 464 | "cellMetadata": { 465 | "byteLimit": 2048000, 
466 | "rowLimit": 10000 467 | }, 468 | "inputWidgets": {}, 469 | "nuid": "86607897-fad2-454f-86ab-d1f639889cca", 470 | "showTitle": false, 471 | "tableResultSettingsMap": {}, 472 | "title": "" 473 | } 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "print(\"square root function call \",sqrt(10))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "application/vnd.databricks.v1+cell": { 484 | "cellMetadata": {}, 485 | "inputWidgets": {}, 486 | "nuid": "bb77dbe6-2686-4f10-9781-6e37357e121e", 487 | "showTitle": false, 488 | "tableResultSettingsMap": {}, 489 | "title": "" 490 | } 491 | }, 492 | "source": [ 493 | "In the python magic cell itself, we already have spark session object instantiated,
\n", 494 | "so we can lavishly write spark programs" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": { 500 | "application/vnd.databricks.v1+cell": { 501 | "cellMetadata": { 502 | "byteLimit": 2048000, 503 | "rowLimit": 10000 504 | }, 505 | "inputWidgets": {}, 506 | "nuid": "1c3c9245-15f8-488a-99f1-c4ea979f607e", 507 | "showTitle": false, 508 | "tableResultSettingsMap": {}, 509 | "title": "" 510 | } 511 | }, 512 | "source": [ 513 | "####How to install additional libraries in this current Python Interpreter using %pip magic command" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 0, 519 | "metadata": { 520 | "application/vnd.databricks.v1+cell": { 521 | "cellMetadata": { 522 | "byteLimit": 2048000, 523 | "rowLimit": 10000 524 | }, 525 | "inputWidgets": {}, 526 | "nuid": "11ac3360-e8f1-4454-a37d-9ca2f965fcdd", 527 | "showTitle": false, 528 | "tableResultSettingsMap": {}, 529 | "title": "" 530 | } 531 | }, 532 | "outputs": [], 533 | "source": [ 534 | "%pip install pypi" 535 | ] 536 | } 537 | ], 538 | "metadata": { 539 | "application/vnd.databricks.v1+notebook": { 540 | "computePreferences": { 541 | "hardware": { 542 | "accelerator": null, 543 | "gpuPoolId": null, 544 | "memory": null 545 | } 546 | }, 547 | "dashboards": [], 548 | "environmentMetadata": null, 549 | "inputWidgetPreferences": null, 550 | "language": "python", 551 | "notebookMetadata": { 552 | "mostRecentlyExecutedCommandWithImplicitDF": { 553 | "commandId": 6631791723001833, 554 | "dataframes": [ 555 | "_sqldf" 556 | ] 557 | }, 558 | "pythonIndentUnit": 4 559 | }, 560 | "notebookName": "1_Explore_Notebooks_magic_commands", 561 | "widgets": {} 562 | }, 563 | "language_info": { 564 | "name": "python" 565 | } 566 | }, 567 | "nbformat": 4, 568 | "nbformat_minor": 0 569 | } 570 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/2_Spark_DataFrame_Read_Write_Operations/3-Basic-WriteOps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "e3756d01-4aa7-45d1-bffa-b7e3afd67e3c", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "#By Knowing this notebook, we can become a eligible \"DATA EGRESS DEVELOPER\"" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "application/vnd.databricks.v1+cell": { 23 | "cellMetadata": {}, 24 | "inputWidgets": {}, 25 | "nuid": "71713fcb-b659-4e62-bbee-1d3092f13683", 26 | "showTitle": false, 27 | "tableResultSettingsMap": {}, 28 | "title": "" 29 | } 30 | }, 31 | "source": [ 32 | "### Let's get some data we have already..." 
33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 0, 38 | "metadata": { 39 | "application/vnd.databricks.v1+cell": { 40 | "cellMetadata": { 41 | "byteLimit": 2048000, 42 | "rowLimit": 10000 43 | }, 44 | "inputWidgets": {}, 45 | "nuid": "059b8ec6-31ff-46f7-a19d-069340242a79", 46 | "showTitle": false, 47 | "tableResultSettingsMap": {}, 48 | "title": "" 49 | } 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "df1=spark.read.csv(path=\"/Volumes/we47catalog/we47schema/we47_volume/we47_dir1/custs_header\",header=True,inferSchema=True)\n", 54 | "df1.show(2)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "application/vnd.databricks.v1+cell": { 61 | "cellMetadata": {}, 62 | "inputWidgets": {}, 63 | "nuid": "4de366d1-b620-4eb3-8014-943802d1b67a", 64 | "showTitle": false, 65 | "tableResultSettingsMap": {}, 66 | "title": "" 67 | } 68 | }, 69 | "source": [ 70 | "### Writing the data in Builtin - different file formats & different targets (all targets in this world we can write the data also...)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "application/vnd.databricks.v1+cell": { 77 | "cellMetadata": {}, 78 | "inputWidgets": {}, 79 | "nuid": "265d5989-ca7e-4f2f-8e6a-93e75c23948d", 80 | "showTitle": false, 81 | "tableResultSettingsMap": {}, 82 | "title": "" 83 | } 84 | }, 85 | "source": [ 86 | "####1. Writing in csv format with few basic options listed below\n", 87 | "- header\n", 88 | "- sep\n", 89 | "- mode" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 0, 95 | "metadata": { 96 | "application/vnd.databricks.v1+cell": { 97 | "cellMetadata": { 98 | "byteLimit": 2048000, 99 | "rowLimit": 10000 100 | }, 101 | "inputWidgets": {}, 102 | "nuid": "a0ae6f6b-aae1-4749-ad83-da37c28e41bc", 103 | "showTitle": false, 104 | "tableResultSettingsMap": {}, 105 | "title": "" 106 | } 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "\n", 111 | "#We did a schema migration from comma to tilde delimiter\n", 112 | "df1.write.csv(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/csv_targetdata\",header=True,sep='~',mode='overwrite')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": { 118 | "application/vnd.databricks.v1+cell": { 119 | "cellMetadata": {}, 120 | "inputWidgets": {}, 121 | "nuid": "a29a4365-30ff-49c1-af16-5ddbbaa9b3ca", 122 | "showTitle": false, 123 | "tableResultSettingsMap": {}, 124 | "title": "" 125 | } 126 | }, 127 | "source": [ 128 | "####2. Writing in json format with few basic options listed below\n", 129 | "path
\n", 130 | "mode\n", 131 | "- We did a schema migration and data conversion from csv to json format (ie structued to semi structured format)\n", 132 | "- json - we learn a lot subsequently, \n", 133 | "- what is json - fundamentally it is a dictionary of dictionaries\n", 134 | "- json - java script object notation\n", 135 | "- format - {\"k1\":v1,\"k2\":v2,\"k3\":v2} where key has to be unique & enclosed in double quotes and value can be anything\n", 136 | "- **when to go with json or benifits** - \n", 137 | "- a. If we have data in a semistructure format(variable data format with dynamic schema)\n", 138 | "- b. columns and the types and the order can be different\n", 139 | "- c. json will be provided by the sources if the data is dynamic in nature or if the data is api response in nature.\n", 140 | "- d. json is a efficient data format (serialized/encoded) for performing data exchange between applications via network & good for parsing also.\n", 141 | "- e. json can be used to group or create hierarchy of data in a complex or in a nested format." 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 0, 147 | "metadata": { 148 | "application/vnd.databricks.v1+cell": { 149 | "cellMetadata": { 150 | "byteLimit": 2048000, 151 | "rowLimit": 10000 152 | }, 153 | "inputWidgets": {}, 154 | "nuid": "f04f4317-79d6-4fe5-98e8-f41727c31739", 155 | "showTitle": false, 156 | "tableResultSettingsMap": {}, 157 | "title": "" 158 | } 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "df1.write.json(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/json_targetdata\",mode='append')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "application/vnd.databricks.v1+cell": { 169 | "cellMetadata": {}, 170 | "inputWidgets": {}, 171 | "nuid": "3d9b6ca7-9aa4-4d18-bb18-0d62829dddd2", 172 | "showTitle": false, 173 | "tableResultSettingsMap": {}, 174 | "title": "" 175 | } 176 | }, 177 | "source": [ 178 | "####3.Serialization & Deserialization File formats (Brainy File formats)\n", 179 | "What are the (builtin) serialized file formats we are going to learn?\n", 180 | "orc\n", 181 | "parquet\n", 182 | "delta(databricks properatory)\n", 183 | "\n", 184 | "- We did a schema migration and data conversion from csv/json to serialized data format (ie structued to sturctured(internall binary unstructured) format)\n", 185 | "- We learn/use a lot/heavily subsequently, \n", 186 | "- what is serialized - fundamentally they are intelligent/encoded/serialized/binary data formats applied with lot of optimization & space reduction strategies..\n", 187 | "- orc - optimized row column format\n", 188 | "- parquet - tiled data format\n", 189 | "- delta(databricks properatory) enriched parquet format - Delta (modified) operations can be performed\n", 190 | "- format - serialized/encoded , we can't see with mere eyes, only some library is used deserialized/decoded data can be accessed as structured data\n", 191 | "- **when to go with serialized or benifits** - \n", 192 | "- a. For storage benifits for eg. orc will save 65+% of space for eg. if i store 1gb data it occupy 350 space, with compression it can improved more...\n", 193 | "- b. For processing optimization. Orc/parquet/delta will provide the required data alone if you query using Pushdown optimization .\n", 194 | "- c. Interoperability feature - this data format can be understandable in multiple environments for eg. bigquery can parse this data.\n", 195 | "- d. 
Secured\n", 196 | "- **In the projects/environments when to use what fileformats - we learn in detail later..." 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 0, 202 | "metadata": { 203 | "application/vnd.databricks.v1+cell": { 204 | "cellMetadata": { 205 | "byteLimit": 2048000, 206 | "rowLimit": 10000 207 | }, 208 | "inputWidgets": {}, 209 | "nuid": "f4fb6848-a995-4977-a1d0-ff547c686cd5", 210 | "showTitle": false, 211 | "tableResultSettingsMap": {}, 212 | "title": "" 213 | } 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "df1.write.orc(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/orc_targetdata\",mode='ignore')#serialization\n", 218 | "spark.read.orc(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/orc_targetdata\").show(2)#deserialization" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 0, 224 | "metadata": { 225 | "application/vnd.databricks.v1+cell": { 226 | "cellMetadata": { 227 | "byteLimit": 2048000, 228 | "rowLimit": 10000 229 | }, 230 | "inputWidgets": {}, 231 | "nuid": "aed8a769-b528-44ee-879f-b8c145e72c80", 232 | "showTitle": false, 233 | "tableResultSettingsMap": {}, 234 | "title": "" 235 | } 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "df1.write.option(\"maxRecordsPerFile\",1).parquet(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/parquet_targetdata2\",mode='error',compression='gzip')" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 0, 245 | "metadata": { 246 | "application/vnd.databricks.v1+cell": { 247 | "cellMetadata": { 248 | "byteLimit": 2048000, 249 | "rowLimit": 10000 250 | }, 251 | "inputWidgets": {}, 252 | "nuid": "27ff59ea-6793-4d25-9896-a14460d241a0", 253 | "showTitle": false, 254 | "tableResultSettingsMap": {}, 255 | "title": "" 256 | } 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "#df1.write.delta(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/delta_targetdata\")\n", 261 | "df1.write.format(\"delta\").save(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/delta_targetdata\",mode='overwrite')\n", 262 | "spark.read.format(\"delta\").load(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/delta_targetdata\").show(2)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 0, 268 | "metadata": { 269 | "application/vnd.databricks.v1+cell": { 270 | "cellMetadata": { 271 | "byteLimit": 2048000, 272 | "rowLimit": 10000 273 | }, 274 | "inputWidgets": {}, 275 | "nuid": "2b1d83f7-b903-42f8-aa39-3094dba9b94d", 276 | "showTitle": false, 277 | "tableResultSettingsMap": {}, 278 | "title": "" 279 | } 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "#What is the default format of file will be generated with, when we don't mention the format explicitly?\n", 284 | "#It is Parquet(Delta)\n", 285 | "df1.write.save(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/what_targetdata\",mode='overwrite')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "application/vnd.databricks.v1+cell": { 292 | "cellMetadata": {}, 293 | "inputWidgets": {}, 294 | "nuid": "5bb4ab26-481f-4b00-a5cb-675b105863d2", 295 | "showTitle": false, 296 | "tableResultSettingsMap": {}, 297 | "title": "" 298 | } 299 | }, 300 | "source": [ 301 | "####4.Table Load Operations - Building LAKEHOUSE ON TOP OF 
DATALAKE\n", 302 | "Can we do SQL operations directly on the tables like a database or datawarehouse? or Can we build a Lakehouse in Databricks?\n", 303 | "- We learn/use a lot/heavily subsequently, \n", 304 | "- what is Lakehouse - A SQL/Datawarehouse/Query layer on top of the Datalake is called Lakehouse\n", 305 | "- We have different lakehouses which we are going to learn further - \n", 306 | "1. delta tables (lakehouse) in databricks\n", 307 | "2. hive in onprem\n", 308 | "3. bigquery in GCP\n", 309 | "4. synapse in azure\n", 310 | "5. athena in aws\n", 311 | "- **when to go with lakehouse** - \n", 312 | "- a. Transformation\n", 313 | "- b. Analysis/Analytics\n", 314 | "- c. AI/BI\n", 315 | "- d. Literally we are going to learn SQL & Advanced SQL" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 0, 321 | "metadata": { 322 | "application/vnd.databricks.v1+cell": { 323 | "cellMetadata": { 324 | "byteLimit": 2048000, 325 | "rowLimit": 10000 326 | }, 327 | "inputWidgets": {}, 328 | "nuid": "ad625815-1e9d-4917-b87a-8e8d756bee72", 329 | "showTitle": false, 330 | "tableResultSettingsMap": {}, 331 | "title": "" 332 | } 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "#Out of 18 write.functions, we know 9 functions, lets go with few more basic functions (xml, saveAsTable,InsertInto)\n", 337 | "df1.write.saveAsTable(\"default.customertbl\",mode='overwrite')#default delta format\n", 338 | "spark.read.table(\"default.customertbl\").show(2)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 0, 344 | "metadata": { 345 | "application/vnd.databricks.v1+cell": { 346 | "cellMetadata": { 347 | "byteLimit": 2048000, 348 | "rowLimit": 10000 349 | }, 350 | "inputWidgets": {}, 351 | "nuid": "9f323284-afde-43f8-9a7c-c0838afa3391", 352 | "showTitle": false, 353 | "tableResultSettingsMap": {}, 354 | "title": "" 355 | } 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "#Notes Unlike :meth:`DataFrameWriter.saveAsTable`, :meth:`DataFrameWriter.insertInto` ignores the column names and just uses position-based resolution.\n", 360 | "# table has to be present already\n", 361 | "# this will be used for some minimal data write operation hence preferred function is saveAsTable()\n", 362 | "df1.write.insertInto(\"customertbl\",overwrite=True)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "application/vnd.databricks.v1+cell": { 369 | "cellMetadata": {}, 370 | "inputWidgets": {}, 371 | "nuid": "7586d9b9-5766-44e9-9c4a-51c1805f316c", 372 | "showTitle": false, 373 | "tableResultSettingsMap": {}, 374 | "title": "" 375 | } 376 | }, 377 | "source": [ 378 | "####5. XML Format - Semi structured data format (most of the json features can be applied in xml also, but in DE world not so famous like json)\n", 379 | "- Used rarely on demand (by certain target/source systems eg. 
mainframes)\n", 380 | "- Can be related with json, but not so much efficient like json\n", 381 | "- Databricks provides xml as a inbuild function" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 0, 387 | "metadata": { 388 | "application/vnd.databricks.v1+cell": { 389 | "cellMetadata": { 390 | "byteLimit": 2048000, 391 | "rowLimit": 10000 392 | }, 393 | "inputWidgets": {}, 394 | "nuid": "82e3bc79-afad-4cbb-a84a-1f98f0f06c1f", 395 | "showTitle": false, 396 | "tableResultSettingsMap": {}, 397 | "title": "" 398 | } 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "df1.write.xml(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/xml_targetdata\",rowTag='customer',mode='overwrite')" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 0, 408 | "metadata": { 409 | "application/vnd.databricks.v1+cell": { 410 | "cellMetadata": { 411 | "byteLimit": 2048000, 412 | "rowLimit": 10000 413 | }, 414 | "inputWidgets": {}, 415 | "nuid": "a1c88733-687d-4f52-b864-1eec9a7eb87e", 416 | "showTitle": false, 417 | "tableResultSettingsMap": {}, 418 | "title": "" 419 | } 420 | }, 421 | "outputs": [], 422 | "source": [ 423 | "spark.read.xml(\"/Volumes/workspace/default/volumewe47_datalake/serialized_compressed_data_sources/xml_targetdata\",rowTag='customer').show(2)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": { 429 | "application/vnd.databricks.v1+cell": { 430 | "cellMetadata": {}, 431 | "inputWidgets": {}, 432 | "nuid": "7abb6500-0b7e-4339-b814-3e356c78d7ce", 433 | "showTitle": false, 434 | "tableResultSettingsMap": {}, 435 | "title": "" 436 | } 437 | }, 438 | "source": [ 439 | "### Modes in Writing\n", 440 | "1. **Append** - Adds the new data to the existing data. It does not overwrite anything.\n", 441 | "2. **Overwrite** - Replaces the existing data entirely at the destination.\n", 442 | "3. **ErrorIfexist**(default) - Throws an error if data already exists at the destination.\n", 443 | "4. **Ignore** - Skips the write operation if data already exists at the destination." 
444 | ] 445 | } 446 | ], 447 | "metadata": { 448 | "application/vnd.databricks.v1+notebook": { 449 | "computePreferences": null, 450 | "dashboards": [], 451 | "environmentMetadata": { 452 | "base_environment": "", 453 | "environment_version": "3" 454 | }, 455 | "inputWidgetPreferences": null, 456 | "language": "python", 457 | "notebookMetadata": { 458 | "mostRecentlyExecutedCommandWithImplicitDF": { 459 | "commandId": 7347217471020383, 460 | "dataframes": [ 461 | "_sqldf" 462 | ] 463 | }, 464 | "pythonIndentUnit": 4 465 | }, 466 | "notebookName": "3-Basic-WriteOps", 467 | "widgets": {} 468 | }, 469 | "language_info": { 470 | "name": "python" 471 | } 472 | }, 473 | "nbformat": 4, 474 | "nbformat_minor": 0 475 | } 476 | -------------------------------------------------------------------------------- /databricks_workouts_2025/2_Spark_DataFrame_Read_Write_Operations/read_write_usecases.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "8ba86a20-5a3a-4130-86f5-e312f4a7901b", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "#Telecom Domain Read & Write Ops Assignment - Building Datalake & Lakehouse\n", 17 | "This notebook contains assignments to practice Spark read options and Databricks volumes.
\n", 18 | "Sections: Sample data creation, Catalog & Volume creation, Copying data into Volumes, Path glob/recursive reads, toDF() column renaming variants, inferSchema/header/separator experiments, and exercises.
" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": {}, 26 | "inputWidgets": {}, 27 | "nuid": "841c7ed8-ef18-486a-8187-07685e499b84", 28 | "showTitle": false, 29 | "tableResultSettingsMap": {}, 30 | "title": "" 31 | } 32 | }, 33 | "source": [ 34 | "![](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)\n", 35 | "![](https://theciotimes.com/wp-content/uploads/2021/03/TELECOM1.jpg)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "application/vnd.databricks.v1+cell": { 42 | "cellMetadata": {}, 43 | "inputWidgets": {}, 44 | "nuid": "d4aa0a44-8cd6-41cf-921d-abb5ff67615b", 45 | "showTitle": false, 46 | "tableResultSettingsMap": {}, 47 | "title": "" 48 | } 49 | }, 50 | "source": [ 51 | "##First Import all required libraries & Create spark session object" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "application/vnd.databricks.v1+cell": { 58 | "cellMetadata": {}, 59 | "inputWidgets": {}, 60 | "nuid": "d0b67823-2e4e-45e2-aa25-80550a3ac580", 61 | "showTitle": false, 62 | "tableResultSettingsMap": {}, 63 | "title": "" 64 | } 65 | }, 66 | "source": [ 67 | "##1. Write SQL statements to create:\n", 68 | "1. A catalog named telecom_catalog_assign\n", 69 | "2. A schema landing_zone\n", 70 | "3. A volume landing_vol\n", 71 | "4. Using dbutils.fs.mkdirs, create folders:
\n", 72 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/\n", 73 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/\n", 74 | "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/\n", 75 | "5. Explain the difference between (Just google and understand why we are going for volume concept for prod ready systems):
\n", 76 | "a. Volume vs DBFS/FileStore
\n", 77 | "b. Why production teams prefer Volumes for regulated data
" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "application/vnd.databricks.v1+cell": { 84 | "cellMetadata": {}, 85 | "inputWidgets": {}, 86 | "nuid": "26d8bd3d-b575-448b-ae22-8173d15ca671", 87 | "showTitle": false, 88 | "tableResultSettingsMap": {}, 89 | "title": "" 90 | } 91 | }, 92 | "source": [ 93 | "##Data files to use in this usecase:\n", 94 | "customer_csv = '''\n", 95 | "101,Arun,31,Chennai,PREPAID\n", 96 | "102,Meera,45,Bangalore,POSTPAID\n", 97 | "103,Irfan,29,Hyderabad,PREPAID\n", 98 | "104,Raj,52,Mumbai,POSTPAID\n", 99 | "105,,27,Delhi,PREPAID\n", 100 | "106,Sneha,abc,Pune,PREPAID\n", 101 | "'''\n", 102 | "\n", 103 | "usage_tsv = '''customer_id\\tvoice_mins\\tdata_mb\\tsms_count\n", 104 | "101\\t320\\t1500\\t20\n", 105 | "102\\t120\\t4000\\t5\n", 106 | "103\\t540\\t600\\t52\n", 107 | "104\\t45\\t200\\t2\n", 108 | "105\\t0\\t0\\t0\n", 109 | "'''\n", 110 | "\n", 111 | "tower_logs_region1 = '''event_id|customer_id|tower_id|signal_strength|timestamp\n", 112 | "5001|101|TWR01|-80|2025-01-10 10:21:54\n", 113 | "5004|104|TWR05|-75|2025-01-10 11:01:12\n", 114 | "'''" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "application/vnd.databricks.v1+cell": { 121 | "cellMetadata": {}, 122 | "inputWidgets": {}, 123 | "nuid": "9540d2e2-2562-4be7-897f-0a7d57adaa72", 124 | "showTitle": false, 125 | "tableResultSettingsMap": {}, 126 | "title": "" 127 | } 128 | }, 129 | "source": [ 130 | "##2. Filesystem operations\n", 131 | "1. Write dbutils.fs code to copy the above datasets into your created Volume folders:\n", 132 | "Customer → /Volumes/.../customer/\n", 133 | "Usage → /Volumes/.../usage/\n", 134 | "Tower (region-based) → /Volumes/.../tower/region1/ and /Volumes/.../tower/region2/\n", 135 | "\n", 136 | "2. Write a command to validate whether files were successfully copied" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "application/vnd.databricks.v1+cell": { 143 | "cellMetadata": {}, 144 | "inputWidgets": {}, 145 | "nuid": "8767735b-24d3-428a-ad12-ae821903e2ce", 146 | "showTitle": false, 147 | "tableResultSettingsMap": {}, 148 | "title": "" 149 | } 150 | }, 151 | "source": [ 152 | "##3. Spark Directory Read Use Cases\n", 153 | "1. Read all tower logs using:\n", 154 | "Path glob filter (example: *.csv)\n", 155 | "Multiple paths input\n", 156 | "Recursive lookup\n", 157 | "\n", 158 | "2. Demonstrate these 3 reads separately:\n", 159 | "Using pathGlobFilter\n", 160 | "Using list of paths in spark.read.csv([path1, path2])\n", 161 | "Using .option(\"recursiveFileLookup\",\"true\")\n", 162 | "\n", 163 | "3. Compare the outputs and understand when each should be used." 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "application/vnd.databricks.v1+cell": { 170 | "cellMetadata": {}, 171 | "inputWidgets": {}, 172 | "nuid": "9f7147c1-5d58-47e1-84fe-7ebd26a217b9", 173 | "showTitle": false, 174 | "tableResultSettingsMap": {}, 175 | "title": "" 176 | } 177 | }, 178 | "source": [ 179 | "##4. Schema Inference, Header, and Separator\n", 180 | "1. Try the Customer, Usage files with the option and options using read.csv and format function:
\n", 181 | "header=false, inferSchema=false
\n", 182 | "or
\n", 183 | "header=true, inferSchema=true
\n", 184 | "2. Write a note on What changed when we use header or inferSchema with true/false?
\n", 185 | "3. How schema inference handled “abc” in age?
" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "application/vnd.databricks.v1+cell": { 192 | "cellMetadata": {}, 193 | "inputWidgets": {}, 194 | "nuid": "15d8dad0-bc63-47f1-9a90-72837cba6c4f", 195 | "showTitle": false, 196 | "tableResultSettingsMap": {}, 197 | "title": "" 198 | } 199 | }, 200 | "source": [ 201 | "##5. Column Renaming Usecases\n", 202 | "1. Apply column names using string using toDF function for customer data\n", 203 | "2. Apply column names and datatype using the schema function for usage data\n", 204 | "3. Apply column names and datatype using the StructType with IntegerType, StringType, TimestampType and other classes for towers data " 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "application/vnd.databricks.v1+cell": { 211 | "cellMetadata": {}, 212 | "inputWidgets": {}, 213 | "nuid": "6e1d6d88-7bcc-4548-a0d1-15d37f6fc0be", 214 | "showTitle": false, 215 | "tableResultSettingsMap": {}, 216 | "title": "" 217 | } 218 | }, 219 | "source": [ 220 | "## Spark Write Operations using \n", 221 | "- csv, json, orc, parquet, delta, saveAsTable, insertInto, xml with different write mode, header and sep options" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": { 227 | "application/vnd.databricks.v1+cell": { 228 | "cellMetadata": {}, 229 | "inputWidgets": {}, 230 | "nuid": "8e34c3bc-962d-438d-a1b6-ac27d2da6608", 231 | "showTitle": false, 232 | "tableResultSettingsMap": {}, 233 | "title": "" 234 | } 235 | }, 236 | "source": [ 237 | "##6. Write Operations (Data Conversion/Schema migration) – CSV Format Usecases\n", 238 | "1. Write customer data into CSV format using overwrite mode\n", 239 | "2. Write usage data into CSV format using append mode\n", 240 | "3. Write tower data into CSV format with header enabled and custom separator (|)\n", 241 | "4. Read the tower data in a dataframe and show only 5 rows.\n", 242 | "5. Download the file into local from the catalog volume location and see the data of any of the above files opening in a notepad++." 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "application/vnd.databricks.v1+cell": { 249 | "cellMetadata": {}, 250 | "inputWidgets": {}, 251 | "nuid": "34158cf6-dd7f-40d6-9969-ed76710540a4", 252 | "showTitle": false, 253 | "tableResultSettingsMap": {}, 254 | "title": "" 255 | } 256 | }, 257 | "source": [ 258 | "##7. Write Operations (Data Conversion/Schema migration)– JSON Format Usecases\n", 259 | "1. Write customer data into JSON format using overwrite mode\n", 260 | "2. Write usage data into JSON format using append mode and snappy compression format\n", 261 | "3. Write tower data into JSON format using ignore mode and observe the behavior of this mode\n", 262 | "4. Read the tower data in a dataframe and show only 5 rows.\n", 263 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++." 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "application/vnd.databricks.v1+cell": { 270 | "cellMetadata": {}, 271 | "inputWidgets": {}, 272 | "nuid": "26f2ba69-3cde-4ec6-8945-e4ef9f7bb109", 273 | "showTitle": false, 274 | "tableResultSettingsMap": {}, 275 | "title": "" 276 | } 277 | }, 278 | "source": [ 279 | "##8. Write Operations (Data Conversion/Schema migration) – Parquet Format Usecases\n", 280 | "1. Write customer data into Parquet format using overwrite mode and in a gzip format\n", 281 | "2. 
Write usage data into Parquet format using error mode\n", 282 | "3. Write tower data into Parquet format with gzip compression option\n", 283 | "4. Read the usage data in a dataframe and show only 5 rows.\n", 284 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++." 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "application/vnd.databricks.v1+cell": { 291 | "cellMetadata": {}, 292 | "inputWidgets": {}, 293 | "nuid": "b41c794f-5cfc-4aeb-a599-e6d4a47a0f3f", 294 | "showTitle": false, 295 | "tableResultSettingsMap": {}, 296 | "title": "" 297 | } 298 | }, 299 | "source": [ 300 | "##9. Write Operations (Data Conversion/Schema migration) – Orc Format Usecases\n", 301 | "1. Write customer data into ORC format using overwrite mode\n", 302 | "2. Write usage data into ORC format using append mode\n", 303 | "3. Write tower data into ORC format and see the output file structure\n", 304 | "4. Read the usage data in a dataframe and show only 5 rows.\n", 305 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++." 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "application/vnd.databricks.v1+cell": { 312 | "cellMetadata": {}, 313 | "inputWidgets": {}, 314 | "nuid": "35761315-0b0f-46ff-9c3d-c0405bce7b62", 315 | "showTitle": false, 316 | "tableResultSettingsMap": {}, 317 | "title": "" 318 | } 319 | }, 320 | "source": [ 321 | "##10. Write Operations (Data Conversion/Schema migration) – Delta Format Usecases\n", 322 | "1. Write customer data into Delta format using overwrite mode\n", 323 | "2. Write usage data into Delta format using append mode\n", 324 | "3. Write tower data into Delta format and see the output file structure\n", 325 | "4. Read the usage data in a dataframe and show only 5 rows.\n", 326 | "5. Download the file into local harddisk from the catalog volume location and see the data of any of the above files opening in a notepad++.\n", 327 | "6. Compare the parquet location and delta location and try to understand what is the differentiating factor, as both are parquet files only." 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "application/vnd.databricks.v1+cell": { 334 | "cellMetadata": {}, 335 | "inputWidgets": {}, 336 | "nuid": "e6dd0890-02bd-4acd-b837-daceb256c706", 337 | "showTitle": false, 338 | "tableResultSettingsMap": {}, 339 | "title": "" 340 | } 341 | }, 342 | "source": [ 343 | "##11. Write Operations (Lakehouse Usecases) – Delta table Usecases\n", 344 | "1. Write customer data using saveAsTable() as a managed table\n", 345 | "2. Write usage data using saveAsTable() with overwrite mode\n", 346 | "3. Drop the managed table and verify data removal\n", 347 | "4. Go and check the table overview and realize it is in delta format in the Catalog.\n", 348 | "5. Use spark.read.sql to write some simple queries on the above tables created.\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": { 354 | "application/vnd.databricks.v1+cell": { 355 | "cellMetadata": {}, 356 | "inputWidgets": {}, 357 | "nuid": "1aac447b-690b-4562-99dd-0ce096e9ad55", 358 | "showTitle": false, 359 | "tableResultSettingsMap": {}, 360 | "title": "" 361 | } 362 | }, 363 | "source": [ 364 | "##12. Write Operations (Lakehouse Usecases) – Delta table Usecases\n", 365 | "1. 
Write customer data using insertInto() in a new table and find the behavior\n", 366 | "2. Write usage data using insertTable() with overwrite mode" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "application/vnd.databricks.v1+cell": { 373 | "cellMetadata": {}, 374 | "inputWidgets": {}, 375 | "nuid": "e3c4bce3-4bd3-4db6-a074-02bb24c5f91a", 376 | "showTitle": false, 377 | "tableResultSettingsMap": {}, 378 | "title": "" 379 | } 380 | }, 381 | "source": [ 382 | "##13. Write Operations (Lakehouse Usecases) – Delta table Usecases\n", 383 | "1. Write customer data into XML format using rowTag as cust\n", 384 | "2. Write usage data into XML format using overwrite mode with the rowTag as usage\n", 385 | "3. Download the xml data and open the file in notepad++ and see how the xml file looks like." 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "application/vnd.databricks.v1+cell": { 392 | "cellMetadata": {}, 393 | "inputWidgets": {}, 394 | "nuid": "83e2fe69-9352-4ec9-bf70-15d760c89aa3", 395 | "showTitle": false, 396 | "tableResultSettingsMap": {}, 397 | "title": "" 398 | } 399 | }, 400 | "source": [ 401 | "##14. Compare all the downloaded files (csv, json, orc, parquet, delta and xml) \n", 402 | "1. Capture the size occupied between all of these file formats and list the formats below based on the order of size from small to big." 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "application/vnd.databricks.v1+cell": { 409 | "cellMetadata": {}, 410 | "inputWidgets": {}, 411 | "nuid": "7d4d5ddc-359f-48fe-bac4-c40271b48163", 412 | "showTitle": false, 413 | "tableResultSettingsMap": {}, 414 | "title": "" 415 | } 416 | }, 417 | "source": [ 418 | "###15. Try to do permutation and combination of performing Schema Migration & Data Conversion operations like...\n", 419 | "1. Read any one of the above orc data in a dataframe and write it to dbfs in a parquet format\n", 420 | "2. Read any one of the above parquet data in a dataframe and write it to dbfs in a delta format\n", 421 | "3. Read any one of the above delta data in a dataframe and write it to dbfs in a xml format\n", 422 | "4. Read any one of the above delta table in a dataframe and write it to dbfs in a json format\n", 423 | "5. Read any one of the above delta table in a dataframe and write it to another table" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": { 429 | "application/vnd.databricks.v1+cell": { 430 | "cellMetadata": {}, 431 | "inputWidgets": {}, 432 | "nuid": "3d6e39ec-752d-4183-9656-2b6d7938922d", 433 | "showTitle": false, 434 | "tableResultSettingsMap": {}, 435 | "title": "" 436 | } 437 | }, 438 | "source": [ 439 | "##16. Do a final exercise of defining one/two liner of... \n", 440 | "1. When to use/benifits csv\n", 441 | "2. When to use/benifits json\n", 442 | "3. When to use/benifit orc\n", 443 | "4. When to use/benifit parquet\n", 444 | "5. When to use/benifit delta\n", 445 | "6. When to use/benifit xml\n", 446 | "7. 
When to use/benifit delta tables\n" 447 | ] 448 | } 449 | ], 450 | "metadata": { 451 | "application/vnd.databricks.v1+notebook": { 452 | "computePreferences": null, 453 | "dashboards": [], 454 | "environmentMetadata": { 455 | "base_environment": "", 456 | "environment_version": "4" 457 | }, 458 | "inputWidgetPreferences": null, 459 | "language": "python", 460 | "notebookMetadata": { 461 | "mostRecentlyExecutedCommandWithImplicitDF": { 462 | "commandId": -1, 463 | "dataframes": [ 464 | "_sqldf" 465 | ] 466 | }, 467 | "pythonIndentUnit": 4 468 | }, 469 | "notebookName": "read_write_usecases", 470 | "widgets": {} 471 | }, 472 | "language_info": { 473 | "name": "python" 474 | } 475 | }, 476 | "nbformat": 4, 477 | "nbformat_minor": 0 478 | } 479 | -------------------------------------------------------------------------------- /databricks_workouts_2025_WE47/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/1_Explore_Notebooks_magic_commands.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": { 8 | "byteLimit": 2048000, 9 | "rowLimit": 10000 10 | }, 11 | "inputWidgets": {}, 12 | "nuid": "2c477351-9470-4b26-8f73-6f967c37729e", 13 | "showTitle": false, 14 | "tableResultSettingsMap": {}, 15 | "title": "" 16 | } 17 | }, 18 | "source": [ 19 | "#Welcome to Inceptez Technologies\n", 20 | "Let us understand about creating notebooks & magical commands\n", 21 | "https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png\n", 22 | "![](https://fplogoimages.withfloats.com/actual/68009c3a43430aff8a30419d.png)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "application/vnd.databricks.v1+cell": { 29 | "cellMetadata": { 30 | "byteLimit": 2048000, 31 | "rowLimit": 10000 32 | }, 33 | "inputWidgets": {}, 34 | "nuid": "3ae38262-0e89-4a3b-9130-322375328fe4", 35 | "showTitle": false, 36 | "tableResultSettingsMap": {}, 37 | "title": "" 38 | } 39 | }, 40 | "source": [ 41 | "##Let us learn first about Magical Commands\n", 42 | "**Important Magic Commands**\n", 43 | "- %md: allows you to write markdown text to design the notebook.\n", 44 | "- %run: runs a Python file or a notebook.\n", 45 | "- %sh: executes shell commands on the cluster nodes.\n", 46 | "- %fs: allows you to interact with the Databricks file system (Datalake command)\n", 47 | "- %sql: allows you to run Spark SQL/HQL queries.\n", 48 | "- %python: switches the notebook context to Python.\n", 49 | "- %pip: allows you to install Python packages.\n", 50 | "\n", 51 | "**Not Important Magic Commands or We learn few of these where we have Cloud(Azure) dependency**\n", 52 | "- %scala: switches the notebook context to Scala.\n", 53 | "- %r: switches the notebook context to R.\n", 54 | "- %lsmagic: lists all the available magic commands.\n", 55 | "- %config: allows you to set configuration options for the notebook.\n", 56 | "- %load: loads the contents of a file into a cell.\n", 57 | "- %who: lists all the variables in the current scope." 
58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "application/vnd.databricks.v1+cell": { 64 | "cellMetadata": { 65 | "byteLimit": 2048000, 66 | "rowLimit": 10000 67 | }, 68 | "inputWidgets": {}, 69 | "nuid": "b35163da-292b-4c60-b6a9-a62521e22343", 70 | "showTitle": false, 71 | "tableResultSettingsMap": {}, 72 | "title": "" 73 | } 74 | }, 75 | "source": [ 76 | "####How to call a notebook from the current notebook using %run magic command" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 0, 82 | "metadata": { 83 | "application/vnd.databricks.v1+cell": { 84 | "cellMetadata": { 85 | "byteLimit": 2048000, 86 | "rowLimit": 10000 87 | }, 88 | "inputWidgets": {}, 89 | "nuid": "cb391d1b-fb52-49ab-a463-2849a7f60fea", 90 | "showTitle": false, 91 | "tableResultSettingsMap": {}, 92 | "title": "" 93 | } 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "%run \"/Workspace/Users/infoblisstech@gmail.com/databricks-code-repo/databricks_workouts_2025/1_DATABRICKS_NOTEBOOK_FUNDAMENTALS/4_child_notebook\"" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "application/vnd.databricks.v1+cell": { 104 | "cellMetadata": { 105 | "byteLimit": 2048000, 106 | "rowLimit": 10000 107 | }, 108 | "inputWidgets": {}, 109 | "nuid": "7b7260e3-4612-4e86-ac30-51e7c91fe669", 110 | "showTitle": false, 111 | "tableResultSettingsMap": {}, 112 | "title": "" 113 | } 114 | }, 115 | "source": [ 116 | "####How to run a linux commands inside a notebook using %sh magic command" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 0, 122 | "metadata": { 123 | "application/vnd.databricks.v1+cell": { 124 | "cellMetadata": { 125 | "byteLimit": 2048000, 126 | "rowLimit": 10000 127 | }, 128 | "inputWidgets": {}, 129 | "nuid": "7f274612-5fb2-4589-86c6-d236abeb9aba", 130 | "showTitle": false, 131 | "tableResultSettingsMap": {}, 132 | "title": "" 133 | } 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "%sh\n", 138 | "ls -l /databricks-datasets/airlines\n", 139 | "head -1 /databricks-datasets/airlines/part-01902\n", 140 | "echo \"head completed\"\n", 141 | "tail -1 /databricks-datasets/airlines/part-01902" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "application/vnd.databricks.v1+cell": { 148 | "cellMetadata": {}, 149 | "inputWidgets": {}, 150 | "nuid": "85ffa4e3-dc14-4ea4-bab4-94151bc22f9d", 151 | "showTitle": false, 152 | "tableResultSettingsMap": {}, 153 | "title": "" 154 | } 155 | }, 156 | "source": [ 157 | "We are going to use Databricks Unity Catalog (We don't know about it yet)\n", 158 | "to create tables and files under the volume (catalog/schema/volume/folder/files)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 0, 164 | "metadata": { 165 | "application/vnd.databricks.v1+cell": { 166 | "cellMetadata": { 167 | "byteLimit": 2048000, 168 | "implicitDf": true, 169 | "rowLimit": 10000 170 | }, 171 | "inputWidgets": {}, 172 | "nuid": "85719c48-7508-45d6-b879-d5ab1e956bcc", 173 | "showTitle": false, 174 | "tableResultSettingsMap": {}, 175 | "title": "" 176 | } 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "%sql\n", 181 | "CREATE VOLUME IF NOT EXISTS workspace.default.volumewe47_datalake;" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "application/vnd.databricks.v1+cell": { 188 | "cellMetadata": { 189 | "byteLimit": 2048000, 190 | "rowLimit": 10000 191 | }, 192 | "inputWidgets": {}, 193 | "nuid": 
"97a1ed02-dc2f-4eac-9b34-ed9237d3cd20", 194 | "showTitle": false, 195 | "tableResultSettingsMap": {}, 196 | "title": "" 197 | } 198 | }, 199 | "source": [ 200 | "####Upload some sample data going into (Catalog -> My Organization -> Workspace -> Default -> Volumes)
How to run a DBFS (like Hadoop) FS commands inside a notebook using %fs magic command to copy the uploaded data into some other volume from the uploaded volume" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 0, 206 | "metadata": { 207 | "application/vnd.databricks.v1+cell": { 208 | "cellMetadata": { 209 | "byteLimit": 2048000, 210 | "rowLimit": 10000 211 | }, 212 | "inputWidgets": {}, 213 | "nuid": "6947aa56-af9f-42de-8262-b7bc680203f7", 214 | "showTitle": false, 215 | "tableResultSettingsMap": {}, 216 | "title": "" 217 | } 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "%fs ls \"dbfs:///Volumes/workspace/default/volumewe47_datalake\"" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 0, 227 | "metadata": { 228 | "application/vnd.databricks.v1+cell": { 229 | "cellMetadata": { 230 | "byteLimit": 2048000, 231 | "rowLimit": 10000 232 | }, 233 | "inputWidgets": {}, 234 | "nuid": "e68958aa-a396-4a19-b4e5-3541c3d5d756", 235 | "showTitle": false, 236 | "tableResultSettingsMap": {}, 237 | "title": "" 238 | } 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "%fs cp \"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients.csv\" \"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy.csv\"" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "application/vnd.databricks.v1+cell": { 249 | "cellMetadata": {}, 250 | "inputWidgets": {}, 251 | "nuid": "0a8e80b8-4546-468b-87ed-7bb6012c1a7b", 252 | "showTitle": false, 253 | "tableResultSettingsMap": {}, 254 | "title": "" 255 | } 256 | }, 257 | "source": [ 258 | "Learning for the first time the dbutils, we learn in detail later\n", 259 | "Rather using fs command, we can use databricks utility command (comprehensive) to copy the data/any other filesystem operations in the DBFS" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 0, 265 | "metadata": { 266 | "application/vnd.databricks.v1+cell": { 267 | "cellMetadata": { 268 | "byteLimit": 2048000, 269 | "rowLimit": 10000 270 | }, 271 | "inputWidgets": {}, 272 | "nuid": "f29a64d1-087b-46f7-8e97-63c38991fd40", 273 | "showTitle": false, 274 | "tableResultSettingsMap": {}, 275 | "title": "" 276 | } 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "%python\n", 281 | "dbutils.fs.cp(\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients.csv\",\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy2.csv\")\n", 282 | "dbutils.fs.rm(\"dbfs:/Volumes/workspace/default/volumewe47_datalake/patients_copy.csv\")" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "application/vnd.databricks.v1+cell": { 289 | "cellMetadata": { 290 | "byteLimit": 2048000, 291 | "rowLimit": 10000 292 | }, 293 | "inputWidgets": {}, 294 | "nuid": "a43dd280-49bc-4f22-97fb-08a6b23218cc", 295 | "showTitle": false, 296 | "tableResultSettingsMap": {}, 297 | "title": "" 298 | } 299 | }, 300 | "source": [ 301 | "####How to run a Spark SQL/HQL Queries inside a notebook using %sql magic command" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 0, 307 | "metadata": { 308 | "application/vnd.databricks.v1+cell": { 309 | "cellMetadata": { 310 | "byteLimit": 2048000, 311 | "implicitDf": true, 312 | "rowLimit": 10000 313 | }, 314 | "inputWidgets": {}, 315 | "nuid": "0421bbb7-38d3-4c49-bc4a-28f0c30072e1", 316 | "showTitle": false, 317 | "tableResultSettingsMap": {}, 318 | "title": "" 319 | } 320 | }, 321 | "outputs": [], 322 | "source": [ 
323 | "%sql\n", 324 | "create table if not exists default.cities2(id int,city string);\n", 325 | "insert into default.cities2 values(3,'Mumbai'),(4,'Lucknow');\n", 326 | "select * from cities2;" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 0, 332 | "metadata": { 333 | "application/vnd.databricks.v1+cell": { 334 | "cellMetadata": { 335 | "byteLimit": 2048000, 336 | "rowLimit": 10000 337 | }, 338 | "inputWidgets": {}, 339 | "nuid": "610c68dd-c905-4abc-9f33-ad5623ba4dcb", 340 | "showTitle": false, 341 | "tableResultSettingsMap": {}, 342 | "title": "" 343 | } 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "%python\n", 348 | "#OOPS, FBP & Declarative (SQL)\n", 349 | "spark.sql(\"select * from cities2\").explain(True)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 0, 355 | "metadata": { 356 | "application/vnd.databricks.v1+cell": { 357 | "cellMetadata": { 358 | "byteLimit": 2048000, 359 | "implicitDf": true, 360 | "rowLimit": 10000 361 | }, 362 | "inputWidgets": {}, 363 | "nuid": "d1fe763e-6058-4ad3-980e-ae595fc32499", 364 | "showTitle": false, 365 | "tableResultSettingsMap": {}, 366 | "title": "" 367 | } 368 | }, 369 | "outputs": [], 370 | "source": [ 371 | "%sql\n", 372 | "update cities1 set city='Kolkata' where id=4;" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 0, 378 | "metadata": { 379 | "application/vnd.databricks.v1+cell": { 380 | "cellMetadata": { 381 | "byteLimit": 2048000, 382 | "implicitDf": true, 383 | "rowLimit": 10000 384 | }, 385 | "inputWidgets": {}, 386 | "nuid": "2cdb43f7-2838-42d6-a19b-60d2377d1812", 387 | "showTitle": false, 388 | "tableResultSettingsMap": {}, 389 | "title": "" 390 | } 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "%sql\n", 395 | "show create table cities1;" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 0, 401 | "metadata": { 402 | "application/vnd.databricks.v1+cell": { 403 | "cellMetadata": { 404 | "byteLimit": 2048000, 405 | "implicitDf": true, 406 | "rowLimit": 10000 407 | }, 408 | "inputWidgets": {}, 409 | "nuid": "3485856e-b35c-415f-a588-b74aadcbeafd", 410 | "showTitle": false, 411 | "tableResultSettingsMap": {}, 412 | "title": "" 413 | } 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "%sql\n", 418 | "from cities1 select *;" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": { 424 | "application/vnd.databricks.v1+cell": { 425 | "cellMetadata": { 426 | "byteLimit": 2048000, 427 | "rowLimit": 10000 428 | }, 429 | "inputWidgets": {}, 430 | "nuid": "0cb845bd-3334-4f02-8794-28797af494a0", 431 | "showTitle": false, 432 | "tableResultSettingsMap": {}, 433 | "title": "" 434 | } 435 | }, 436 | "source": [ 437 | "####How to run a Python Program inside a notebook using %python magic command or by default the cell will be enabled with python interpretter only" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 0, 443 | "metadata": { 444 | "application/vnd.databricks.v1+cell": { 445 | "cellMetadata": { 446 | "byteLimit": 2048000, 447 | "rowLimit": 10000 448 | }, 449 | "inputWidgets": {}, 450 | "nuid": "4745b1ba-c25c-4c6f-9a4b-6ab4ad27bd98", 451 | "showTitle": false, 452 | "tableResultSettingsMap": {}, 453 | "title": "" 454 | } 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "def sqrt(a):\n", 459 | " return a*a" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 0, 465 | "metadata": { 466 | "application/vnd.databricks.v1+cell": { 467 | 
"cellMetadata": { 468 | "byteLimit": 2048000, 469 | "rowLimit": 10000 470 | }, 471 | "inputWidgets": {}, 472 | "nuid": "86607897-fad2-454f-86ab-d1f639889cca", 473 | "showTitle": false, 474 | "tableResultSettingsMap": {}, 475 | "title": "" 476 | } 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "print(\"square root function call \",sqrt(10))" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": { 486 | "application/vnd.databricks.v1+cell": { 487 | "cellMetadata": {}, 488 | "inputWidgets": {}, 489 | "nuid": "bb77dbe6-2686-4f10-9781-6e37357e121e", 490 | "showTitle": false, 491 | "tableResultSettingsMap": {}, 492 | "title": "" 493 | } 494 | }, 495 | "source": [ 496 | "In the python magic cell itself, we already have spark session object instantiated,
\n", 497 | "so we can lavishly write spark programs" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 0, 503 | "metadata": { 504 | "application/vnd.databricks.v1+cell": { 505 | "cellMetadata": { 506 | "byteLimit": 2048000, 507 | "rowLimit": 10000 508 | }, 509 | "inputWidgets": {}, 510 | "nuid": "5f63c0dd-51cb-4c14-8dc3-f55ab0ec9009", 511 | "showTitle": false, 512 | "tableResultSettingsMap": { 513 | "0": { 514 | "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1765095105596}", 515 | "filterBlob": "{\"version\":1,\"filterGroups\":[],\"syncTimestamp\":1765600528360}", 516 | "queryPlanFiltersBlob": "[]", 517 | "tableResultIndex": 0 518 | } 519 | }, 520 | "title": "" 521 | } 522 | }, 523 | "outputs": [], 524 | "source": [ 525 | "%python\n", 526 | "from pyspark.sql.session import SparkSession\n", 527 | "#import pyspark.sql.session as sprk\n", 528 | "print(spark)\n", 529 | "spark1 = SparkSession.builder.appName(\"Spark DataFrames\").getOrCreate()\n", 530 | "print(spark1)\n", 531 | "df1 = spark1.read.csv(\"/Volumes/workspace/default/volumewe47_datalake/patients.csv\",header=True)\n", 532 | "display(df1)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 0, 538 | "metadata": { 539 | "application/vnd.databricks.v1+cell": { 540 | "cellMetadata": { 541 | "byteLimit": 2048000, 542 | "rowLimit": 10000 543 | }, 544 | "inputWidgets": {}, 545 | "nuid": "852e8016-9d82-465f-80df-f8cc0c4dab02", 546 | "showTitle": false, 547 | "tableResultSettingsMap": {}, 548 | "title": "" 549 | } 550 | }, 551 | "outputs": [], 552 | "source": [ 553 | "#DSL - Domain specific language\n", 554 | "df1.where(\"married='Yes'\").write.saveAsTable(\"default.we47_patients\")" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 0, 560 | "metadata": { 561 | "application/vnd.databricks.v1+cell": { 562 | "cellMetadata": { 563 | "byteLimit": 2048000, 564 | "implicitDf": true, 565 | "rowLimit": 10000 566 | }, 567 | "inputWidgets": {}, 568 | "nuid": "1307e5ac-e5fe-4578-9396-299a79f794ef", 569 | "showTitle": false, 570 | "tableResultSettingsMap": {}, 571 | "title": "" 572 | } 573 | }, 574 | "outputs": [], 575 | "source": [ 576 | "spark.sql(\"select count(1),InPatient from default.we47_patients group by 2\").explain(True)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 0, 582 | "metadata": { 583 | "application/vnd.databricks.v1+cell": { 584 | "cellMetadata": { 585 | "byteLimit": 2048000, 586 | "rowLimit": 10000 587 | }, 588 | "inputWidgets": {}, 589 | "nuid": "f5e2cf2a-3140-4aac-b8e6-babab71ed9bc", 590 | "showTitle": false, 591 | "tableResultSettingsMap": {}, 592 | "title": "" 593 | } 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "spark.read.table(\"default.we47_patients\").show(2)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": { 603 | "application/vnd.databricks.v1+cell": { 604 | "cellMetadata": { 605 | "byteLimit": 2048000, 606 | "rowLimit": 10000 607 | }, 608 | "inputWidgets": {}, 609 | "nuid": "1c3c9245-15f8-488a-99f1-c4ea979f607e", 610 | "showTitle": false, 611 | "tableResultSettingsMap": {}, 612 | "title": "" 613 | } 614 | }, 615 | "source": [ 616 | "####How to install additional libraries in this current Python Interpreter using %pip magic command" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 0, 622 | 
"metadata": { 623 | "application/vnd.databricks.v1+cell": { 624 | "cellMetadata": { 625 | "byteLimit": 2048000, 626 | "rowLimit": 10000 627 | }, 628 | "inputWidgets": {}, 629 | "nuid": "11ac3360-e8f1-4454-a37d-9ca2f965fcdd", 630 | "showTitle": false, 631 | "tableResultSettingsMap": {}, 632 | "title": "" 633 | } 634 | }, 635 | "outputs": [], 636 | "source": [ 637 | "%pip install pypi" 638 | ] 639 | } 640 | ], 641 | "metadata": { 642 | "application/vnd.databricks.v1+notebook": { 643 | "computePreferences": { 644 | "hardware": { 645 | "accelerator": null, 646 | "gpuPoolId": null, 647 | "memory": null 648 | } 649 | }, 650 | "dashboards": [], 651 | "environmentMetadata": null, 652 | "inputWidgetPreferences": null, 653 | "language": "python", 654 | "notebookMetadata": { 655 | "mostRecentlyExecutedCommandWithImplicitDF": { 656 | "commandId": -1, 657 | "dataframes": [ 658 | "_sqldf" 659 | ] 660 | }, 661 | "pythonIndentUnit": 4 662 | }, 663 | "notebookName": "1_Explore_Notebooks_magic_commands", 664 | "widgets": {} 665 | }, 666 | "language_info": { 667 | "name": "python" 668 | } 669 | }, 670 | "nbformat": 4, 671 | "nbformat_minor": 0 672 | } 673 | -------------------------------------------------------------------------------- /databricks_workouts_2025/2_Spark_DataFrame_Read_Write_Operations/3-Basic-WriteOps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "e3756d01-4aa7-45d1-bffa-b7e3afd67e3c", 10 | "showTitle": false, 11 | "tableResultSettingsMap": {}, 12 | "title": "" 13 | } 14 | }, 15 | "source": [ 16 | "#By Knowing this notebook, we can become an eligible \"Data Egress Developer/Engineer\"\n", 17 | "###We are writing data in Structured(csv), Semi Structured(JSON/XML), Serialized files (orc/parquet/delta) (Datalake)\n", 18 | "###Table (delta/hive) (Lakehouse) format" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": {}, 26 | "inputWidgets": {}, 27 | "nuid": "71713fcb-b659-4e62-bbee-1d3092f13683", 28 | "showTitle": false, 29 | "tableResultSettingsMap": {}, 30 | "title": "" 31 | } 32 | }, 33 | "source": [ 34 | "### Let's get some data we have already..." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 0, 40 | "metadata": { 41 | "application/vnd.databricks.v1+cell": { 42 | "cellMetadata": {}, 43 | "inputWidgets": {}, 44 | "nuid": "88e5b5fc-2d6c-4a71-983a-79f140d24e51", 45 | "showTitle": false, 46 | "tableResultSettingsMap": {}, 47 | "title": "" 48 | } 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "from pyspark.sql.session import SparkSession\n", 53 | "spark=SparkSession.builder.appName(\"Spark DataFrames\").getOrCreate()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 0, 59 | "metadata": { 60 | "application/vnd.databricks.v1+cell": { 61 | "cellMetadata": { 62 | "byteLimit": 2048000, 63 | "rowLimit": 10000 64 | }, 65 | "inputWidgets": {}, 66 | "nuid": "3f1f4ad3-4c7f-4f15-9011-731aa08e1d82", 67 | "showTitle": false, 68 | "tableResultSettingsMap": {}, 69 | "title": "" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "#Extract\n", 75 | "ingest_df1=spark.read.csv(\"/Volumes/workspace/wd36schema/ingestion_volume/source/custs_header\",header=True,sep=',',inferSchema=True,samplingRatio=0.10)\n", 76 | "\n", 77 | "#ingest_df1.write.format(\"delta\").save(\"/Volumes/workspace/wd36schema/ingestion_volume/deltadata\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "application/vnd.databricks.v1+cell": { 84 | "cellMetadata": {}, 85 | "inputWidgets": {}, 86 | "nuid": "4de366d1-b620-4eb3-8014-943802d1b67a", 87 | "showTitle": false, 88 | "tableResultSettingsMap": {}, 89 | "title": "" 90 | } 91 | }, 92 | "source": [ 93 | "### Writing the data using built-in writers - different file formats & different targets (we can write the data to practically any target in this world...)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "application/vnd.databricks.v1+cell": { 100 | "cellMetadata": {}, 101 | "inputWidgets": {}, 102 | "nuid": "265d5989-ca7e-4f2f-8e6a-93e75c23948d", 103 | "showTitle": false, 104 | "tableResultSettingsMap": {}, 105 | "title": "" 106 | } 107 | }, 108 | "source": [ 109 | "####1. 
Writing in csv (structured data (2D data Table/Frames with rows and columns)) format with few basic options listed below (Schema (structure) Migration)\n", 110 | "custid,fname,lname,age,profession -> custid~fname~lname~prof~age\n", 111 | "- header\n", 112 | "- sep\n", 113 | "- mode" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 0, 119 | "metadata": { 120 | "application/vnd.databricks.v1+cell": { 121 | "cellMetadata": { 122 | "byteLimit": 2048000, 123 | "rowLimit": 10000 124 | }, 125 | "inputWidgets": {}, 126 | "nuid": "c6e0bf03-6816-4d82-b3a6-6b1103ee2dee", 127 | "showTitle": false, 128 | "tableResultSettingsMap": {}, 129 | "title": "" 130 | } 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "#We are performing schema migration from comma to tilde delimiter\n", 135 | "ingest_df1.write.csv(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/csvout\",sep='~',header=True,mode='overwrite')\n", 136 | "#4 modes of writing - append,overwrite,ignore,error" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 0, 142 | "metadata": { 143 | "application/vnd.databricks.v1+cell": { 144 | "cellMetadata": { 145 | "byteLimit": 2048000, 146 | "rowLimit": 10000 147 | }, 148 | "inputWidgets": {}, 149 | "nuid": "2bb81a50-8f23-40fb-a1e2-0b0e97e269fd", 150 | "showTitle": false, 151 | "tableResultSettingsMap": { 152 | "0": { 153 | "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1765767551187}", 154 | "filterBlob": null, 155 | "queryPlanFiltersBlob": null, 156 | "tableResultIndex": 0 157 | } 158 | }, 159 | "title": "" 160 | } 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "#We are performing schema migration by applying some transformations (this is our bread and butter that we learn exclusively further)\n", 165 | "#Transform\n", 166 | "transformed_df=ingest_df1.select(\"custid\",\"fname\",\"lname\",\"profession\",\"age\").withColumnRenamed(\"profession\",\"prof\")#DSL transformation (not for now...)\n", 167 | "#Load\n", 168 | "transformed_df.write.csv(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/csvout\",sep='~',header=True,mode='overwrite',compression='gzip')" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "application/vnd.databricks.v1+cell": { 175 | "cellMetadata": {}, 176 | "inputWidgets": {}, 177 | "nuid": "a29a4365-30ff-49c1-af16-5ddbbaa9b3ca", 178 | "showTitle": false, 179 | "tableResultSettingsMap": {}, 180 | "title": "" 181 | } 182 | }, 183 | "source": [ 184 | "####2. Writing in json format with few basic options listed below\n", 185 | "path
\n", 186 | "mode\n", 187 | "- We did a schema migration and data conversion from csv to json format (ie structued to semi structured format)\n", 188 | "- json - we learn a lot subsequently (nested/hierarchical/complex/multiline...), \n", 189 | "- what is json - fundamentally it is a dictionary of dictionaries\n", 190 | "- json - java script object notation\n", 191 | "- Standard json format (can't be changed) - {\"k1\":\"string value\",\"k2\":numbervalue,\"k3\":v2} where key has to be unique & enclosed in double quotes and value can be anything\n", 192 | "- **when to go with json or benifits** - \n", 193 | "- a. If we have data in a semistructure format (with variable data format with dynamic schema)\n", 194 | "- eg. {\"custid\":4000001,\"profession\":\"Pilot\",\"age\":55,\"city\":\"NY\"}\n", 195 | "- {\"custid\":4000001,\"fname\":\"Kristina\",\"lname\":\"Chung\",\"prof\":\"Pilot\",\"age\":\"55\"}\n", 196 | "- b. columns/column names or the types or the order can be different\n", 197 | "- c. json will be provided by the sources if the data is dynamic in nature (not sure about number or order of columns) or if the data is api response in nature.\n", 198 | "- d. json is a efficient data format (serialized/encoded) for performing data exchange between applications via network & good for parsing also & good for object by object operations (row by row operation in realtime fashion eg. amazon click stream operations)\n", 199 | "- e. json can be used to group or create hierarchy of data in a complex or in a nested format eg. https://randomuser.me/api/" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 0, 205 | "metadata": { 206 | "application/vnd.databricks.v1+cell": { 207 | "cellMetadata": { 208 | "byteLimit": 2048000, 209 | "rowLimit": 10000 210 | }, 211 | "inputWidgets": {}, 212 | "nuid": "19a05420-114a-4e15-a75b-a8bc3c5c20eb", 213 | "showTitle": false, 214 | "tableResultSettingsMap": {}, 215 | "title": "" 216 | } 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "#Data Conversion/Schema Migration from Structured to SemiStructured format..\n", 221 | "ingest_df1.write.json(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/jsonout\",mode='append')\n", 222 | "#Structured -> SemiStruct...\n", 223 | "#custid,fname,lname,age,profession -> {\"custid\":4000001,\"fname\":\"Kristina\",\"lname\":\"Chung\",\"prof\":\"Pilot\",\"age\":55}" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "application/vnd.databricks.v1+cell": { 230 | "cellMetadata": {}, 231 | "inputWidgets": {}, 232 | "nuid": "3d9b6ca7-9aa4-4d18-bb18-0d62829dddd2", 233 | "showTitle": false, 234 | "tableResultSettingsMap": {}, 235 | "title": "" 236 | } 237 | }, 238 | "source": [ 239 | "####3.Serialization (encoding in a more optimized fashion) & Deserialization File formats (Binary/Brainy File formats)\n", 240 | "**Data Mechanics:**\n", 241 | "1. encoding/decoding(machine format) - converting the data from human readable format to machine understandable format for performant data transfer (eg. Network transfer of data will be encoded)\n", 242 | "2. *compression/uncompression(encoding+space+time) - shrinking the data in some format using some libraries (tradeoff between time and size) (eg. Compress before store or transfer) - snappy is a good compression tech used in bigdata platform\n", 243 | "3. encryption (encoding+security) - Addition to encoding, encryption add security hence data is (performant+secured) (using some algos - SHA/MD5/AES/DES/RSA/DSA..)\n", 244 | "4. 
*Serialization (applicable more for bigdata) - Serialization is encoding + performance gains by saving space + a processing-intelligent bigdata format - Fast, Compact, Interoperable, Extensible (additional configs), Scalable (cluster compute operations), Secured (binary format)..\n", 245 | "5. *masking - Encoding of data (in some other format, not meant to be a machine format) which should not be allowed to be decoded (used for security purposes)\n", 246 | "\n", 247 | "What are the (builtin) serialized file formats we are going to learn?\n", 248 | "orc\n", 249 | "parquet\n", 250 | "delta(databricks proprietary)\n", 251 | "\n", 252 | "- We did a schema migration and data conversion from csv/json to serialized data format (ie structured to structured (internally binary/unstructured) format)\n", 253 | "- We learn/use a lot/heavily subsequently\n", 254 | "- what is serialized - fundamentally they are intelligent/encoded/serialized/binary data formats applied with a lot of optimization & space reduction strategies.. (encoded/compressed/intelligent)\n", 255 | "- orc - Optimized Row Columnar format (Columnar formats)\n", 256 | "- parquet - tiled data format (Columnar formats)\n", 257 | "- delta(databricks proprietary) enriched parquet format - Delta (modified/changes) operations can be performed (ACID property (DML))\n", 258 | "- format - serialized/encoded, we can't see it with mere eyes; only when some library is used can the deserialized/decoded data be accessed as structured data\n", 259 | "- **when to go with serialized or benefits** - \n", 260 | "- a. For storage benefits for eg. orc will save 65+% of space for eg. if I store 1gb of data it occupies ~350mb of space, and with compression (snappy) it can be improved more...\n", 261 | "- b. For processing optimization. Orc/parquet/delta will provide the required data alone if you query using Pushdown optimization.\n", 262 | "- c. Interoperability feature - this data format can be understood in multiple environments for eg. bigquery can parse this data.\n", 263 | "- d. 
Secured\n", 264 | "- **In the projects/environments when to use what fileformats - we learn in detail later...\n", 265 | "| Format | Schema Type | Storage Efficiency | Analytics/Transformation Performance | Updates Supported |\n", 266 | "|--------|--------------------------|--------------------|-----------------------|------------------|\n", 267 | "| CSV | Structured | Low | Slow | No |\n", 268 | "| JSON | Semi-structured | Low | Slow | No |\n", 269 | "| ORC | Structured / Striped | High | Fast | Limited |\n", 270 | "| Parquet| Structured / Nested | High | Very Fast | Limited |\n", 271 | "| Delta | Structured / Evolving | High | Very Fast | Highly |\n", 272 | "| XML | Semi-structured | Low | Slow | No |" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 0, 278 | "metadata": { 279 | "application/vnd.databricks.v1+cell": { 280 | "cellMetadata": { 281 | "byteLimit": 2048000, 282 | "rowLimit": 10000 283 | }, 284 | "inputWidgets": {}, 285 | "nuid": "22b04626-5ab2-4dba-b60e-058374748690", 286 | "showTitle": false, 287 | "tableResultSettingsMap": {}, 288 | "title": "" 289 | } 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "ingest_df1.write.orc(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/orcout\",mode='overwrite',compression='zlib')#by default orc/parquet uses snappy compression\n", 294 | "spark.read.orc(\"/Volumes/workspace/wd36schema/ingestion_volume/target/orcout\").show(2)#uncompression + deserialization" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 0, 300 | "metadata": { 301 | "application/vnd.databricks.v1+cell": { 302 | "cellMetadata": { 303 | "byteLimit": 2048000, 304 | "rowLimit": 10000 305 | }, 306 | "inputWidgets": {}, 307 | "nuid": "e128ce1d-d358-479e-9590-42f0da8a4cb9", 308 | "showTitle": false, 309 | "tableResultSettingsMap": {}, 310 | "title": "" 311 | } 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "#Orc/Parquet follows WORM feature (Write Once Read Many)\n", 316 | "ingest_df1.write.mode(\"overwrite\").option(\"compression\",\"gzip\").option(\"compression\",\"snappy\").parquet(path=\"/Volumes/workspace/wd36schema/ingestion_volume/target/parquetout\")#by default orc/parquet uses snappy compression\n", 317 | "spark.read.parquet(\"/Volumes/workspace/wd36schema/ingestion_volume/target/parquetout\").show(2)#uncompression + deserialization" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 0, 323 | "metadata": { 324 | "application/vnd.databricks.v1+cell": { 325 | "cellMetadata": { 326 | "byteLimit": 2048000, 327 | "rowLimit": 10000 328 | }, 329 | "inputWidgets": {}, 330 | "nuid": "f52bbbf8-7dbb-4639-9a1e-572114fe9838", 331 | "showTitle": false, 332 | "tableResultSettingsMap": {}, 333 | "title": "" 334 | } 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "#Delta follows WMRM feature (Write Many Read Many) - We did Delta Lake creation (Datalake + Delta file format)\n", 339 | "ingest_df1.write.format(\"delta\").save(\"/Volumes/workspace/wd36schema/ingestion_volume/target/deltaout\",mode='overwrite')\n", 340 | "spark.read.format(\"delta\").load(\"/Volumes/workspace/wd36schema/ingestion_volume/target/deltaout\").show(2)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": { 346 | "application/vnd.databricks.v1+cell": { 347 | "cellMetadata": {}, 348 | "inputWidgets": {}, 349 | "nuid": "5bb4ab26-481f-4b00-a5cb-675b105863d2", 350 | "showTitle": false, 351 | "tableResultSettingsMap": {}, 352 | "title": "" 353 | } 354 | }, 355 | "source": [ 356 | 
"####4.Table Load Operations - Building LAKEHOUSE ON TOP OF DATALAKE\n", 357 | "Can we do SQL operations directly on the tables like a database or datawarehouse? or Can we build a Lakehouse in Databricks?\n", 358 | "- We learn/use a lot/heavily subsequently, \n", 359 | "- what is Lakehouse - A SQL/Datawarehouse/Query layer on top of the Datalake is called Lakehouse\n", 360 | "- We have different lakehouses which we are going to learn further - \n", 361 | "1. delta tables (lakehouse) in databricks\n", 362 | "2. hive in onprem\n", 363 | "3. bigquery in GCP\n", 364 | "4. synapse in azure\n", 365 | "5. athena in aws\n", 366 | "- **when to go with lakehouse** - \n", 367 | "- a. Transformation\n", 368 | "- b. Analysis/Analytics\n", 369 | "- c. AI/BI\n", 370 | "- d. Literally we are going to learn SQL & Advanced SQL" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 0, 376 | "metadata": { 377 | "application/vnd.databricks.v1+cell": { 378 | "cellMetadata": { 379 | "byteLimit": 2048000, 380 | "rowLimit": 10000 381 | }, 382 | "inputWidgets": {}, 383 | "nuid": "25875988-b00e-4823-95cb-1ff0b6362d44", 384 | "showTitle": false, 385 | "tableResultSettingsMap": {}, 386 | "title": "" 387 | } 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "#We are building delta tables in databricks (we are building hive tables in onprem/we are building bq tables in gcp...)\n", 392 | "#saveastable (named notation/named arguments)\n", 393 | "#Table\n", 394 | "#cid,prof,age,fname,lname\n", 395 | "#mapping\n", 396 | "#cid,prof,age,fname,lname\n", 397 | "ingest_df1.write.saveAsTable(\"workspace.wd36schema.lh_custtbl\",mode='overwrite')\n", 398 | "#display(spark.sql(\"show create table workspace.wd36schema.lh_custtbl\"))" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 0, 404 | "metadata": { 405 | "application/vnd.databricks.v1+cell": { 406 | "cellMetadata": { 407 | "byteLimit": 2048000, 408 | "rowLimit": 10000 409 | }, 410 | "inputWidgets": {}, 411 | "nuid": "fcfa70eb-dec9-409d-a5ab-0aa7c01eb5fb", 412 | "showTitle": false, 413 | "tableResultSettingsMap": {}, 414 | "title": "" 415 | } 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "#1. insertinto function can be used as like saveAstable with few differences\n", 420 | "#a. it works only if the target table exist\n", 421 | "#b. it works by creating insert statements in the behind(not bulk load), hence it is slow, hence we have use for small dataset (safely only if table exists)\n", 422 | "#c. 
it will load the data from the dataframe by using position, not by using name..\n", 423 | "#insertInto (positional notation/positional arguments)\n", 424 | "#Table\n", 425 | "#cid,prof,age,fname,lname\n", 426 | "#mapping.\n", 427 | "#cid,fname,lname,age,prof\n", 428 | "ingest_df1.write.insertInto(\"workspace.wd36schema.lh_custtbl\",overwrite=True)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 0, 434 | "metadata": { 435 | "application/vnd.databricks.v1+cell": { 436 | "cellMetadata": {}, 437 | "inputWidgets": {}, 438 | "nuid": "06834a14-c6ec-420d-830d-88b88aa5520c", 439 | "showTitle": false, 440 | "tableResultSettingsMap": {}, 441 | "title": "" 442 | } 443 | }, 444 | "outputs": [], 445 | "source": [ 446 | "ingest_df1.write.format(\"delta\").save(\"location\")" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 0, 452 | "metadata": { 453 | "application/vnd.databricks.v1+cell": { 454 | "cellMetadata": { 455 | "byteLimit": 2048000, 456 | "rowLimit": 10000 457 | }, 458 | "inputWidgets": {}, 459 | "nuid": "1f3a1c4f-aabb-4db1-96d7-f3c2836b61e3", 460 | "showTitle": false, 461 | "tableResultSettingsMap": {}, 462 | "title": "" 463 | } 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "#I am using spark engine to pull the data from the lakehouse table backed by dbfs (s3) (datalake) where data in delta format(deltalake) \n", 468 | "display(spark.sql(\"select * from workspace.wd36schema.lh_custtbl\"))#sparkengine+lakehouse+datalake(deltalake)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": { 474 | "application/vnd.databricks.v1+cell": { 475 | "cellMetadata": {}, 476 | "inputWidgets": {}, 477 | "nuid": "7586d9b9-5766-44e9-9c4a-51c1805f316c", 478 | "showTitle": false, 479 | "tableResultSettingsMap": {}, 480 | "title": "" 481 | } 482 | }, 483 | "source": [ 484 | "####5. XML Format - Semi structured data format (most of the json features can be applied in xml also, but in DE world not so famous like json)\n", 485 | "- Used rarely on demand (by certain target/source systems eg. mainframes)\n", 486 | "- Can be related with json, but not so much efficient like json\n", 487 | "- Databricks provides xml as a inbuild function" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 0, 493 | "metadata": { 494 | "application/vnd.databricks.v1+cell": { 495 | "cellMetadata": { 496 | "byteLimit": 2048000, 497 | "rowLimit": 10000 498 | }, 499 | "inputWidgets": {}, 500 | "nuid": "637512e0-627e-40d7-8fec-245916796b5b", 501 | "showTitle": false, 502 | "tableResultSettingsMap": {}, 503 | "title": "" 504 | } 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "ingest_df1.write.xml(\"/Volumes/workspace/wd36schema/ingestion_volume/target/xmlout\",mode=\"ignore\",rowTag=\"cust\")" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": { 514 | "application/vnd.databricks.v1+cell": { 515 | "cellMetadata": {}, 516 | "inputWidgets": {}, 517 | "nuid": "7abb6500-0b7e-4339-b814-3e356c78d7ce", 518 | "showTitle": false, 519 | "tableResultSettingsMap": {}, 520 | "title": "" 521 | } 522 | }, 523 | "source": [ 524 | "### Modes in Writing\n", 525 | "1. **Append** - Adds the new data to the existing data. It does not overwrite anything.\n", 526 | "2. **Overwrite** - Replaces the existing data entirely at the destination.\n", 527 | "3. **ErrorIfexist**(default) - Throws an error if data already exists at the destination.\n", 528 | "4. 
**Ignore** - Skips the write operation if data already exists at the destination." 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": { 534 | "application/vnd.databricks.v1+cell": { 535 | "cellMetadata": {}, 536 | "inputWidgets": {}, 537 | "nuid": "c8463feb-5c73-4fea-b9ca-9d09d92e1a95", 538 | "showTitle": false, 539 | "tableResultSettingsMap": {}, 540 | "title": "" 541 | } 542 | }, 543 | "source": [ 544 | "####What are all the overall functions/options we used in this notebook, for learning fundamental spark dataframe WRITE operations in different formats and targets?\n", 545 | "1. We learned dozen of functions (out of 18 functions) in the write module with minimum options...\n", 546 | "2. Functions we learned are (Datalake functions - csv/json/xml/orc/parquet+delta), (Lakehouse functions - saveAsTable/insertInto), (additional options - format/save/option/options/mode).\n", 547 | "3. We have few more performance optimization/advanced options available (jdbc (we learn this soon in the name of foreign catalog), partitionBy,ClusterBy,BucketBy,SortBy,text)\n", 548 | "4. Few of the important read options under csv such as header, sep, mode(append/overwrite/error/ignore), toDF.\n", 549 | "5. Few additional options such as compression, different file formats..." 550 | ] 551 | } 552 | ], 553 | "metadata": { 554 | "application/vnd.databricks.v1+notebook": { 555 | "computePreferences": null, 556 | "dashboards": [], 557 | "environmentMetadata": { 558 | "base_environment": "", 559 | "environment_version": "3" 560 | }, 561 | "inputWidgetPreferences": null, 562 | "language": "python", 563 | "notebookMetadata": { 564 | "mostRecentlyExecutedCommandWithImplicitDF": { 565 | "commandId": 1137934375475219, 566 | "dataframes": [ 567 | "_sqldf" 568 | ] 569 | }, 570 | "pythonIndentUnit": 4 571 | }, 572 | "notebookName": "3-Basic-WriteOps", 573 | "widgets": {} 574 | }, 575 | "language_info": { 576 | "name": "python" 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 0 581 | } 582 | --------------------------------------------------------------------------------
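Appended note (not part of the exported files above): a minimal PySpark sketch, under assumed placeholder paths and table names, of the DataFrameWriter patterns the 3-Basic-WriteOps notebook and the read/write usecases revolve around - per-format writers, the four save modes, and saveAsTable vs insertInto. The delta format and the three-level table name assume a Databricks/Delta Lake + Unity Catalog style environment like the one these notebooks target.

# Illustrative sketch only - all paths and the catalog.schema.table name are placeholders,
# not the actual volumes or tables used in the notebooks above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("write-ops-sketch").getOrCreate()

# Extract: read a header-bearing csv into a dataframe (placeholder source path).
df = spark.read.csv("/tmp/source/custs_header", header=True, inferSchema=True)

# Datalake writes: one writer call per format; mode is append/overwrite/error/ignore.
df.write.csv("/tmp/target/csvout", sep="~", header=True, mode="overwrite")
df.write.json("/tmp/target/jsonout", mode="append")
df.write.orc("/tmp/target/orcout", mode="overwrite", compression="zlib")
df.write.parquet("/tmp/target/parquetout", mode="overwrite")  # snappy compression by default
df.write.format("delta").mode("overwrite").save("/tmp/target/deltaout")  # requires Delta Lake

# Lakehouse writes: saveAsTable maps columns by name and can create the table;
# insertInto requires an existing table and maps columns strictly by position.
df.write.mode("overwrite").saveAsTable("demo_catalog.demo_schema.cust_tbl")
df.select("custid", "fname", "lname", "age", "profession") \
  .write.insertInto("demo_catalog.demo_schema.cust_tbl", overwrite=True)

The name-vs-position distinction is why the notebooks reorder columns before insertInto: a positional mismatch silently loads values into the wrong columns rather than failing.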