├── .vscode └── settings.json ├── Fabric ├── Readme.md ├── LibraryManager │ ├── SomeLakehouse.Lakehouse │ │ ├── lakehouse.metadata.json │ │ ├── shortcuts.metadata.json │ │ └── .platform │ ├── MyCustomEnvironment.Environment │ │ ├── Libraries │ │ │ └── PublicLibraries │ │ │ │ └── environment.yml │ │ ├── Setting │ │ │ └── Sparkcompute.yml │ │ └── .platform │ ├── MyLibrary.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── LibraryManager.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── MyOtherLibrary.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── notebook with run.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── MyFabricSparkLibrary.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── load_LibraryManager.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── regular notebook.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── notebook with environment.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ └── README.md └── get latest row.Notebook │ ├── .platform │ └── notebook-content.py ├── README.md ├── LICENSE ├── .gitignore └── DataEngineering └── Library └── VisualizeExecutionPlan.py /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } -------------------------------------------------------------------------------- /Fabric/Readme.md: -------------------------------------------------------------------------------- 1 | This is an auto-created file for /Fabric -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fabric.Toolbox 2 | Tools for Microsoft Fabric 3 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/SomeLakehouse.Lakehouse/lakehouse.metadata.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /Fabric/LibraryManager/SomeLakehouse.Lakehouse/shortcuts.metadata.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyCustomEnvironment.Environment/Libraries/PublicLibraries/environment.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - pip: 3 | - pydantic==2.11.5 4 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyCustomEnvironment.Environment/Setting/Sparkcompute.yml: -------------------------------------------------------------------------------- 1 | enable_native_execution_engine: false 2 | driver_cores: 8 3 | driver_memory: 56g 4 | executor_cores: 8 5 | executor_memory: 56g 6 | dynamic_executor_allocation: 7 | enabled: true 8 | min_executors: 1 9 | max_executors: 9 10 | runtime_version: 1.3 11 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/SomeLakehouse.Lakehouse/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Lakehouse", 5 | "displayName": "SomeLakehouse" 6 | }, 7 | "config": { 8 | "version": "2.0", 9 | "logicalId": 
"989099a7-86ab-8bc6-4f32-02932925655f" 10 | } 11 | } -------------------------------------------------------------------------------- /Fabric/get latest row.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "get latest row", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "04768130-9f28-ac6e-4a68-a328a262441e" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyLibrary.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "MyLibrary", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "42613894-966c-8038-408a-f3c6e0b056da" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/LibraryManager.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "LibraryManager", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "fd937145-f03f-9309-4f99-df9effe06382" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyOtherLibrary.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "MyOtherLibrary", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "cdbe2b88-86d0-bdd6-4537-99f8a8190032" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/notebook with run.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "notebook with run", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "da713db2-3ea6-b2e6-40e8-43713a2c6432" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyFabricSparkLibrary.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "MyFabricSparkLibrary", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "230a4da8-f37c-8793-4f16-003b80706781" 11 | } 12 | } 
-------------------------------------------------------------------------------- /Fabric/LibraryManager/load_LibraryManager.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "load_LibraryManager", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "4e245d8f-ac1f-8d12-4d59-350bd396f44e" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyCustomEnvironment.Environment/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Environment", 5 | "displayName": "MyCustomEnvironment", 6 | "description": "Environment" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "70983ad4-37e2-8559-43f2-fdb5ea9bf3be" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/regular notebook.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "notebook with LibraryManager", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "31aba13c-8c3d-bd15-4062-41b95cf2e5da" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/notebook with environment.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "notebook with environment", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "13b53598-fcc5-8acc-4505-8077d78d57f6" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/README.md: -------------------------------------------------------------------------------- 1 | # LibraryManager for Microsoft Fabric Data Engineering 2 | 3 | This notebook allows you to specify a set of notebooks from the current workspace to be bundled into a library. This library is stored in the `/Files` section of a lakehouse. Additionally a new notebook is created in the workspace called `load_LibraryManager` which you can call from main notebook to load the LibraryManager and all its libraries into the current notebook session -------------------------------------------------------------------------------- /Fabric/LibraryManager/load_LibraryManager.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | 4 | # CELL ******************** 5 | 6 | library_path = "abfss://ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e@onelake.dfs.fabric.microsoft.com/2925655f-0293-4f32-8bc6-86ab989099a7/Files/Libraries/LibraryManager.zip" 7 | print(f"Loading LibraryManager from '{library_path}' ... 
", end = "") 8 | sc.addPyFile(library_path) 9 | print("Done!") 10 | 11 | from LibraryManager import * 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Gerhard Brueckl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/notebook with environment.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "2925655f-0293-4f32-8bc6-86ab989099a7", 12 | # META "default_lakehouse_name": "SomeLakehouse", 13 | # META "default_lakehouse_workspace_id": "ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e", 14 | # META "known_lakehouses": [ 15 | # META { 16 | # META "id": "2925655f-0293-4f32-8bc6-86ab989099a7" 17 | # META } 18 | # META ] 19 | # META }, 20 | # META "environment": { 21 | # META "environmentId": "70983ad4-37e2-8559-43f2-fdb5ea9bf3be", 22 | # META "workspaceId": "00000000-0000-0000-0000-000000000000" 23 | # META } 24 | # META } 25 | # META } 26 | 27 | # CELL ******************** 28 | 29 | print("This notebook uses a custom Environment") 30 | 31 | # METADATA ******************** 32 | 33 | # META { 34 | # META "language": "python", 35 | # META "language_group": "synapse_pyspark" 36 | # META } 37 | 38 | # CELL ******************** 39 | 40 | # MAGIC %%sql 41 | # MAGIC SELECT 'This notebook uses a custom Environment' 42 | 43 | # METADATA ******************** 44 | 45 | # META { 46 | # META "language": "sparksql", 47 | # META "language_group": "synapse_pyspark" 48 | # META } 49 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyLibrary.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": {} 10 | # META } 11 | 12 | # CELL ******************** 13 | 14 | # variable "spark" is not set when calling 
sc.addPyFile() 15 | _is_sc_addPyFile = not "spark" in locals() 16 | 17 | # METADATA ******************** 18 | 19 | # META { 20 | # META "language": "python", 21 | # META "language_group": "synapse_pyspark" 22 | # META } 23 | 24 | # CELL ******************** 25 | 26 | from pyspark.sql import DataFrame 27 | from pyspark import SparkContext 28 | from pyspark.sql import DataFrame, SparkSession 29 | 30 | # METADATA ******************** 31 | 32 | # META { 33 | # META "language": "python", 34 | # META "language_group": "synapse_pyspark" 35 | # META } 36 | 37 | # CELL ******************** 38 | 39 | PI = 3.14 40 | 41 | # METADATA ******************** 42 | 43 | # META { 44 | # META "language": "python", 45 | # META "language_group": "synapse_pyspark" 46 | # META } 47 | 48 | # CELL ******************** 49 | 50 | def get_area(radius): 51 | area = float(PI)*radius*radius 52 | return area 53 | 54 | # some code to run only when interactively developing the library 55 | if not _is_sc_addPyFile: 56 | print(get_area(4)) 57 | 58 | # METADATA ******************** 59 | 60 | # META { 61 | # META "language": "python", 62 | # META "language_group": "synapse_pyspark" 63 | # META } 64 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/notebook with run.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "2925655f-0293-4f32-8bc6-86ab989099a7", 12 | # META "default_lakehouse_name": "SomeLakehouse", 13 | # META "default_lakehouse_workspace_id": "ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e", 14 | # META "known_lakehouses": [ 15 | # META { 16 | # META "id": "2925655f-0293-4f32-8bc6-86ab989099a7" 17 | # META } 18 | # META ] 19 | # META }, 20 | # META "environment": {} 21 | # META } 22 | # META } 23 | 24 | # MARKDOWN ******************** 25 | 26 | # # Using `%run` to import custom code from other notebooks 27 | 28 | # CELL ******************** 29 | 30 | %run MyLibrary 31 | 32 | # METADATA ******************** 33 | 34 | # META { 35 | # META "language": "python", 36 | # META "language_group": "synapse_pyspark" 37 | # META } 38 | 39 | # CELL ******************** 40 | 41 | %run MyOtherLibrary 42 | 43 | # METADATA ******************** 44 | 45 | # META { 46 | # META "language": "python", 47 | # META "language_group": "synapse_pyspark" 48 | # META } 49 | 50 | # CELL ******************** 51 | 52 | # my regular code using imported library functions ... 
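# for example (illustration only; get_area() comes from MyLibrary and
# get_circumference()/log() come from MyOtherLibrary, both %run above):
log(f"Area: {get_area(4)}, circumference: {get_circumference(4)}")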
53 | 54 | # METADATA ******************** 55 | 56 | # META { 57 | # META "language": "python", 58 | # META "language_group": "synapse_pyspark" 59 | # META } 60 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyOtherLibrary.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": {} 10 | # META } 11 | 12 | # CELL ******************** 13 | 14 | # variable "spark" is not set when calling sc.addPyFile() 15 | _is_sc_addPyFile = not "spark" in locals() 16 | 17 | # METADATA ******************** 18 | 19 | # META { 20 | # META "language": "python", 21 | # META "language_group": "synapse_pyspark" 22 | # META } 23 | 24 | # CELL ******************** 25 | 26 | import datetime as dt 27 | 28 | # METADATA ******************** 29 | 30 | # META { 31 | # META "language": "python", 32 | # META "language_group": "synapse_pyspark" 33 | # META } 34 | 35 | # CELL ******************** 36 | 37 | # when called via sc.addPyFile can import other libraries as well using the syntax below 38 | if _is_sc_addPyFile: 39 | from .MyLibrary import PI 40 | else: 41 | # for debugging we need to define the variables here 42 | PI = 123 43 | 44 | # METADATA ******************** 45 | 46 | # META { 47 | # META "language": "python", 48 | # META "language_group": "synapse_pyspark" 49 | # META } 50 | 51 | # CELL ******************** 52 | 53 | def get_circumference(radius): 54 | return 2*radius*PI 55 | 56 | if not _is_sc_addPyFile: 57 | print(get_circumference(1)) 58 | 59 | # METADATA ******************** 60 | 61 | # META { 62 | # META "language": "python", 63 | # META "language_group": "synapse_pyspark" 64 | # META } 65 | 66 | # CELL ******************** 67 | 68 | def log(text: str, end: str = None): 69 | print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\t" + str(text), end = end) 70 | 71 | # some code to run only when interactively developing the library 72 | if not _is_sc_addPyFile: 73 | log("This is a test log!") 74 | 75 | # METADATA ******************** 76 | 77 | # META { 78 | # META "language": "python", 79 | # META "language_group": "synapse_pyspark" 80 | # META } 81 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/regular notebook.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "2925655f-0293-4f32-8bc6-86ab989099a7", 12 | # META "default_lakehouse_name": "SomeLakehouse", 13 | # META "default_lakehouse_workspace_id": "ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e", 14 | # META "known_lakehouses": [ 15 | # META { 16 | # META "id": "2925655f-0293-4f32-8bc6-86ab989099a7" 17 | # META } 18 | # META ] 19 | # META } 20 | # META } 21 | # META } 22 | 23 | # MARKDOWN ******************** 24 | 25 | # # Using LibraryManager to import custom code 26 | 27 | # CELL ******************** 28 | 29 | %run load_LibraryManager 30 | 31 | # METADATA ******************** 32 | 33 | # META { 34 | # META "language": "python", 35 | # META "language_group": "synapse_pyspark" 36 | # META } 37 | 38 
| # MARKDOWN ******************** 39 | 40 | # # Use log() function and PI constant from library 41 | 42 | # CELL ******************** 43 | 44 | log(PI) 45 | 46 | # METADATA ******************** 47 | 48 | # META { 49 | # META "language": "python", 50 | # META "language_group": "synapse_pyspark" 51 | # META } 52 | 53 | # MARKDOWN ******************** 54 | 55 | # # Use library function that uses `notebookutils` under the hood 56 | 57 | # CELL ******************** 58 | 59 | display(ls("/")) 60 | 61 | # METADATA ******************** 62 | 63 | # META { 64 | # META "language": "python", 65 | # META "language_group": "synapse_pyspark" 66 | # META } 67 | 68 | # MARKDOWN ******************** 69 | 70 | # # Use library function that uses Spark under the hood 71 | 72 | # CELL ******************** 73 | 74 | display(table_to_df("myTable")) 75 | 76 | # METADATA ******************** 77 | 78 | # META { 79 | # META "language": "python", 80 | # META "language_group": "synapse_pyspark" 81 | # META } 82 | 83 | # MARKDOWN ******************** 84 | 85 | # # Use nested library function 86 | 87 | # CELL ******************** 88 | 89 | print(get_circumference(8)) 90 | 91 | # METADATA ******************** 92 | 93 | # META { 94 | # META "language": "python", 95 | # META "language_group": "synapse_pyspark" 96 | # META } 97 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyFabricSparkLibrary.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": {} 10 | # META } 11 | 12 | # CELL ******************** 13 | 14 | # variable "spark" is not set when calling sc.addPyFile() 15 | _is_sc_addPyFile = not "spark" in locals() 16 | 17 | # METADATA ******************** 18 | 19 | # META { 20 | # META "language": "python", 21 | # META "language_group": "synapse_pyspark" 22 | # META } 23 | 24 | # CELL ******************** 25 | 26 | from pyspark.sql import DataFrame 27 | from pyspark import SparkContext 28 | from pyspark.sql import DataFrame, SparkSession 29 | 30 | 31 | 32 | # METADATA ******************** 33 | 34 | # META { 35 | # META "language": "python", 36 | # META "language_group": "synapse_pyspark" 37 | # META } 38 | 39 | # MARKDOWN ******************** 40 | 41 | # # Using notebookutils 42 | 43 | # CELL ******************** 44 | 45 | import notebookutils 46 | 47 | # METADATA ******************** 48 | 49 | # META { 50 | # META "language": "python", 51 | # META "language_group": "synapse_pyspark" 52 | # META } 53 | 54 | # CELL ******************** 55 | 56 | def ls(path: str) -> DataFrame: 57 | print(f"Listing {path} ...") 58 | return notebookutils.fs.ls(path) 59 | 60 | # a simple test which is only run when developing the library but not when the library is added via sc.addPyFile() 61 | if not _is_sc_addPyFile: 62 | display(ls("/")) 63 | 64 | # METADATA ******************** 65 | 66 | # META { 67 | # META "language": "python", 68 | # META "language_group": "synapse_pyspark" 69 | # META } 70 | 71 | # MARKDOWN ******************** 72 | 73 | # # Using Spark(`spark`) and SparkContext(`sc`) 74 | 75 | # CELL ******************** 76 | 77 | if _is_sc_addPyFile: 78 | # as our library makes calls to spark, we need to create local instances of SparkContext "sc" and Sparksession "spark" 79 | if not "sc" in locals(): 80 | #print("Creating local 
SparkContext variable 'sc' ... ", end = "") 81 | sc = SparkContext.getOrCreate() 82 | #print("Done!") 83 | 84 | if not "spark" in locals(): 85 | #print("Creating local SparkSession variable 'spark' ... ", end = "") 86 | spark = (SparkSession(sc) 87 | .builder 88 | .getOrCreate()) 89 | #print("Done!") 90 | 91 | # METADATA ******************** 92 | 93 | # META { 94 | # META "language": "python", 95 | # META "language_group": "synapse_pyspark" 96 | # META } 97 | 98 | # CELL ******************** 99 | 100 | def table_to_df(table_name: str) -> DataFrame: 101 | df = spark.sql(f"SELECT * FROM {table_name}") 102 | 103 | return df 104 | 105 | # METADATA ******************** 106 | 107 | # META { 108 | # META "language": "python", 109 | # META "language_group": "synapse_pyspark" 110 | # META } 111 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /Fabric/get latest row.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "8e6faaa0-29be-48c1-b0f3-1ffe17396203", 12 | # META "default_lakehouse_name": "TPCH", 13 | # META "default_lakehouse_workspace_id": "e2278f6a-27f2-4261-b759-052b593650b0" 14 | # META } 15 | # META } 16 | # META } 17 | 18 | # CELL ******************** 19 | 20 | # MAGIC %%configure -f 21 | # MAGIC { 22 | # MAGIC // You can get a list of valid parameters to config the session from https://github.com/cloudera/livy#request-body. 
23 | # MAGIC "driverMemory": "56g", // Recommended values: ["28g", "56g", "112g", "224g", "400g"] 24 | # MAGIC "driverCores": 8, // Recommended values: [4, 8, 16, 32, 64] 25 | # MAGIC "executorMemory": "56g", 26 | # MAGIC "executorCores": 8, 27 | # MAGIC "numExecutors": 4, 28 | # MAGIC "useStarterPool": false // Set to true to force using starter pool 29 | # MAGIC } 30 | 31 | # METADATA ******************** 32 | 33 | # META { 34 | # META "language": "python", 35 | # META "language_group": "synapse_pyspark" 36 | # META } 37 | 38 | # CELL ******************** 39 | 40 | import time 41 | import datetime as dt 42 | import sempy.fabric as fabric 43 | 44 | import pyspark.sql.functions as F 45 | import pyspark.sql.types as T 46 | from pyspark.sql import Window, Row 47 | 48 | sc.addPyFile("https://raw.githubusercontent.com/gbrueckl/Fabric.Toolbox/main/DataEngineering/Library/VisualizeExecutionPlan.py") 49 | from VisualizeExecutionPlan import show_plan 50 | 51 | # METADATA ******************** 52 | 53 | # META { 54 | # META "language": "python", 55 | # META "language_group": "synapse_pyspark" 56 | # META } 57 | 58 | # CELL ******************** 59 | 60 | scale_factor = 10 # 1, 10 or 100 61 | 62 | df = spark.sql(f"SELECT * FROM TPCH.sf{scale_factor}_lineitem") 63 | display(df.limit(100)) 64 | 65 | group_by_cols = ["OrderId"] 66 | sorting_cols = ["ExtendedPrice"] 67 | 68 | # METADATA ******************** 69 | 70 | # META { 71 | # META "language": "python", 72 | # META "language_group": "synapse_pyspark" 73 | # META } 74 | 75 | # CELL ******************** 76 | 77 | # for validation of the code 78 | if False: 79 | rdd = sc.parallelize([ 80 | [1, date(2024, 1, 7), 13.90], 81 | [1, date(2024, 1, 16), 14.50], 82 | [2, date(2024, 1, 9), 10.50], 83 | [2, date(2024, 1, 28), 9.90], 84 | [3, date(2024, 1, 5), 1.50] 85 | ]) 86 | 87 | df = rdd.toDF(['product_key', 'date', 'price']) 88 | 89 | display(df) 90 | 91 | group_by_cols = ['product_key'] 92 | sorting_cols = ['date'] 93 | 94 | # METADATA ******************** 95 | 96 | # META { 97 | # META "language": "python", 98 | # META "language_group": "synapse_pyspark" 99 | # META } 100 | 101 | # CELL ******************** 102 | 103 | def log(text: str, end: str = None): 104 | print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\t" + str(text), end = end) 105 | 106 | # METADATA ******************** 107 | 108 | # META { 109 | # META "language": "python", 110 | # META "language_group": "synapse_pyspark" 111 | # META } 112 | 113 | # CELL ******************** 114 | 115 | def evaluate_result(df, iterations = 3): 116 | durations = [] 117 | for it in range(1, iterations+1): 118 | log(f"Simulating write operation - iteration {it} ... ") 119 | start = time.time() 120 | df.write.format("noop").mode("overwrite").save() 121 | end = time.time() 122 | log(f"Done - Duration: {end - start:5.2f} seconds") 123 | durations.append(end - start) 124 | 125 | log(f"Total Duration: {sum(durations)/len(durations)}") 126 | log(f"Avg. 
Duration: {sum(durations)}") 127 | log(f"Run Durations: {durations}") 128 | 129 | # read the df once 130 | evaluate_result(df) 131 | log(f"Rowcount: {df.count()}") 132 | 133 | # METADATA ******************** 134 | 135 | # META { 136 | # META "language": "python", 137 | # META "language_group": "synapse_pyspark" 138 | # META } 139 | 140 | # MARKDOWN ******************** 141 | 142 | # # Get the most recent price per product 143 | 144 | # MARKDOWN ******************** 145 | 146 | # ## using window function 147 | 148 | # CELL ******************** 149 | 150 | # define window, used DESC sort order 151 | w = Window.partitionBy(group_by_cols).orderBy([F.col(x).desc() for x in sorting_cols]) 152 | 153 | #filter DataFrame to only show first row for each group 154 | df_latest_window = ( 155 | df.withColumn('__row_num__', F.row_number().over(w)) 156 | .filter(F.col('__row_num__') == 1) 157 | .drop('__row_num__') 158 | ) 159 | 160 | evaluate_result(df_latest_window) 161 | 162 | # SF1: 163 | # SF10: 21.9 s ± 381 ms per loop (mean ± std. dev. of 3 runs, 1 loop each) 164 | # SF100: 1min 10s ± 18.4 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 165 | 166 | # METADATA ******************** 167 | 168 | # META { 169 | # META "language": "python", 170 | # META "language_group": "synapse_pyspark" 171 | # META } 172 | 173 | # CELL ******************** 174 | 175 | show_plan(df_latest_window) 176 | 177 | # METADATA ******************** 178 | 179 | # META { 180 | # META "language": "python", 181 | # META "language_group": "synapse_pyspark" 182 | # META } 183 | 184 | # MARKDOWN ******************** 185 | 186 | # ## using self join 187 | 188 | # CELL ******************** 189 | 190 | df_latest_dates_per_group = ( 191 | df.groupBy(group_by_cols) 192 | .agg(*[F.max(x).alias(x) for x in sorting_cols]) 193 | ) 194 | #display(df_latest_dates_per_group) 195 | 196 | df_latest_join = ( 197 | df.alias("base") 198 | .join(df_latest_dates_per_group, group_by_cols + sorting_cols, "inner") 199 | .select("base.*") 200 | ) 201 | 202 | evaluate_result(df_latest_join) 203 | 204 | # SF1: 205 | # SF10: 23.9 s ± 1.35 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 206 | # SF100: 1min 21s ± 4.76 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 207 | 208 | # METADATA ******************** 209 | 210 | # META { 211 | # META "language": "python", 212 | # META "language_group": "synapse_pyspark" 213 | # META } 214 | 215 | # CELL ******************** 216 | 217 | show_plan(df_latest_join) 218 | 219 | # METADATA ******************** 220 | 221 | # META { 222 | # META "language": "python", 223 | # META "language_group": "synapse_pyspark" 224 | # META } 225 | 226 | # MARKDOWN ******************** 227 | 228 | # ## using max and struct 229 | 230 | # CELL ******************** 231 | 232 | df_latest_max_struct = ( 233 | df.groupBy(group_by_cols) 234 | .agg(F.max(F.struct(*sorting_cols + [x for x in df.columns if x not in sorting_cols])).alias("latest")) 235 | .select("latest.*") 236 | .select(df.columns) # keep original column order 237 | ) 238 | 239 | evaluate_result(df_latest_max_struct) 240 | 241 | # SF1: 242 | # SF10: 21.6 s ± 1.5 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 243 | # SF100: 58.3 s ± 511 ms per loop (mean ± std. dev. 
of 3 runs, 1 loop each) 244 | 245 | # METADATA ******************** 246 | 247 | # META { 248 | # META "language": "python", 249 | # META "language_group": "synapse_pyspark" 250 | # META } 251 | 252 | # CELL ******************** 253 | 254 | show_plan(df_latest_max_struct) 255 | 256 | # METADATA ******************** 257 | 258 | # META { 259 | # META "language": "python", 260 | # META "language_group": "synapse_pyspark" 261 | # META } 262 | 263 | # MARKDOWN ******************** 264 | 265 | # ## using max_by 266 | 267 | # CELL ******************** 268 | 269 | df_latest_max_by = ( 270 | df.groupBy(group_by_cols) 271 | .agg(F.max_by(F.struct("*"), F.struct(*sorting_cols)).alias("latest")) 272 | .select("latest.*") 273 | ) 274 | 275 | evaluate_result(df_latest_max_by) 276 | 277 | # SF1: 278 | # SF10: 24.4 s ± 5.76 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 279 | # SF100: 53.5 s ± 1.24 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 280 | 281 | # METADATA ******************** 282 | 283 | # META { 284 | # META "language": "python", 285 | # META "language_group": "synapse_pyspark" 286 | # META } 287 | 288 | # CELL ******************** 289 | 290 | show_plan(df_latest_max_by) 291 | 292 | # METADATA ******************** 293 | 294 | # META { 295 | # META "language": "python", 296 | # META "language_group": "synapse_pyspark" 297 | # META } 298 | 299 | # MARKDOWN ******************** 300 | 301 | # ## using max_by - no struct in sort 302 | 303 | # CELL ******************** 304 | 305 | # MAGIC %%timeit -n 1 -r 3 306 | # MAGIC df_latest_max_by_no_struct = df.groupBy(group_by_cols).agg(F.max_by(F.struct("*"), sorting_cols[0]).alias("latest")).select("latest.*") 307 | # MAGIC #display(df_latest_max_struct) 308 | # MAGIC #show_plan(df_latest_max_struct) 309 | # MAGIC evaluate_result(df_latest_max_by_no_struct) 310 | # MAGIC 311 | # MAGIC # SF1: 312 | # MAGIC # SF10: 27.5 s ± 4.28 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 313 | # MAGIC # SF100: 52.8 s ± 1.21 s per loop (mean ± std. dev. 
of 3 runs, 1 loop each) 314 | 315 | # METADATA ******************** 316 | 317 | # META { 318 | # META "language": "python", 319 | # META "language_group": "synapse_pyspark" 320 | # META } 321 | 322 | # CELL ******************** 323 | 324 | show_plan(df_latest_max_by_no_struct) 325 | 326 | # METADATA ******************** 327 | 328 | # META { 329 | # META "language": "python", 330 | # META "language_group": "synapse_pyspark" 331 | # META } 332 | -------------------------------------------------------------------------------- /DataEngineering/Library/VisualizeExecutionPlan.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | import re, contextlib, io, math 3 | from graphviz import Digraph 4 | 5 | print("Loading VisualizeExecutionPlan library ...") 6 | 7 | def get_execution_plan(df: DataFrame) -> str: 8 | with contextlib.redirect_stdout(io.StringIO()) as stdout: 9 | df.explain(mode="cost") 10 | 11 | plan = stdout.getvalue() 12 | return plan 13 | 14 | 15 | class PlanNode: 16 | plan_type: str 17 | level: int 18 | line: str 19 | line_number: int 20 | parent: any 21 | identifier: str 22 | table: str 23 | operation: str 24 | sub_operation: str 25 | 26 | size_in_bytes: int 27 | size: str 28 | 29 | node_type: str 30 | node_label: str 31 | node_tooltip: str 32 | 33 | edge_label: str 34 | edge_tooltip: str 35 | 36 | node_matching_text: str 37 | 38 | def __init__(self, line, line_number, plan_type): 39 | if line.startswith("*"): # replace whole-stage code gen prefix 40 | line = re.sub('([^*]*)\*\([0-9]*\)\s(.*)', r'\1\2', line) 41 | 42 | self.line = line 43 | self.line_number = line_number 44 | self.plan_type = plan_type 45 | 46 | self.level = self.get_level() 47 | 48 | self.parent = None 49 | 50 | self.populate_fields() 51 | 52 | def populate_fields(self): 53 | 54 | self.text = self.get_text() 55 | 56 | self.identifier = self.get_identifier() 57 | self.table = self.get_table() 58 | self.operation = self.get_operation() 59 | self.sub_operation = self.get_sub_operation() 60 | self.size_in_bytes = self.get_size_in_bytes() 61 | self.size = self.get_size() 62 | self.node_type = self.get_node_type() 63 | self.node_label = self.get_node_label() 64 | self.node_tooltip = self.get_node_tooltip() 65 | self.edge_label = self.get_edge_label() 66 | self.edge_tooltip = self.get_edge_tooltip() 67 | 68 | self.node_matching_text = self.get_node_matching_text() 69 | 70 | def get_level(self) -> int: 71 | return int(re.search(r'[A-Z]', self.line).start() / 3) 72 | 73 | def get_parent(self, skip_ops: list[str] = []): 74 | if not self.parent: 75 | return None 76 | 77 | it = self.parent 78 | while it.get_operation() in skip_ops: 79 | it = it.parent 80 | 81 | return it 82 | 83 | def get_identifier(self): 84 | return str(self.line_number) 85 | 86 | def get_operation(self) -> str: 87 | #m = re.search('^[:\s+-]*(.*?)[\(),\s]', self.line) 88 | m = re.search('([A-Za-z]+)', self.line) 89 | if m: 90 | return m.group(1) 91 | else: 92 | return self.line 93 | 94 | def get_sub_operation(self) -> str: 95 | m = re.search('^[:\s+-]*([A-Za-z]+)\s([A-Za-z]+)', self.line) 96 | if m: 97 | if m.group(1) in ["Join", "FileScan"]: 98 | return m.group(2) 99 | return m.group(1) 100 | else: 101 | return self.get_operation() 102 | 103 | def get_table(self) -> str: 104 | m = re.search('([a-z0-9-_]*)\.([a-z0-9-_]*)\.([a-z0-9-_]*)', self.line) 105 | if m: 106 | #return f"{m.group(1)}.{m.group(2)}.{m.group(3)}" 107 | return f"{m.group(2)}.{m.group(3)}" 108 | else: 109 | return None 
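# Cost-mode plans annotate operators with estimated statistics, e.g. a fragment such as
#   "Statistics(sizeInBytes=12.3 MiB, rowCount=1.00E+5)"   (illustrative example)
# get_size_in_bytes() below extracts the number and unit from that annotation, strips the
# binary "i" (MiB -> MB) and converts the value to bytes via the size_names lookup.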
110 | 111 | def get_size_in_bytes(self) -> int: 112 | # from https://semyonsinchenko.github.io/ssinchenko/post/estimation-spark-df-size/ 113 | m = re.search('sizeInBytes\s*=\s*([0-9.]*)\s(.*?)[),]', self.line) 114 | if m: 115 | size = float(m.group(1)) 116 | units = m.group(2) 117 | else: 118 | return -1 119 | 120 | units = units.replace("i", "") 121 | 122 | size_names = ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"] 123 | i = size_names.index(units) 124 | p = math.pow(1024, i) 125 | 126 | size = size * p 127 | 128 | return size # size in byte 129 | 130 | def get_size(self) -> str: 131 | size_bytes = self.get_size_in_bytes() 132 | if size_bytes == -1: 133 | return None 134 | 135 | if size_bytes == 0: 136 | return "0B" 137 | 138 | size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") 139 | i = int(math.floor(math.log(size_bytes, 1024))) 140 | p = math.pow(1024, i) 141 | s = round(size_bytes / p, 2) 142 | return "%s %s" % (s, size_name[i]) 143 | 144 | def get_node_type(self) -> str: 145 | if self.table: 146 | return "table" 147 | else: 148 | if "Join" in self.operation: 149 | return "join" 150 | return self.operation.lower() 151 | 152 | def get_node_label(self) -> str: 153 | if not self.parent: 154 | return f"RESULT" 155 | if self.node_type == "table": 156 | return self.get_table() 157 | 158 | ret = self.operation 159 | if self.node_type == "join": 160 | if self.plan_type == "logical": 161 | ret += "\n" + self.sub_operation 162 | elif self.plan_type == "physical": 163 | ret += "\n" + self.text.split(",")[-3] 164 | 165 | return ret 166 | 167 | def get_node_tooltip(self) -> str: 168 | return self.text 169 | 170 | def get_edge_label(self) -> str: 171 | return self.get_size() 172 | 173 | def get_edge_tooltip(self) -> str: 174 | return None 175 | 176 | def get_text(self) -> str: 177 | m = re.search('^[:\s+-]*(.*)', self.line) 178 | return m.group(1) 179 | 180 | def get_node_matching_text(self) -> str: 181 | if self.node_type == "table": 182 | return self.table 183 | if self.operation == "Filter": 184 | return self.text.split(',')[0] 185 | 186 | 187 | def execution_plan_to_nodes(exec_plan: str, plan_type: str = "combined") -> list[PlanNode]: 188 | assert plan_type in ["logical", "physical", "combined"] 189 | 190 | if plan_type == "combined": 191 | logical_nodes = execution_plan_to_nodes(exec_plan, "logical") 192 | physical_nodes = execution_plan_to_nodes(exec_plan, "physical") 193 | 194 | for node in physical_nodes: 195 | log_node = next((nd for nd in logical_nodes if nd.node_matching_text == node.node_matching_text), None) 196 | if log_node: 197 | node.size = log_node.size 198 | node.size_in_bytes = log_node.size_in_bytes 199 | node.edge_label = log_node.edge_label 200 | node.edge_tooltip = log_node.edge_tooltip 201 | 202 | return physical_nodes 203 | 204 | lines = [l for l in exec_plan.split("\n") if len(l.strip()) > 0] 205 | 206 | nodes = [] 207 | line_number = 1 208 | capture_started = False 209 | for line in lines: 210 | node = PlanNode(line, line_number, plan_type) 211 | line_number += 1 212 | 213 | if line.startswith("==") and line.endswith("=="): 214 | if not capture_started: 215 | if plan_type == "logical": 216 | if "== Optimized Logical Plan ==" in line: 217 | capture_started = True 218 | node.line = "RESULT" 219 | node.level = -1 220 | elif plan_type == "physical": 221 | if line == "== Physical Plan ==": 222 | capture_started = True 223 | node.line = "RESULT" 224 | node.level = -1 225 | else: 226 | raise Exception("Invalid plan_type! 
Only 'logical' and 'physical' are supported!") 227 | else: 228 | break 229 | 230 | if not capture_started: 231 | continue 232 | 233 | parent = next((nd for nd in reversed(nodes) if nd.level == node.level - 1), None) 234 | node.parent = parent 235 | node.populate_fields() 236 | 237 | nodes.append(node) 238 | 239 | return nodes 240 | 241 | 242 | # https://graphviz.readthedocs.io/en/stable/examples.html 243 | def get_graph_from_nodes(nodes: list[PlanNode], skip_operations: list[str] = []): 244 | g = Digraph(name="Execution Plan", comment='Execution Plan') 245 | 246 | g.attr(label=r'Execution Plan\nSizes are estimates based on table statistics\nThey are not reliable anymore after joins are involved!') 247 | g.attr(fontsize='12') 248 | 249 | g.attr('node', shape='box') 250 | g.attr('node', color='black') 251 | 252 | for node in nodes: 253 | if node.operation in skip_operations: 254 | continue 255 | 256 | if node.node_type == "table": 257 | color = 'lightgreen' 258 | elif node.node_type == "join": 259 | color = 'lightblue' 260 | else: 261 | color = 'white' 262 | 263 | g.node(node.identifier, node.get_node_label(), style='filled', fillcolor=color, tooltip=node.node_tooltip) 264 | 265 | if node.get_parent(skip_operations): 266 | g.edge(node.identifier, node.get_parent(skip_operations).identifier, node.edge_label, tooltip=node.edge_tooltip ) 267 | 268 | return g 269 | 270 | 271 | def get_plan_viz(df: DataFrame, skip_operations: list[str] = []): 272 | exec_plan = get_execution_plan(df) 273 | nodes = execution_plan_to_nodes(exec_plan) 274 | plan_viz = get_graph_from_nodes(nodes, skip_operations) 275 | 276 | return plan_viz 277 | 278 | 279 | def get_plan_viz_html(df: DataFrame, skip_operations: list[str] = [], image_format: str = 'svg'): 280 | plan_viz = get_plan_viz(df, skip_operations) 281 | 282 | # Set the format to 'svg' 283 | plan_viz.format = image_format 284 | 285 | return plan_viz.pipe().decode() 286 | 287 | 288 | def show_plan(df: DataFrame, displayHTML = None, skip_operations: list[str] = [], image_format: str = 'svg'): 289 | if displayHTML == None: 290 | # a simple display() might not work in Databricks 291 | display(get_plan_viz(df, skip_operations)) 292 | else: 293 | displayHTML(get_plan_viz_html(df, skip_operations, image_format)) -------------------------------------------------------------------------------- /Fabric/LibraryManager/LibraryManager.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "2925655f-0293-4f32-8bc6-86ab989099a7", 12 | # META "default_lakehouse_name": "SomeLakehouse", 13 | # META "default_lakehouse_workspace_id": "ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e", 14 | # META "known_lakehouses": [ 15 | # META { 16 | # META "id": "2925655f-0293-4f32-8bc6-86ab989099a7" 17 | # META } 18 | # META ] 19 | # META } 20 | # META } 21 | # META } 22 | 23 | # MARKDOWN ******************** 24 | 25 | # # Fabric Library Manager 26 | # This notebook allows you to specify a set of notebooks from the current workspace to be bundled into a library. This library is stored in the `/Files` section of a lakehouse. 
Additionally a new notebook is created in the workspace called `load_LibraryManager` which you can call from your other notebooks using `%run` to load the LibraryManager and all its libraries into the current notebook session. 27 | # 28 | # If you change anything in the library notebooks, simply run this LibraryManager notebook again to also update the actual library thats imported in all notebooks. 29 | 30 | # CELL ******************** 31 | 32 | library_notebooks = [ 33 | { 34 | "notebook": "MyLibrary", 35 | "library_name": "MyLibrary.py" 36 | }, 37 | { 38 | "notebook": "MyOtherLibrary", 39 | "library_name": "MyOtherLibrary.py" 40 | }, 41 | { 42 | "notebook": "MyFabricSparkLibrary", 43 | "library_name": "MyFabricSparkLibrary.py" 44 | }, 45 | ] 46 | 47 | # METADATA ******************** 48 | 49 | # META { 50 | # META "language": "python", 51 | # META "language_group": "synapse_pyspark" 52 | # META } 53 | 54 | # CELL ******************** 55 | 56 | import os 57 | import base64 58 | import zipfile 59 | import time 60 | import json 61 | import requests 62 | 63 | import notebookutils 64 | from sempy.fabric import FabricRestClient 65 | 66 | # METADATA ******************** 67 | 68 | # META { 69 | # META "language": "python", 70 | # META "language_group": "synapse_pyspark" 71 | # META } 72 | 73 | # MARKDOWN ******************** 74 | 75 | # # Global Settings 76 | 77 | # CELL ******************** 78 | 79 | RUNTIME_CONTEXT = {k:v for k,v in notebookutils.runtime.context.items() if v is not None} 80 | 81 | LIBRARY_FOLDER = "/Libraries" # must start with "/" 82 | LIBRARY_NAME = "LibraryManager" 83 | LIBRARY_LAKEHOUSE_NAME = "SomeLakehouse" 84 | LIBRARY_IMPORT = f"from {LIBRARY_NAME} import *" # could also be a named import 85 | # LIBRARY_IMPORT = f"import {LIBRARY_NAME} as lm" 86 | LIBRARY_LOAD_NOTEBOOK_NAME = f"load_{LIBRARY_NAME}" 87 | 88 | LIBRARY_LAKEHOUSE = notebookutils.lakehouse.get(LIBRARY_LAKEHOUSE_NAME) 89 | 90 | assert LIBRARY_FOLDER.startswith("/"), "LIBRARY_FOLDER must start with '/'" 91 | 92 | # METADATA ******************** 93 | 94 | # META { 95 | # META "language": "python", 96 | # META "language_group": "synapse_pyspark" 97 | # META } 98 | 99 | # CELL ******************** 100 | 101 | REST_CLIENT = FabricRestClient() 102 | ALL_NOTEBOOKS = REST_CLIENT.get_paged(f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/notebooks") 103 | 104 | # we leverage the REST_CLIENT to get basic parameters for further requests 105 | dummy_call = REST_CLIENT.get(f"v1/workspaces") 106 | BASE_URL = dummy_call.url[:-13] 107 | HEADERS = dummy_call.request.headers 108 | 109 | invalid_libs = [lib for lib in library_notebooks if not lib["notebook"] in [nb["displayName"] for nb in ALL_NOTEBOOKS]] 110 | assert invalid_libs == [], f"The following Library Notebooks could not be found: {invalid_libs}" 111 | 112 | 113 | # METADATA ******************** 114 | 115 | # META { 116 | # META "language": "python", 117 | # META "language_group": "synapse_pyspark" 118 | # META } 119 | 120 | # MARKDOWN ******************** 121 | 122 | # # api_request 123 | # While we in general rely on SemPy library, there are some things that do not work for us or needed to be improved 124 | # 125 | # - you cannot run a POST request with a body using SemPy 126 | # - the default delay for Long-Running-Operations (LRO) is 20 seconds, which is way too long to simply get the definition of a notebook 127 | 128 | # CELL ******************** 129 | 130 | def api_request(method: str, api_path: str, body: dict = None, interval: int = 1): 131 | if body 
!= None: 132 | if method.upper() == "POST": 133 | response = requests.post(BASE_URL + api_path, json = body, headers = HEADERS) 134 | else: 135 | raise Exception("Only method POST is supported with a body") 136 | else: 137 | response = REST_CLIENT.request(method, api_path) 138 | 139 | if response.status_code == 202: 140 | lro_path = response.headers["Location"] 141 | 142 | while lro_path: 143 | time.sleep(interval) # lower interval than the original sempy request with LRO_wait = True 144 | response = REST_CLIENT.request("GET", lro_path) 145 | lro_path = response.headers.get("Location") 146 | 147 | if lro_path and lro_path.endswith("/result"): 148 | response = REST_CLIENT.request("GET", lro_path) 149 | return response.json() 150 | 151 | return response.json() 152 | 153 | if False: 154 | method = "POST" 155 | api_path = f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/items/{RUNTIME_CONTEXT['currentNotebookId']}/getDefinition?format=fabricGitSource" 156 | definition = api_request(method, api_path) 157 | 158 | # METADATA ******************** 159 | 160 | # META { 161 | # META "language": "python", 162 | # META "language_group": "synapse_pyspark" 163 | # META } 164 | 165 | # MARKDOWN ******************** 166 | 167 | # # notebook_to_library 168 | # 169 | # Downloads a notebook from the current workspace as `.py` file and stores it in the Lakehouse. From there we create the library where we add all individual libraries to a `.zip` file which is used and imported in the end. 170 | 171 | # CELL ******************** 172 | 173 | def notebook_to_library(notebook: str, library_name: str = None): 174 | if not library_name: library_name = f"{notebook}.py" 175 | assert library_name.endswith(".py"), "library_name must end with '.py'" 176 | 177 | library_path = f"{LIBRARY_LAKEHOUSE['properties']['abfsPath']}/Files{LIBRARY_FOLDER}/{library_name}" 178 | 179 | print(f"Uploading library '{library_name}' to {library_path} ... 
", end = "") 180 | 181 | library_notebook = [nb for nb in ALL_NOTEBOOKS if nb["displayName"] == notebook][0] 182 | notebook_id = library_notebook["id"] 183 | 184 | notebook_definition = api_request("POST", f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/items/{notebook_id}/getDefinition?format=fabricGitSource") 185 | notebook_part = [part for part in notebook_definition["definition"]["parts"] if part["path"].startswith("notebook-content")][0] 186 | 187 | try: 188 | file_content = base64.b64decode(notebook_part["payload"]).decode("utf-8") 189 | mssparkutils.fs.put(library_path, file_content, True) 190 | print("Done!") 191 | except Exception as e: 192 | print("ERROR!") 193 | print(str(e)) 194 | 195 | 196 | # METADATA ******************** 197 | 198 | # META { 199 | # META "language": "python", 200 | # META "language_group": "synapse_pyspark" 201 | # META } 202 | 203 | # MARKDOWN ******************** 204 | 205 | # # Update all know libraries 206 | 207 | # CELL ******************** 208 | 209 | for library_notebook in library_notebooks: 210 | notebook_to_library(**library_notebook) 211 | 212 | # METADATA ******************** 213 | 214 | # META { 215 | # META "language": "python", 216 | # META "language_group": "synapse_pyspark" 217 | # META } 218 | 219 | # MARKDOWN ******************** 220 | 221 | # # Write final library as ZIP-file 222 | 223 | # CELL ******************** 224 | 225 | # write CustomLibrary.zip 226 | libraries_local_path = f"/lakehouse/default/Files{LIBRARY_FOLDER}" 227 | 228 | libraries = [lib["library_name"] for lib in library_notebooks] 229 | 230 | try: 231 | os.remove(f"{libraries_local_path}/{LIBRARY_NAME}.zip") 232 | except OSError: 233 | pass 234 | 235 | 236 | print(f"Writing __init__.py file ... ", end = "") 237 | with open(f"{libraries_local_path}/__init__.py", "w") as init_file: 238 | init_file.writelines([f"from .{lib[:-3]} import *\n" for lib in libraries if lib != "__init__.py"]) 239 | print("Done!") 240 | 241 | print(f"Creating new archive at {libraries_local_path}/{LIBRARY_NAME}.zip ... ") 242 | with zipfile.ZipFile(f"{libraries_local_path}/{LIBRARY_NAME}.zip", "w", zipfile.ZIP_DEFLATED) as myzip: 243 | libs_to_add = libraries + ["__init__.py"] 244 | for lib in libs_to_add: 245 | print(f"\tAdding '{lib}' to archive ... ", end = "") 246 | myzip.write(f"{libraries_local_path}/{lib}", f"/{LIBRARY_NAME}/{lib}") 247 | print("Done!") 248 | 249 | print("Done!") 250 | 251 | # METADATA ******************** 252 | 253 | # META { 254 | # META "language": "python", 255 | # META "language_group": "synapse_pyspark" 256 | # META } 257 | 258 | # MARKDOWN ******************** 259 | 260 | # # Write the notebook to initialize the LibraryManager 261 | 262 | # CELL ******************** 263 | 264 | 265 | init_script = f"""# Fabric notebook source 266 | 267 | # CELL ******************** 268 | 269 | library_path = "{LIBRARY_LAKEHOUSE['properties']['abfsPath']}/Files{LIBRARY_FOLDER}/{LIBRARY_NAME}.zip" 270 | print(f"Loading {LIBRARY_NAME} from '{{library_path}}' ... 
", end = "") 271 | sc.addPyFile(library_path) 272 | print("Done!") 273 | 274 | {LIBRARY_IMPORT} 275 | """ 276 | 277 | existing_notebook_id = [nb for nb in ALL_NOTEBOOKS if nb["displayName"] == LIBRARY_LOAD_NOTEBOOK_NAME] 278 | 279 | if existing_notebook_id: 280 | api_path = f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/items/{existing_notebook_id[0]['id']}/updateDefinition" 281 | body = {} 282 | else: 283 | api_path = f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/notebooks" 284 | body = { 285 | "displayName": LIBRARY_LOAD_NOTEBOOK_NAME, 286 | "description": "A notebook that can be run to load the LibraryManager via run" 287 | } 288 | 289 | definition = { 290 | "format": "fabricGitSource", 291 | "parts": [ 292 | { 293 | "path": "notebook-content.py", 294 | "payload": base64.b64encode(init_script.encode("utf-8")).decode("utf-8"), 295 | "payloadType": "InlineBase64" 296 | } 297 | ] 298 | } 299 | 300 | body["definition"] = definition 301 | 302 | result = api_request("POST", api_path, body) 303 | 304 | print(f"Successfully created notebook '{LIBRARY_LOAD_NOTEBOOK_NAME}' (ID = {result['id']})!") 305 | print(f"To load the LibraryManager into your notebooks, you can now use the following command in a notebook cell:") 306 | print("═"*80) 307 | print(f"%run {LIBRARY_LOAD_NOTEBOOK_NAME}") 308 | print("═"*80) 309 | 310 | 311 | # METADATA ******************** 312 | 313 | # META { 314 | # META "language": "python", 315 | # META "language_group": "synapse_pyspark" 316 | # META } 317 | --------------------------------------------------------------------------------