├── .vscode └── settings.json ├── Fabric ├── Readme.md ├── LibraryManager │ ├── SomeLakehouse.Lakehouse │ │ ├── lakehouse.metadata.json │ │ ├── shortcuts.metadata.json │ │ └── .platform │ ├── MyCustomEnvironment.Environment │ │ ├── Libraries │ │ │ └── PublicLibraries │ │ │ │ └── environment.yml │ │ ├── Setting │ │ │ └── Sparkcompute.yml │ │ └── .platform │ ├── MyLibrary.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── LibraryManager.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── MyOtherLibrary.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── notebook with run.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── MyFabricSparkLibrary.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── load_LibraryManager.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── regular notebook.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ ├── notebook with environment.Notebook │ │ ├── .platform │ │ └── notebook-content.py │ └── README.md └── get latest row.Notebook │ ├── .platform │ └── notebook-content.py ├── README.md ├── LICENSE ├── .gitignore └── DataEngineering └── Library └── VisualizeExecutionPlan.py /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } -------------------------------------------------------------------------------- /Fabric/Readme.md: -------------------------------------------------------------------------------- 1 | This is an auto-created file for /Fabric -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fabric.Toolbox 2 | Tools for Microsoft Fabric 3 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/SomeLakehouse.Lakehouse/lakehouse.metadata.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /Fabric/LibraryManager/SomeLakehouse.Lakehouse/shortcuts.metadata.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyCustomEnvironment.Environment/Libraries/PublicLibraries/environment.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - pip: 3 | - pydantic==2.11.5 4 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyCustomEnvironment.Environment/Setting/Sparkcompute.yml: -------------------------------------------------------------------------------- 1 | enable_native_execution_engine: false 2 | driver_cores: 8 3 | driver_memory: 56g 4 | executor_cores: 8 5 | executor_memory: 56g 6 | dynamic_executor_allocation: 7 | enabled: true 8 | min_executors: 1 9 | max_executors: 9 10 | runtime_version: 1.3 11 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/SomeLakehouse.Lakehouse/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Lakehouse", 5 | "displayName": "SomeLakehouse" 6 | }, 7 | "config": { 8 | "version": "2.0", 9 | "logicalId": 
"989099a7-86ab-8bc6-4f32-02932925655f" 10 | } 11 | } -------------------------------------------------------------------------------- /Fabric/get latest row.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "get latest row", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "04768130-9f28-ac6e-4a68-a328a262441e" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyLibrary.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "MyLibrary", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "42613894-966c-8038-408a-f3c6e0b056da" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/LibraryManager.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "LibraryManager", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "fd937145-f03f-9309-4f99-df9effe06382" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyOtherLibrary.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "MyOtherLibrary", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "cdbe2b88-86d0-bdd6-4537-99f8a8190032" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/notebook with run.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "notebook with run", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "da713db2-3ea6-b2e6-40e8-43713a2c6432" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyFabricSparkLibrary.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "MyFabricSparkLibrary", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "230a4da8-f37c-8793-4f16-003b80706781" 11 | } 12 | } 
-------------------------------------------------------------------------------- /Fabric/LibraryManager/load_LibraryManager.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "load_LibraryManager", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "4e245d8f-ac1f-8d12-4d59-350bd396f44e" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyCustomEnvironment.Environment/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Environment", 5 | "displayName": "MyCustomEnvironment", 6 | "description": "Environment" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "70983ad4-37e2-8559-43f2-fdb5ea9bf3be" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/regular notebook.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "notebook with LibraryManager", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "31aba13c-8c3d-bd15-4062-41b95cf2e5da" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/notebook with environment.Notebook/.platform: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://developer.microsoft.com/json-schemas/fabric/gitIntegration/platformProperties/2.0.0/schema.json", 3 | "metadata": { 4 | "type": "Notebook", 5 | "displayName": "notebook with environment", 6 | "description": "New notebook" 7 | }, 8 | "config": { 9 | "version": "2.0", 10 | "logicalId": "13b53598-fcc5-8acc-4505-8077d78d57f6" 11 | } 12 | } -------------------------------------------------------------------------------- /Fabric/LibraryManager/README.md: -------------------------------------------------------------------------------- 1 | # LibraryManager for Microsoft Fabric Data Engineering 2 | 3 | This notebook allows you to specify a set of notebooks from the current workspace to be bundled into a library. This library is stored in the `/Files` section of a lakehouse. Additionally a new notebook is created in the workspace called `load_LibraryManager` which you can call from main notebook to load the LibraryManager and all its libraries into the current notebook session -------------------------------------------------------------------------------- /Fabric/LibraryManager/load_LibraryManager.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | 4 | # CELL ******************** 5 | 6 | library_path = "abfss://ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e@onelake.dfs.fabric.microsoft.com/2925655f-0293-4f32-8bc6-86ab989099a7/Files/Libraries/LibraryManager.zip" 7 | print(f"Loading LibraryManager from '{library_path}' ... 
", end = "") 8 | sc.addPyFile(library_path) 9 | print("Done!") 10 | 11 | from LibraryManager import * 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Gerhard Brueckl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/notebook with environment.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "2925655f-0293-4f32-8bc6-86ab989099a7", 12 | # META "default_lakehouse_name": "SomeLakehouse", 13 | # META "default_lakehouse_workspace_id": "ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e", 14 | # META "known_lakehouses": [ 15 | # META { 16 | # META "id": "2925655f-0293-4f32-8bc6-86ab989099a7" 17 | # META } 18 | # META ] 19 | # META }, 20 | # META "environment": { 21 | # META "environmentId": "70983ad4-37e2-8559-43f2-fdb5ea9bf3be", 22 | # META "workspaceId": "00000000-0000-0000-0000-000000000000" 23 | # META } 24 | # META } 25 | # META } 26 | 27 | # CELL ******************** 28 | 29 | print("This notebook uses a custom Environment") 30 | 31 | # METADATA ******************** 32 | 33 | # META { 34 | # META "language": "python", 35 | # META "language_group": "synapse_pyspark" 36 | # META } 37 | 38 | # CELL ******************** 39 | 40 | # MAGIC %%sql 41 | # MAGIC SELECT 'This notebook uses a custom Environment' 42 | 43 | # METADATA ******************** 44 | 45 | # META { 46 | # META "language": "sparksql", 47 | # META "language_group": "synapse_pyspark" 48 | # META } 49 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyLibrary.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": {} 10 | # META } 11 | 12 | # CELL ******************** 13 | 14 | # variable "spark" is not set when calling 
sc.addPyFile() 15 | _is_sc_addPyFile = not "spark" in locals() 16 | 17 | # METADATA ******************** 18 | 19 | # META { 20 | # META "language": "python", 21 | # META "language_group": "synapse_pyspark" 22 | # META } 23 | 24 | # CELL ******************** 25 | 26 | from pyspark.sql import DataFrame 27 | from pyspark import SparkContext 28 | from pyspark.sql import DataFrame, SparkSession 29 | 30 | # METADATA ******************** 31 | 32 | # META { 33 | # META "language": "python", 34 | # META "language_group": "synapse_pyspark" 35 | # META } 36 | 37 | # CELL ******************** 38 | 39 | PI = 3.14 40 | 41 | # METADATA ******************** 42 | 43 | # META { 44 | # META "language": "python", 45 | # META "language_group": "synapse_pyspark" 46 | # META } 47 | 48 | # CELL ******************** 49 | 50 | def get_area(radius): 51 | area = float(PI)*radius*radius 52 | return area 53 | 54 | # some code to run only when interactively developing the library 55 | if not _is_sc_addPyFile: 56 | print(get_area(4)) 57 | 58 | # METADATA ******************** 59 | 60 | # META { 61 | # META "language": "python", 62 | # META "language_group": "synapse_pyspark" 63 | # META } 64 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/notebook with run.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "2925655f-0293-4f32-8bc6-86ab989099a7", 12 | # META "default_lakehouse_name": "SomeLakehouse", 13 | # META "default_lakehouse_workspace_id": "ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e", 14 | # META "known_lakehouses": [ 15 | # META { 16 | # META "id": "2925655f-0293-4f32-8bc6-86ab989099a7" 17 | # META } 18 | # META ] 19 | # META }, 20 | # META "environment": {} 21 | # META } 22 | # META } 23 | 24 | # MARKDOWN ******************** 25 | 26 | # # Using `%run` to import custom code from other notebooks 27 | 28 | # CELL ******************** 29 | 30 | %run MyLibrary 31 | 32 | # METADATA ******************** 33 | 34 | # META { 35 | # META "language": "python", 36 | # META "language_group": "synapse_pyspark" 37 | # META } 38 | 39 | # CELL ******************** 40 | 41 | %run MyOtherLibrary 42 | 43 | # METADATA ******************** 44 | 45 | # META { 46 | # META "language": "python", 47 | # META "language_group": "synapse_pyspark" 48 | # META } 49 | 50 | # CELL ******************** 51 | 52 | # my regular code using imported library functions ... 
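# for example (illustration only; get_area() comes from MyLibrary and
# get_circumference()/log() come from MyOtherLibrary, both %run above):
log(f"Area: {get_area(4)}, circumference: {get_circumference(4)}")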
53 | 54 | # METADATA ******************** 55 | 56 | # META { 57 | # META "language": "python", 58 | # META "language_group": "synapse_pyspark" 59 | # META } 60 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyOtherLibrary.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": {} 10 | # META } 11 | 12 | # CELL ******************** 13 | 14 | # variable "spark" is not set when calling sc.addPyFile() 15 | _is_sc_addPyFile = not "spark" in locals() 16 | 17 | # METADATA ******************** 18 | 19 | # META { 20 | # META "language": "python", 21 | # META "language_group": "synapse_pyspark" 22 | # META } 23 | 24 | # CELL ******************** 25 | 26 | import datetime as dt 27 | 28 | # METADATA ******************** 29 | 30 | # META { 31 | # META "language": "python", 32 | # META "language_group": "synapse_pyspark" 33 | # META } 34 | 35 | # CELL ******************** 36 | 37 | # when called via sc.addPyFile can import other libraries as well using the syntax below 38 | if _is_sc_addPyFile: 39 | from .MyLibrary import PI 40 | else: 41 | # for debugging we need to define the variables here 42 | PI = 123 43 | 44 | # METADATA ******************** 45 | 46 | # META { 47 | # META "language": "python", 48 | # META "language_group": "synapse_pyspark" 49 | # META } 50 | 51 | # CELL ******************** 52 | 53 | def get_circumference(radius): 54 | return 2*radius*PI 55 | 56 | if not _is_sc_addPyFile: 57 | print(get_circumference(1)) 58 | 59 | # METADATA ******************** 60 | 61 | # META { 62 | # META "language": "python", 63 | # META "language_group": "synapse_pyspark" 64 | # META } 65 | 66 | # CELL ******************** 67 | 68 | def log(text: str, end: str = None): 69 | print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\t" + str(text), end = end) 70 | 71 | # some code to run only when interactively developing the library 72 | if not _is_sc_addPyFile: 73 | log("This is a test log!") 74 | 75 | # METADATA ******************** 76 | 77 | # META { 78 | # META "language": "python", 79 | # META "language_group": "synapse_pyspark" 80 | # META } 81 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/regular notebook.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "2925655f-0293-4f32-8bc6-86ab989099a7", 12 | # META "default_lakehouse_name": "SomeLakehouse", 13 | # META "default_lakehouse_workspace_id": "ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e", 14 | # META "known_lakehouses": [ 15 | # META { 16 | # META "id": "2925655f-0293-4f32-8bc6-86ab989099a7" 17 | # META } 18 | # META ] 19 | # META } 20 | # META } 21 | # META } 22 | 23 | # MARKDOWN ******************** 24 | 25 | # # Using LibraryManager to import custom code 26 | 27 | # CELL ******************** 28 | 29 | %run load_LibraryManager 30 | 31 | # METADATA ******************** 32 | 33 | # META { 34 | # META "language": "python", 35 | # META "language_group": "synapse_pyspark" 36 | # META } 37 | 38 
| # MARKDOWN ******************** 39 | 40 | # # Use log() function and PI constant from library 41 | 42 | # CELL ******************** 43 | 44 | log(PI) 45 | 46 | # METADATA ******************** 47 | 48 | # META { 49 | # META "language": "python", 50 | # META "language_group": "synapse_pyspark" 51 | # META } 52 | 53 | # MARKDOWN ******************** 54 | 55 | # # Use library function that uses `notebookutils` under the hood 56 | 57 | # CELL ******************** 58 | 59 | display(ls("/")) 60 | 61 | # METADATA ******************** 62 | 63 | # META { 64 | # META "language": "python", 65 | # META "language_group": "synapse_pyspark" 66 | # META } 67 | 68 | # MARKDOWN ******************** 69 | 70 | # # Use library function that uses Spark under the hood 71 | 72 | # CELL ******************** 73 | 74 | display(table_to_df("myTable")) 75 | 76 | # METADATA ******************** 77 | 78 | # META { 79 | # META "language": "python", 80 | # META "language_group": "synapse_pyspark" 81 | # META } 82 | 83 | # MARKDOWN ******************** 84 | 85 | # # Use nested library function 86 | 87 | # CELL ******************** 88 | 89 | print(get_circumference(8)) 90 | 91 | # METADATA ******************** 92 | 93 | # META { 94 | # META "language": "python", 95 | # META "language_group": "synapse_pyspark" 96 | # META } 97 | -------------------------------------------------------------------------------- /Fabric/LibraryManager/MyFabricSparkLibrary.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": {} 10 | # META } 11 | 12 | # CELL ******************** 13 | 14 | # variable "spark" is not set when calling sc.addPyFile() 15 | _is_sc_addPyFile = not "spark" in locals() 16 | 17 | # METADATA ******************** 18 | 19 | # META { 20 | # META "language": "python", 21 | # META "language_group": "synapse_pyspark" 22 | # META } 23 | 24 | # CELL ******************** 25 | 26 | from pyspark.sql import DataFrame 27 | from pyspark import SparkContext 28 | from pyspark.sql import DataFrame, SparkSession 29 | 30 | 31 | 32 | # METADATA ******************** 33 | 34 | # META { 35 | # META "language": "python", 36 | # META "language_group": "synapse_pyspark" 37 | # META } 38 | 39 | # MARKDOWN ******************** 40 | 41 | # # Using notebookutils 42 | 43 | # CELL ******************** 44 | 45 | import notebookutils 46 | 47 | # METADATA ******************** 48 | 49 | # META { 50 | # META "language": "python", 51 | # META "language_group": "synapse_pyspark" 52 | # META } 53 | 54 | # CELL ******************** 55 | 56 | def ls(path: str) -> DataFrame: 57 | print(f"Listing {path} ...") 58 | return notebookutils.fs.ls(path) 59 | 60 | # a simple test which is only run when developing the library but not when the library is added via sc.addPyFile() 61 | if not _is_sc_addPyFile: 62 | display(ls("/")) 63 | 64 | # METADATA ******************** 65 | 66 | # META { 67 | # META "language": "python", 68 | # META "language_group": "synapse_pyspark" 69 | # META } 70 | 71 | # MARKDOWN ******************** 72 | 73 | # # Using Spark(`spark`) and SparkContext(`sc`) 74 | 75 | # CELL ******************** 76 | 77 | if _is_sc_addPyFile: 78 | # as our library makes calls to spark, we need to create local instances of SparkContext "sc" and Sparksession "spark" 79 | if not "sc" in locals(): 80 | #print("Creating local 
SparkContext variable 'sc' ... ", end = "") 81 | sc = SparkContext.getOrCreate() 82 | #print("Done!") 83 | 84 | if not "spark" in locals(): 85 | #print("Creating local SparkSession variable 'spark' ... ", end = "") 86 | spark = (SparkSession(sc) 87 | .builder 88 | .getOrCreate()) 89 | #print("Done!") 90 | 91 | # METADATA ******************** 92 | 93 | # META { 94 | # META "language": "python", 95 | # META "language_group": "synapse_pyspark" 96 | # META } 97 | 98 | # CELL ******************** 99 | 100 | def table_to_df(table_name: str) -> DataFrame: 101 | df = spark.sql(f"SELECT * FROM {table_name}") 102 | 103 | return df 104 | 105 | # METADATA ******************** 106 | 107 | # META { 108 | # META "language": "python", 109 | # META "language_group": "synapse_pyspark" 110 | # META } 111 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /Fabric/get latest row.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "8e6faaa0-29be-48c1-b0f3-1ffe17396203", 12 | # META "default_lakehouse_name": "TPCH", 13 | # META "default_lakehouse_workspace_id": "e2278f6a-27f2-4261-b759-052b593650b0" 14 | # META } 15 | # META } 16 | # META } 17 | 18 | # CELL ******************** 19 | 20 | # MAGIC %%configure -f 21 | # MAGIC { 22 | # MAGIC // You can get a list of valid parameters to config the session from https://github.com/cloudera/livy#request-body. 
23 | # MAGIC "driverMemory": "56g", // Recommended values: ["28g", "56g", "112g", "224g", "400g"] 24 | # MAGIC "driverCores": 8, // Recommended values: [4, 8, 16, 32, 64] 25 | # MAGIC "executorMemory": "56g", 26 | # MAGIC "executorCores": 8, 27 | # MAGIC "numExecutors": 4, 28 | # MAGIC "useStarterPool": false // Set to true to force using starter pool 29 | # MAGIC } 30 | 31 | # METADATA ******************** 32 | 33 | # META { 34 | # META "language": "python", 35 | # META "language_group": "synapse_pyspark" 36 | # META } 37 | 38 | # CELL ******************** 39 | 40 | import time 41 | import datetime as dt 42 | import sempy.fabric as fabric 43 | 44 | import pyspark.sql.functions as F 45 | import pyspark.sql.types as T 46 | from pyspark.sql import Window, Row 47 | 48 | sc.addPyFile("https://raw.githubusercontent.com/gbrueckl/Fabric.Toolbox/main/DataEngineering/Library/VisualizeExecutionPlan.py") 49 | from VisualizeExecutionPlan import show_plan 50 | 51 | # METADATA ******************** 52 | 53 | # META { 54 | # META "language": "python", 55 | # META "language_group": "synapse_pyspark" 56 | # META } 57 | 58 | # CELL ******************** 59 | 60 | scale_factor = 10 # 1, 10 or 100 61 | 62 | df = spark.sql(f"SELECT * FROM TPCH.sf{scale_factor}_lineitem") 63 | display(df.limit(100)) 64 | 65 | group_by_cols = ["OrderId"] 66 | sorting_cols = ["ExtendedPrice"] 67 | 68 | # METADATA ******************** 69 | 70 | # META { 71 | # META "language": "python", 72 | # META "language_group": "synapse_pyspark" 73 | # META } 74 | 75 | # CELL ******************** 76 | 77 | # for validation of the code 78 | if False: 79 | rdd = sc.parallelize([ 80 | [1, date(2024, 1, 7), 13.90], 81 | [1, date(2024, 1, 16), 14.50], 82 | [2, date(2024, 1, 9), 10.50], 83 | [2, date(2024, 1, 28), 9.90], 84 | [3, date(2024, 1, 5), 1.50] 85 | ]) 86 | 87 | df = rdd.toDF(['product_key', 'date', 'price']) 88 | 89 | display(df) 90 | 91 | group_by_cols = ['product_key'] 92 | sorting_cols = ['date'] 93 | 94 | # METADATA ******************** 95 | 96 | # META { 97 | # META "language": "python", 98 | # META "language_group": "synapse_pyspark" 99 | # META } 100 | 101 | # CELL ******************** 102 | 103 | def log(text: str, end: str = None): 104 | print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\t" + str(text), end = end) 105 | 106 | # METADATA ******************** 107 | 108 | # META { 109 | # META "language": "python", 110 | # META "language_group": "synapse_pyspark" 111 | # META } 112 | 113 | # CELL ******************** 114 | 115 | def evaluate_result(df, iterations = 3): 116 | durations = [] 117 | for it in range(1, iterations+1): 118 | log(f"Simulating write operation - iteration {it} ... ") 119 | start = time.time() 120 | df.write.format("noop").mode("overwrite").save() 121 | end = time.time() 122 | log(f"Done - Duration: {end - start:5.2f} seconds") 123 | durations.append(end - start) 124 | 125 | log(f"Total Duration: {sum(durations)/len(durations)}") 126 | log(f"Avg. 
Duration: {sum(durations)}") 127 | log(f"Run Durations: {durations}") 128 | 129 | # read the df once 130 | evaluate_result(df) 131 | log(f"Rowcount: {df.count()}") 132 | 133 | # METADATA ******************** 134 | 135 | # META { 136 | # META "language": "python", 137 | # META "language_group": "synapse_pyspark" 138 | # META } 139 | 140 | # MARKDOWN ******************** 141 | 142 | # # Get the most recent price per product 143 | 144 | # MARKDOWN ******************** 145 | 146 | # ## using window function 147 | 148 | # CELL ******************** 149 | 150 | # define window, used DESC sort order 151 | w = Window.partitionBy(group_by_cols).orderBy([F.col(x).desc() for x in sorting_cols]) 152 | 153 | #filter DataFrame to only show first row for each group 154 | df_latest_window = ( 155 | df.withColumn('__row_num__', F.row_number().over(w)) 156 | .filter(F.col('__row_num__') == 1) 157 | .drop('__row_num__') 158 | ) 159 | 160 | evaluate_result(df_latest_window) 161 | 162 | # SF1: 163 | # SF10: 21.9 s ± 381 ms per loop (mean ± std. dev. of 3 runs, 1 loop each) 164 | # SF100: 1min 10s ± 18.4 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 165 | 166 | # METADATA ******************** 167 | 168 | # META { 169 | # META "language": "python", 170 | # META "language_group": "synapse_pyspark" 171 | # META } 172 | 173 | # CELL ******************** 174 | 175 | show_plan(df_latest_window) 176 | 177 | # METADATA ******************** 178 | 179 | # META { 180 | # META "language": "python", 181 | # META "language_group": "synapse_pyspark" 182 | # META } 183 | 184 | # MARKDOWN ******************** 185 | 186 | # ## using self join 187 | 188 | # CELL ******************** 189 | 190 | df_latest_dates_per_group = ( 191 | df.groupBy(group_by_cols) 192 | .agg(*[F.max(x).alias(x) for x in sorting_cols]) 193 | ) 194 | #display(df_latest_dates_per_group) 195 | 196 | df_latest_join = ( 197 | df.alias("base") 198 | .join(df_latest_dates_per_group, group_by_cols + sorting_cols, "inner") 199 | .select("base.*") 200 | ) 201 | 202 | evaluate_result(df_latest_join) 203 | 204 | # SF1: 205 | # SF10: 23.9 s ± 1.35 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 206 | # SF100: 1min 21s ± 4.76 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 207 | 208 | # METADATA ******************** 209 | 210 | # META { 211 | # META "language": "python", 212 | # META "language_group": "synapse_pyspark" 213 | # META } 214 | 215 | # CELL ******************** 216 | 217 | show_plan(df_latest_join) 218 | 219 | # METADATA ******************** 220 | 221 | # META { 222 | # META "language": "python", 223 | # META "language_group": "synapse_pyspark" 224 | # META } 225 | 226 | # MARKDOWN ******************** 227 | 228 | # ## using max and struct 229 | 230 | # CELL ******************** 231 | 232 | df_latest_max_struct = ( 233 | df.groupBy(group_by_cols) 234 | .agg(F.max(F.struct(*sorting_cols + [x for x in df.columns if x not in sorting_cols])).alias("latest")) 235 | .select("latest.*") 236 | .select(df.columns) # keep original column order 237 | ) 238 | 239 | evaluate_result(df_latest_max_struct) 240 | 241 | # SF1: 242 | # SF10: 21.6 s ± 1.5 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 243 | # SF100: 58.3 s ± 511 ms per loop (mean ± std. dev. 
of 3 runs, 1 loop each) 244 | 245 | # METADATA ******************** 246 | 247 | # META { 248 | # META "language": "python", 249 | # META "language_group": "synapse_pyspark" 250 | # META } 251 | 252 | # CELL ******************** 253 | 254 | show_plan(df_latest_max_struct) 255 | 256 | # METADATA ******************** 257 | 258 | # META { 259 | # META "language": "python", 260 | # META "language_group": "synapse_pyspark" 261 | # META } 262 | 263 | # MARKDOWN ******************** 264 | 265 | # ## using max_by 266 | 267 | # CELL ******************** 268 | 269 | df_latest_max_by = ( 270 | df.groupBy(group_by_cols) 271 | .agg(F.max_by(F.struct("*"), F.struct(*sorting_cols)).alias("latest")) 272 | .select("latest.*") 273 | ) 274 | 275 | evaluate_result(df_latest_max_by) 276 | 277 | # SF1: 278 | # SF10: 24.4 s ± 5.76 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 279 | # SF100: 53.5 s ± 1.24 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 280 | 281 | # METADATA ******************** 282 | 283 | # META { 284 | # META "language": "python", 285 | # META "language_group": "synapse_pyspark" 286 | # META } 287 | 288 | # CELL ******************** 289 | 290 | show_plan(df_latest_max_by) 291 | 292 | # METADATA ******************** 293 | 294 | # META { 295 | # META "language": "python", 296 | # META "language_group": "synapse_pyspark" 297 | # META } 298 | 299 | # MARKDOWN ******************** 300 | 301 | # ## using max_by - no struct in sort 302 | 303 | # CELL ******************** 304 | 305 | # MAGIC %%timeit -n 1 -r 3 306 | # MAGIC df_latest_max_by_no_struct = df.groupBy(group_by_cols).agg(F.max_by(F.struct("*"), sorting_cols[0]).alias("latest")).select("latest.*") 307 | # MAGIC #display(df_latest_max_struct) 308 | # MAGIC #show_plan(df_latest_max_struct) 309 | # MAGIC evaluate_result(df_latest_max_by_no_struct) 310 | # MAGIC 311 | # MAGIC # SF1: 312 | # MAGIC # SF10: 27.5 s ± 4.28 s per loop (mean ± std. dev. of 3 runs, 1 loop each) 313 | # MAGIC # SF100: 52.8 s ± 1.21 s per loop (mean ± std. dev. 
of 3 runs, 1 loop each) 314 | 315 | # METADATA ******************** 316 | 317 | # META { 318 | # META "language": "python", 319 | # META "language_group": "synapse_pyspark" 320 | # META } 321 | 322 | # CELL ******************** 323 | 324 | show_plan(df_latest_max_by_no_struct) 325 | 326 | # METADATA ******************** 327 | 328 | # META { 329 | # META "language": "python", 330 | # META "language_group": "synapse_pyspark" 331 | # META } 332 | -------------------------------------------------------------------------------- /DataEngineering/Library/VisualizeExecutionPlan.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | import re, contextlib, io, math 3 | from graphviz import Digraph 4 | 5 | print("Loading VisualizeExecutionPlan library ...") 6 | 7 | def get_execution_plan(df: DataFrame) -> str: 8 | with contextlib.redirect_stdout(io.StringIO()) as stdout: 9 | df.explain(mode="cost") 10 | 11 | plan = stdout.getvalue() 12 | return plan 13 | 14 | 15 | class PlanNode: 16 | plan_type: str 17 | level: int 18 | line: str 19 | line_number: int 20 | parent: any 21 | identifier: str 22 | table: str 23 | operation: str 24 | sub_operation: str 25 | 26 | size_in_bytes: int 27 | size: str 28 | 29 | node_type: str 30 | node_label: str 31 | node_tooltip: str 32 | 33 | edge_label: str 34 | edge_tooltip: str 35 | 36 | node_matching_text: str 37 | 38 | def __init__(self, line, line_number, plan_type): 39 | if line.startswith("*"): # replace whole-stage code gen prefix 40 | line = re.sub('([^*]*)\*\([0-9]*\)\s(.*)', r'\1\2', line) 41 | 42 | self.line = line 43 | self.line_number = line_number 44 | self.plan_type = plan_type 45 | 46 | self.level = self.get_level() 47 | 48 | self.parent = None 49 | 50 | self.populate_fields() 51 | 52 | def populate_fields(self): 53 | 54 | self.text = self.get_text() 55 | 56 | self.identifier = self.get_identifier() 57 | self.table = self.get_table() 58 | self.operation = self.get_operation() 59 | self.sub_operation = self.get_sub_operation() 60 | self.size_in_bytes = self.get_size_in_bytes() 61 | self.size = self.get_size() 62 | self.node_type = self.get_node_type() 63 | self.node_label = self.get_node_label() 64 | self.node_tooltip = self.get_node_tooltip() 65 | self.edge_label = self.get_edge_label() 66 | self.edge_tooltip = self.get_edge_tooltip() 67 | 68 | self.node_matching_text = self.get_node_matching_text() 69 | 70 | def get_level(self) -> int: 71 | return int(re.search(r'[A-Z]', self.line).start() / 3) 72 | 73 | def get_parent(self, skip_ops: list[str] = []): 74 | if not self.parent: 75 | return None 76 | 77 | it = self.parent 78 | while it.get_operation() in skip_ops: 79 | it = it.parent 80 | 81 | return it 82 | 83 | def get_identifier(self): 84 | return str(self.line_number) 85 | 86 | def get_operation(self) -> str: 87 | #m = re.search('^[:\s+-]*(.*?)[\(),\s]', self.line) 88 | m = re.search('([A-Za-z]+)', self.line) 89 | if m: 90 | return m.group(1) 91 | else: 92 | return self.line 93 | 94 | def get_sub_operation(self) -> str: 95 | m = re.search('^[:\s+-]*([A-Za-z]+)\s([A-Za-z]+)', self.line) 96 | if m: 97 | if m.group(1) in ["Join", "FileScan"]: 98 | return m.group(2) 99 | return m.group(1) 100 | else: 101 | return self.get_operation() 102 | 103 | def get_table(self) -> str: 104 | m = re.search('([a-z0-9-_]*)\.([a-z0-9-_]*)\.([a-z0-9-_]*)', self.line) 105 | if m: 106 | #return f"{m.group(1)}.{m.group(2)}.{m.group(3)}" 107 | return f"{m.group(2)}.{m.group(3)}" 108 | else: 109 | return None 
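# Cost-mode plans annotate operators with estimated statistics, e.g. a fragment such as
#   "Statistics(sizeInBytes=12.3 MiB, rowCount=1.00E+5)"   (illustrative example)
# get_size_in_bytes() below extracts the number and unit from that annotation, strips the
# binary "i" (MiB -> MB) and converts the value to bytes via the size_names lookup.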
110 | 111 | def get_size_in_bytes(self) -> int: 112 | # from https://semyonsinchenko.github.io/ssinchenko/post/estimation-spark-df-size/ 113 | m = re.search('sizeInBytes\s*=\s*([0-9.]*)\s(.*?)[),]', self.line) 114 | if m: 115 | size = float(m.group(1)) 116 | units = m.group(2) 117 | else: 118 | return -1 119 | 120 | units = units.replace("i", "") 121 | 122 | size_names = ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"] 123 | i = size_names.index(units) 124 | p = math.pow(1024, i) 125 | 126 | size = size * p 127 | 128 | return size # size in byte 129 | 130 | def get_size(self) -> str: 131 | size_bytes = self.get_size_in_bytes() 132 | if size_bytes == -1: 133 | return None 134 | 135 | if size_bytes == 0: 136 | return "0B" 137 | 138 | size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") 139 | i = int(math.floor(math.log(size_bytes, 1024))) 140 | p = math.pow(1024, i) 141 | s = round(size_bytes / p, 2) 142 | return "%s %s" % (s, size_name[i]) 143 | 144 | def get_node_type(self) -> str: 145 | if self.table: 146 | return "table" 147 | else: 148 | if "Join" in self.operation: 149 | return "join" 150 | return self.operation.lower() 151 | 152 | def get_node_label(self) -> str: 153 | if not self.parent: 154 | return f"RESULT" 155 | if self.node_type == "table": 156 | return self.get_table() 157 | 158 | ret = self.operation 159 | if self.node_type == "join": 160 | if self.plan_type == "logical": 161 | ret += "\n" + self.sub_operation 162 | elif self.plan_type == "physical": 163 | ret += "\n" + self.text.split(",")[-3] 164 | 165 | return ret 166 | 167 | def get_node_tooltip(self) -> str: 168 | return self.text 169 | 170 | def get_edge_label(self) -> str: 171 | return self.get_size() 172 | 173 | def get_edge_tooltip(self) -> str: 174 | return None 175 | 176 | def get_text(self) -> str: 177 | m = re.search('^[:\s+-]*(.*)', self.line) 178 | return m.group(1) 179 | 180 | def get_node_matching_text(self) -> str: 181 | if self.node_type == "table": 182 | return self.table 183 | if self.operation == "Filter": 184 | return self.text.split(',')[0] 185 | 186 | 187 | def execution_plan_to_nodes(exec_plan: str, plan_type: str = "combined") -> list[PlanNode]: 188 | assert plan_type in ["logical", "physical", "combined"] 189 | 190 | if plan_type == "combined": 191 | logical_nodes = execution_plan_to_nodes(exec_plan, "logical") 192 | physical_nodes = execution_plan_to_nodes(exec_plan, "physical") 193 | 194 | for node in physical_nodes: 195 | log_node = next((nd for nd in logical_nodes if nd.node_matching_text == node.node_matching_text), None) 196 | if log_node: 197 | node.size = log_node.size 198 | node.size_in_bytes = log_node.size_in_bytes 199 | node.edge_label = log_node.edge_label 200 | node.edge_tooltip = log_node.edge_tooltip 201 | 202 | return physical_nodes 203 | 204 | lines = [l for l in exec_plan.split("\n") if len(l.strip()) > 0] 205 | 206 | nodes = [] 207 | line_number = 1 208 | capture_started = False 209 | for line in lines: 210 | node = PlanNode(line, line_number, plan_type) 211 | line_number += 1 212 | 213 | if line.startswith("==") and line.endswith("=="): 214 | if not capture_started: 215 | if plan_type == "logical": 216 | if "== Optimized Logical Plan ==" in line: 217 | capture_started = True 218 | node.line = "RESULT" 219 | node.level = -1 220 | elif plan_type == "physical": 221 | if line == "== Physical Plan ==": 222 | capture_started = True 223 | node.line = "RESULT" 224 | node.level = -1 225 | else: 226 | raise Exception("Invalid plan_type! 
Only 'logical' and 'physical' are supported!") 227 | else: 228 | break 229 | 230 | if not capture_started: 231 | continue 232 | 233 | parent = next((nd for nd in reversed(nodes) if nd.level == node.level - 1), None) 234 | node.parent = parent 235 | node.populate_fields() 236 | 237 | nodes.append(node) 238 | 239 | return nodes 240 | 241 | 242 | # https://graphviz.readthedocs.io/en/stable/examples.html 243 | def get_graph_from_nodes(nodes: list[PlanNode], skip_operations: list[str] = []): 244 | g = Digraph(name="Execution Plan", comment='Execution Plan') 245 | 246 | g.attr(label=r'Execution Plan\nSizes are estimates based on table statistics\nThey are not reliable anymore after joins are involved!') 247 | g.attr(fontsize='12') 248 | 249 | g.attr('node', shape='box') 250 | g.attr('node', color='black') 251 | 252 | for node in nodes: 253 | if node.operation in skip_operations: 254 | continue 255 | 256 | if node.node_type == "table": 257 | color = 'lightgreen' 258 | elif node.node_type == "join": 259 | color = 'lightblue' 260 | else: 261 | color = 'white' 262 | 263 | g.node(node.identifier, node.get_node_label(), style='filled', fillcolor=color, tooltip=node.node_tooltip) 264 | 265 | if node.get_parent(skip_operations): 266 | g.edge(node.identifier, node.get_parent(skip_operations).identifier, node.edge_label, tooltip=node.edge_tooltip ) 267 | 268 | return g 269 | 270 | 271 | def get_plan_viz(df: DataFrame, skip_operations: list[str] = []): 272 | exec_plan = get_execution_plan(df) 273 | nodes = execution_plan_to_nodes(exec_plan) 274 | plan_viz = get_graph_from_nodes(nodes, skip_operations) 275 | 276 | return plan_viz 277 | 278 | 279 | def get_plan_viz_html(df: DataFrame, skip_operations: list[str] = [], image_format: str = 'svg'): 280 | plan_viz = get_plan_viz(df, skip_operations) 281 | 282 | # Set the format to 'svg' 283 | plan_viz.format = image_format 284 | 285 | return plan_viz.pipe().decode() 286 | 287 | 288 | def show_plan(df: DataFrame, displayHTML = None, skip_operations: list[str] = [], image_format: str = 'svg'): 289 | if displayHTML == None: 290 | # a simple display() might not work in Databricks 291 | display(get_plan_viz(df, skip_operations)) 292 | else: 293 | displayHTML(get_plan_viz_html(df, skip_operations, image_format)) -------------------------------------------------------------------------------- /Fabric/LibraryManager/LibraryManager.Notebook/notebook-content.py: -------------------------------------------------------------------------------- 1 | # Fabric notebook source 2 | 3 | # METADATA ******************** 4 | 5 | # META { 6 | # META "kernel_info": { 7 | # META "name": "synapse_pyspark" 8 | # META }, 9 | # META "dependencies": { 10 | # META "lakehouse": { 11 | # META "default_lakehouse": "2925655f-0293-4f32-8bc6-86ab989099a7", 12 | # META "default_lakehouse_name": "SomeLakehouse", 13 | # META "default_lakehouse_workspace_id": "ca0a79b9-9c03-40b5-9d6a-cfd3fda1c31e", 14 | # META "known_lakehouses": [ 15 | # META { 16 | # META "id": "2925655f-0293-4f32-8bc6-86ab989099a7" 17 | # META } 18 | # META ] 19 | # META } 20 | # META } 21 | # META } 22 | 23 | # MARKDOWN ******************** 24 | 25 | # # Fabric Library Manager 26 | # This notebook allows you to specify a set of notebooks from the current workspace to be bundled into a library. This library is stored in the `/Files` section of a lakehouse. 
Additionally a new notebook is created in the workspace called `load_LibraryManager` which you can call from your other notebooks using `%run` to load the LibraryManager and all its libraries into the current notebook session. 27 | # 28 | # If you change anything in the library notebooks, simply run this LibraryManager notebook again to also update the actual library thats imported in all notebooks. 29 | 30 | # CELL ******************** 31 | 32 | library_notebooks = [ 33 | { 34 | "notebook": "MyLibrary", 35 | "library_name": "MyLibrary.py" 36 | }, 37 | { 38 | "notebook": "MyOtherLibrary", 39 | "library_name": "MyOtherLibrary.py" 40 | }, 41 | { 42 | "notebook": "MyFabricSparkLibrary", 43 | "library_name": "MyFabricSparkLibrary.py" 44 | }, 45 | ] 46 | 47 | # METADATA ******************** 48 | 49 | # META { 50 | # META "language": "python", 51 | # META "language_group": "synapse_pyspark" 52 | # META } 53 | 54 | # CELL ******************** 55 | 56 | import os 57 | import base64 58 | import zipfile 59 | import time 60 | import json 61 | import requests 62 | 63 | import notebookutils 64 | from sempy.fabric import FabricRestClient 65 | 66 | # METADATA ******************** 67 | 68 | # META { 69 | # META "language": "python", 70 | # META "language_group": "synapse_pyspark" 71 | # META } 72 | 73 | # MARKDOWN ******************** 74 | 75 | # # Global Settings 76 | 77 | # CELL ******************** 78 | 79 | RUNTIME_CONTEXT = {k:v for k,v in notebookutils.runtime.context.items() if v is not None} 80 | 81 | LIBRARY_FOLDER = "/Libraries" # must start with "/" 82 | LIBRARY_NAME = "LibraryManager" 83 | LIBRARY_LAKEHOUSE_NAME = "SomeLakehouse" 84 | LIBRARY_IMPORT = f"from {LIBRARY_NAME} import *" # could also be a named import 85 | # LIBRARY_IMPORT = f"import {LIBRARY_NAME} as lm" 86 | LIBRARY_LOAD_NOTEBOOK_NAME = f"load_{LIBRARY_NAME}" 87 | 88 | LIBRARY_LAKEHOUSE = notebookutils.lakehouse.get(LIBRARY_LAKEHOUSE_NAME) 89 | 90 | assert LIBRARY_FOLDER.startswith("/"), "LIBRARY_FOLDER must start with '/'" 91 | 92 | # METADATA ******************** 93 | 94 | # META { 95 | # META "language": "python", 96 | # META "language_group": "synapse_pyspark" 97 | # META } 98 | 99 | # CELL ******************** 100 | 101 | REST_CLIENT = FabricRestClient() 102 | ALL_NOTEBOOKS = REST_CLIENT.get_paged(f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/notebooks") 103 | 104 | # we leverage the REST_CLIENT to get basic parameters for further requests 105 | dummy_call = REST_CLIENT.get(f"v1/workspaces") 106 | BASE_URL = dummy_call.url[:-13] 107 | HEADERS = dummy_call.request.headers 108 | 109 | invalid_libs = [lib for lib in library_notebooks if not lib["notebook"] in [nb["displayName"] for nb in ALL_NOTEBOOKS]] 110 | assert invalid_libs == [], f"The following Library Notebooks could not be found: {invalid_libs}" 111 | 112 | 113 | # METADATA ******************** 114 | 115 | # META { 116 | # META "language": "python", 117 | # META "language_group": "synapse_pyspark" 118 | # META } 119 | 120 | # MARKDOWN ******************** 121 | 122 | # # api_request 123 | # While we in general rely on SemPy library, there are some things that do not work for us or needed to be improved 124 | # 125 | # - you cannot run a POST request with a body using SemPy 126 | # - the default delay for Long-Running-Operations (LRO) is 20 seconds, which is way too long to simply get the definition of a notebook 127 | 128 | # CELL ******************** 129 | 130 | def api_request(method: str, api_path: str, body: dict = None, interval: int = 1): 131 | if body 
!= None: 132 | if method.upper() == "POST": 133 | response = requests.post(BASE_URL + api_path, json = body, headers = HEADERS) 134 | else: 135 | raise Exception("Only method POST is supported with a body") 136 | else: 137 | response = REST_CLIENT.request(method, api_path) 138 | 139 | if response.status_code == 202: 140 | lro_path = response.headers["Location"] 141 | 142 | while lro_path: 143 | time.sleep(interval) # lower interval than the original sempy request with LRO_wait = True 144 | response = REST_CLIENT.request("GET", lro_path) 145 | lro_path = response.headers.get("Location") 146 | 147 | if lro_path and lro_path.endswith("/result"): 148 | response = REST_CLIENT.request("GET", lro_path) 149 | return response.json() 150 | 151 | return response.json() 152 | 153 | if False: 154 | method = "POST" 155 | api_path = f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/items/{RUNTIME_CONTEXT['currentNotebookId']}/getDefinition?format=fabricGitSource" 156 | definition = api_request(method, api_path) 157 | 158 | # METADATA ******************** 159 | 160 | # META { 161 | # META "language": "python", 162 | # META "language_group": "synapse_pyspark" 163 | # META } 164 | 165 | # MARKDOWN ******************** 166 | 167 | # # notebook_to_library 168 | # 169 | # Downloads a notebook from the current workspace as `.py` file and stores it in the Lakehouse. From there we create the library where we add all individual libraries to a `.zip` file which is used and imported in the end. 170 | 171 | # CELL ******************** 172 | 173 | def notebook_to_library(notebook: str, library_name: str = None): 174 | if not library_name: library_name = f"{notebook}.py" 175 | assert library_name.endswith(".py"), "library_name must end with '.py'" 176 | 177 | library_path = f"{LIBRARY_LAKEHOUSE['properties']['abfsPath']}/Files{LIBRARY_FOLDER}/{library_name}" 178 | 179 | print(f"Uploading library '{library_name}' to {library_path} ... 
", end = "") 180 | 181 | library_notebook = [nb for nb in ALL_NOTEBOOKS if nb["displayName"] == notebook][0] 182 | notebook_id = library_notebook["id"] 183 | 184 | notebook_definition = api_request("POST", f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/items/{notebook_id}/getDefinition?format=fabricGitSource") 185 | notebook_part = [part for part in notebook_definition["definition"]["parts"] if part["path"].startswith("notebook-content")][0] 186 | 187 | try: 188 | file_content = base64.b64decode(notebook_part["payload"]).decode("utf-8") 189 | mssparkutils.fs.put(library_path, file_content, True) 190 | print("Done!") 191 | except Exception as e: 192 | print("ERROR!") 193 | print(str(e)) 194 | 195 | 196 | # METADATA ******************** 197 | 198 | # META { 199 | # META "language": "python", 200 | # META "language_group": "synapse_pyspark" 201 | # META } 202 | 203 | # MARKDOWN ******************** 204 | 205 | # # Update all know libraries 206 | 207 | # CELL ******************** 208 | 209 | for library_notebook in library_notebooks: 210 | notebook_to_library(**library_notebook) 211 | 212 | # METADATA ******************** 213 | 214 | # META { 215 | # META "language": "python", 216 | # META "language_group": "synapse_pyspark" 217 | # META } 218 | 219 | # MARKDOWN ******************** 220 | 221 | # # Write final library as ZIP-file 222 | 223 | # CELL ******************** 224 | 225 | # write CustomLibrary.zip 226 | libraries_local_path = f"/lakehouse/default/Files{LIBRARY_FOLDER}" 227 | 228 | libraries = [lib["library_name"] for lib in library_notebooks] 229 | 230 | try: 231 | os.remove(f"{libraries_local_path}/{LIBRARY_NAME}.zip") 232 | except OSError: 233 | pass 234 | 235 | 236 | print(f"Writing __init__.py file ... ", end = "") 237 | with open(f"{libraries_local_path}/__init__.py", "w") as init_file: 238 | init_file.writelines([f"from .{lib[:-3]} import *\n" for lib in libraries if lib != "__init__.py"]) 239 | print("Done!") 240 | 241 | print(f"Creating new archive at {libraries_local_path}/{LIBRARY_NAME}.zip ... ") 242 | with zipfile.ZipFile(f"{libraries_local_path}/{LIBRARY_NAME}.zip", "w", zipfile.ZIP_DEFLATED) as myzip: 243 | libs_to_add = libraries + ["__init__.py"] 244 | for lib in libs_to_add: 245 | print(f"\tAdding '{lib}' to archive ... ", end = "") 246 | myzip.write(f"{libraries_local_path}/{lib}", f"/{LIBRARY_NAME}/{lib}") 247 | print("Done!") 248 | 249 | print("Done!") 250 | 251 | # METADATA ******************** 252 | 253 | # META { 254 | # META "language": "python", 255 | # META "language_group": "synapse_pyspark" 256 | # META } 257 | 258 | # MARKDOWN ******************** 259 | 260 | # # Write the notebook to initialize the LibraryManager 261 | 262 | # CELL ******************** 263 | 264 | 265 | init_script = f"""# Fabric notebook source 266 | 267 | # CELL ******************** 268 | 269 | library_path = "{LIBRARY_LAKEHOUSE['properties']['abfsPath']}/Files{LIBRARY_FOLDER}/{LIBRARY_NAME}.zip" 270 | print(f"Loading {LIBRARY_NAME} from '{{library_path}}' ... 
", end = "") 271 | sc.addPyFile(library_path) 272 | print("Done!") 273 | 274 | {LIBRARY_IMPORT} 275 | """ 276 | 277 | existing_notebook_id = [nb for nb in ALL_NOTEBOOKS if nb["displayName"] == LIBRARY_LOAD_NOTEBOOK_NAME] 278 | 279 | if existing_notebook_id: 280 | api_path = f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/items/{existing_notebook_id[0]['id']}/updateDefinition" 281 | body = {} 282 | else: 283 | api_path = f"v1/workspaces/{RUNTIME_CONTEXT['currentWorkspaceId']}/notebooks" 284 | body = { 285 | "displayName": LIBRARY_LOAD_NOTEBOOK_NAME, 286 | "description": "A notebook that can be run to load the LibraryManager via run" 287 | } 288 | 289 | definition = { 290 | "format": "fabricGitSource", 291 | "parts": [ 292 | { 293 | "path": "notebook-content.py", 294 | "payload": base64.b64encode(init_script.encode("utf-8")).decode("utf-8"), 295 | "payloadType": "InlineBase64" 296 | } 297 | ] 298 | } 299 | 300 | body["definition"] = definition 301 | 302 | result = api_request("POST", api_path, body) 303 | 304 | print(f"Successfully created notebook '{LIBRARY_LOAD_NOTEBOOK_NAME}' (ID = {result['id']})!") 305 | print(f"To load the LibraryManager into your notebooks, you can now use the following command in a notebook cell:") 306 | print("═"*80) 307 | print(f"%run {LIBRARY_LOAD_NOTEBOOK_NAME}") 308 | print("═"*80) 309 | 310 | 311 | # METADATA ******************** 312 | 313 | # META { 314 | # META "language": "python", 315 | # META "language_group": "synapse_pyspark" 316 | # META } 317 | --------------------------------------------------------------------------------