├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
└── notebooks
    ├── databricks_daily_cost_slack
    │   ├── README.md
    │   ├── databricks_cost_slack.ipynb
    │   └── databricks_daily_cost.png
    ├── delta_docs_pydantic
    │   ├── README.md
    │   └── delta_docs_pydantic.ipynb
    ├── migrate_workspace
    │   ├── README.md
    │   └── migrate_workspace.ipynb
    ├── pandas_delta
    │   ├── README.md
    │   ├── assets
    │   │   ├── databricks_sql_python.png
    │   │   └── unity_catalog_cluster.png
    │   └── pandas_delta.ipynb
    ├── update_job_cluster
    │   ├── README.md
    │   └── update_job_cluster.ipynb
    ├── workflow_calendar
    │   ├── README.md
    │   ├── assets
    │   │   └── example_viz.png
    │   └── workflow_calender.ipynb
    └── workflow_config_exporter
        ├── README.md
        ├── assets
        │   └── example_config.png
        └── workflow_config_exporter.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to `databricks_helpers` 👷🏾♀️
2 |
3 | ## Contribution Guidelines
4 |
5 | - 🍴 **Fork the Repository:** Start by forking the repository to make your contributions.
6 |
7 | - 🌲 **Create a New Branch:** Always create a new branch for your contributions (`git checkout -b feature-branch`).
8 |
9 | - 🔐 **Handle Secrets Carefully:** Ensure that Databricks secrets, tokens, cluster configurations, or hostnames are never exposed publicly.
10 |
11 | - 📙 **Export as .ipynb:** Ensure that Databricks notebooks are exported as `.ipynb` files.
12 |
13 | - 📃 **Ensure Relevance:** Contributions should be directly related to Databricks, offering unique insights not found in official documentation or other common repositories.
14 |
15 | - 🔖 **Document Your Code:** Ensure your code is well-documented, explaining the purpose and functionality of your contribution.
16 |
17 | - 🧑🏻💻 **Follow Coding Conventions:** Ensure your code aligns with existing coding conventions for consistency and readability. Format your code cells. Our team can assist with this during PR review.
18 |
19 | - 🔨 **Test Your Code:** Ensure your code is thoroughly tested to maintain the repository's quality and reliability.
20 |
21 | - 🎫 **Submitting Issues:** Feel free to submit issues and enhancement requests, ensuring they are well-described and labeled.
22 |
23 | - 🤝 **Submitting Pull Requests:** Make sure your code is in a new branch and submit a pull request, ensuring it's well-described.
24 |
25 | Thank you for contributing to `databricks_helpers`! ❤️
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2023, Dotlas
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | 3. Neither the name of the copyright holder nor the names of its
16 | contributors may be used to endorse or promote products derived from
17 | this software without specific prior written permission.
18 |
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Databricks Helpers 🧱
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | Easy-to-use Databricks Notebooks for Admin Tasks.
13 | Made with ❤️ by Dotlas Inc
14 |
15 |
16 | ## About
17 |
18 | This repository contains a directory of Databricks notebooks that assist with administrative tasks for Databricks or otherwise serve as supporting utilities.
19 |
20 | For example, consider the following use-cases:
21 |
22 | * 📆 View a calendar of scheduled jobs to resolve conflicts in Databricks workflows.
23 | * 🐼 Upload a [Pandas](https://pypi.org/project/pandas) DataFrame to Delta Lake
24 | * 📑 Update Delta Lake table Documentation using [Pydantic](https://docs.pydantic.dev/latest/) Models
25 | * ➿ Migrate Jobs between Databricks workspaces
26 | * ⚙️ Mass-edit Job Clusters in Existing Jobs
27 |
28 | ## Directory
29 |
30 | | Notebook | Description |
31 | | --- | --- |
32 | | [Databricks Daily Cost to Slack](./notebooks/databricks_daily_cost_slack) | Schedule a daily cost summary of Databricks bills to notify you on Slack. |
33 | | [Workflow Calendar](./notebooks/workflow_calendar/README.md) | Visualize scheduled Jobs on a calendar, eyeball conflicts and view historic runs as a [Gantt](https://en.wikipedia.org/wiki/Gantt_chart) chart |
34 | | [Delta Docs with Pydantic](./notebooks/delta_docs_pydantic/README.md) | If you have pydantic models with fields containing `description` and `tags` that are used as data models, transfer these field descriptions to Delta lake columns as comments and tags. |
35 | | [Pandas to Delta](./notebooks/pandas_delta/README.md) | Use [databricks-sql-python](https://github.com/databricks/databricks-sql-python/) and [SQLAlchemy](https://pypi.org/project/sqlalchemy/) to upload a Pandas DataFrame to Delta Lake from outside a Databricks environment |
36 | | [Workspace Jobs Migration](./notebooks/migrate_workspace/README.md) | Migrate Workflows from one Databricks workspace to another |
37 | | [Job Cluster Update](./notebooks/update_job_cluster/README.md) | Use the Databricks API to mass-update Job and Task configs |
38 | | [Workflow Config Exporter](./notebooks/workflow_config_exporter/README.md) | Export existing workflow configuration and save it for future consumption |
39 |
40 | ## Discussions
41 |
42 | * Check out the launch discussion on this [LinkedIn Release Post](https://www.linkedin.com/feed/update/urn:li:activity:7119179773444030465), with a highlight from Databricks CEO, [Ali Ghodsi](https://www.linkedin.com/in/alighodsi).
43 | * Feel free to raise an issue on this repository to start a discussion about new features, bug fixes or enhancements.
44 | * See [CONTRIBUTING.md](./CONTRIBUTING.md) for guidelines when adding or modifying notebooks in this repository.
45 |
--------------------------------------------------------------------------------
/notebooks/databricks_daily_cost_slack/README.md:
--------------------------------------------------------------------------------
1 | # Databricks Daily Cost - Slack
2 |
3 | 
4 |
--------------------------------------------------------------------------------
/notebooks/databricks_daily_cost_slack/databricks_cost_slack.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {
8 | "byteLimit": 2048000,
9 | "rowLimit": 10000
10 | },
11 | "inputWidgets": {},
12 | "nuid": "2101dea2-6085-4982-aec2-b961bd745dd1",
13 | "showTitle": false,
14 | "title": ""
15 | }
16 | },
17 | "source": [
18 | "# Databricks Cost to Slack 🧱\n",
19 | "\n",
20 | "\n",
21 | "\n",
22 | "\n",
23 | "> Authors: [Eshwaran Venkat](github.com/cricksmaidiene)"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 0,
29 | "metadata": {
30 | "application/vnd.databricks.v1+cell": {
31 | "cellMetadata": {
32 | "byteLimit": 2048000,
33 | "rowLimit": 10000
34 | },
35 | "inputWidgets": {},
36 | "nuid": "07c123a1-3fa9-47e6-ac0d-545959aaee6e",
37 | "showTitle": true,
38 | "title": "Install Packages"
39 | }
40 | },
41 | "outputs": [],
42 | "source": [
43 | "!pip install tabulate -q"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 0,
49 | "metadata": {
50 | "application/vnd.databricks.v1+cell": {
51 | "cellMetadata": {
52 | "byteLimit": 2048000,
53 | "rowLimit": 10000
54 | },
55 | "inputWidgets": {},
56 | "nuid": "3b6078c1-599f-4805-bc32-d92fa236258c",
57 | "showTitle": true,
58 | "title": "Imports"
59 | }
60 | },
61 | "outputs": [],
62 | "source": [
63 | "import warnings\n",
64 | "warnings.filterwarnings(\"ignore\")\n",
65 | "\n",
66 | "import pandas as pd\n",
67 | "import os\n",
68 | "import requests"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 0,
74 | "metadata": {
75 | "application/vnd.databricks.v1+cell": {
76 | "cellMetadata": {
77 | "byteLimit": 2048000,
78 | "rowLimit": 10000
79 | },
80 | "inputWidgets": {},
81 | "nuid": "960f5304-ba21-4664-a960-019843601c98",
82 | "showTitle": true,
83 | "title": "Declare and Consume Notebook Parameters"
84 | }
85 | },
86 | "outputs": [],
87 | "source": [
88 | "dbutils.widgets.removeAll()\n",
89 | "dbutils.widgets.text(\"DAY_COUNT\", \"2\")\n",
90 | "dbutils.widgets.text(\"SLACK_WEBHOOK\", \"\")\n",
91 | "\n",
92 | "N: int = int(dbutils.widgets.get(\"DAY_COUNT\")) # Example value for N days ago, adjust as needed\n",
93 | "SLACK_WEBHOOK = dbutils.widgets.get(\"SLACK_WEBHOOK\")\n",
94 | "\n",
95 | "assert all([N, SLACK_WEBHOOK]), \"One or more required parameters not set\""
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 0,
101 | "metadata": {
102 | "application/vnd.databricks.v1+cell": {
103 | "cellMetadata": {
104 | "byteLimit": 2048000,
105 | "rowLimit": 10000
106 | },
107 | "inputWidgets": {},
108 | "nuid": "10496414-7426-43ea-9b61-2eee1bc520c0",
109 | "showTitle": true,
110 | "title": "Declare Slack Message Payload Template"
111 | }
112 | },
113 | "outputs": [],
114 | "source": [
115 | "# The message payload using Block Kit for formatting\n",
116 | "message_payload = {\n",
117 | " \"blocks\": [\n",
118 | " {\n",
119 | " \"type\": \"section\",\n",
120 | " \"text\": {\n",
121 | " \"type\": \"mrkdwn\",\n",
122 | " \"text\": \"\"\n",
123 | " }\n",
124 | " },\n",
125 | " {\n",
126 | " \"type\": \"divider\"\n",
127 | " },\n",
128 | " {\n",
129 | " \"type\": \"section\",\n",
130 | " \"text\": {\n",
131 | " \"type\": \"mrkdwn\",\n",
132 | " \"text\": \"\"\n",
133 | " }\n",
134 | " }\n",
135 | " ]\n",
136 | "}"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 0,
142 | "metadata": {
143 | "application/vnd.databricks.v1+cell": {
144 | "cellMetadata": {
145 | "byteLimit": 2048000,
146 | "rowLimit": 10000
147 | },
148 | "inputWidgets": {},
149 | "nuid": "47bec7b8-8abd-4fd4-9c8c-b0867798d307",
150 | "showTitle": true,
151 | "title": "Read Billing Data"
152 | }
153 | },
154 | "outputs": [],
155 | "source": [
156 | "df = spark.sql(\n",
157 | " f\"\"\"\n",
158 | " SELECT *, identity_metadata.run_as as user FROM system.billing.usage \n",
159 | " WHERE usage_date >= date_trunc('day', NOW()) - interval '{N} day' \n",
160 | " AND usage_date < date_trunc('day', NOW()) - interval '{N-1} day'\n",
161 | " \"\"\"\n",
162 | ").toPandas()\n",
163 | "\n",
164 | "print(df.shape)\n",
165 | "df.head()"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 0,
171 | "metadata": {
172 | "application/vnd.databricks.v1+cell": {
173 | "cellMetadata": {
174 | "byteLimit": 2048000,
175 | "implicitDf": true,
176 | "rowLimit": 10000
177 | },
178 | "inputWidgets": {},
179 | "nuid": "15612e57-f6e2-40ca-9c22-b02fee194135",
180 | "showTitle": true,
181 | "title": "Calculate cost by SKU"
182 | }
183 | },
184 | "outputs": [],
185 | "source": [
186 | "dbu_usd_prices = spark.sql(\n",
187 | " f\"\"\"\n",
188 | " SELECT sku_name, round(pricing.default, 2) as usd_price \n",
189 | " FROM system.billing.list_prices \n",
190 | " WHERE sku_name in (\n",
191 | " SELECT sku_name FROM system.billing.usage \n",
192 | " WHERE usage_date >= date_trunc('day', NOW()) - interval '{N} day' \n",
193 | " AND usage_date < date_trunc('day', NOW()) - interval '{N-1} day'\n",
194 | " )\n",
195 | " \"\"\"\n",
196 | ").toPandas()\n",
197 | "\n",
198 | "print(dbu_usd_prices.shape)\n",
199 | "dbu_usd_prices.head()"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 0,
205 | "metadata": {
206 | "application/vnd.databricks.v1+cell": {
207 | "cellMetadata": {
208 | "byteLimit": 2048000,
209 | "rowLimit": 10000
210 | },
211 | "inputWidgets": {},
212 | "nuid": "103a8a1a-b2c4-4933-8e63-0543414257bc",
213 | "showTitle": true,
214 | "title": "USD Calculation"
215 | }
216 | },
217 | "outputs": [],
218 | "source": [
219 | "df[\"usd\"] = (\n",
220 | " df[\"sku_name\"].map(dbu_usd_prices.set_index(\"sku_name\").to_dict()[\"usd_price\"])\n",
221 | " * df[\"usage_quantity\"]\n",
222 | ")\n",
223 | "df[\"usd\"] = df[\"usd\"].astype(float).round(2)\n",
224 | "print(df[\"usd\"].info())"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 0,
230 | "metadata": {
231 | "application/vnd.databricks.v1+cell": {
232 | "cellMetadata": {
233 | "byteLimit": 2048000,
234 | "rowLimit": 10000
235 | },
236 | "inputWidgets": {},
237 | "nuid": "10f51852-a121-4c5f-900c-6079d0036cdd",
238 | "showTitle": true,
239 | "title": "Calculate Final Daily Bill"
240 | }
241 | },
242 | "outputs": [],
243 | "source": [
244 | "report = df.groupby([\"billing_origin_product\"])[[\"usd\"]].sum().reset_index()\n",
245 | "report.columns = [\"product\", \"cost\"]\n",
246 | "report['product'] = report['product'].str.replace(\"_\", \" \").str.title()\n",
247 | "report"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 0,
253 | "metadata": {
254 | "application/vnd.databricks.v1+cell": {
255 | "cellMetadata": {
256 | "byteLimit": 2048000,
257 | "rowLimit": 10000
258 | },
259 | "inputWidgets": {},
260 | "nuid": "d80ac4cc-4a32-48bb-b726-55f9ee7c119f",
261 | "showTitle": true,
262 | "title": "Prepare Messages"
263 | }
264 | },
265 | "outputs": [],
266 | "source": [
267 | "usage_date = df[\"usage_date\"].iloc[0].strftime(r\"%b %d\")\n",
268 | "day_cost = report[\"cost\"].sum()\n",
269 | "message_color = \"info\" if day_cost < 22 else \"error\"\n",
270 | "\n",
271 | "message_title = f\"🧱 *Databricks Cost* for *{usage_date}* is *${day_cost:,.2f}*\"\n",
272 | "message_log = f\"```{report.to_markdown(index=False)}```\"\n",
273 | "\n",
274 | "message_payload['blocks'][0]['text']['text'] = message_title\n",
275 | "message_payload['blocks'][2]['text']['text'] = message_log"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 0,
281 | "metadata": {
282 | "application/vnd.databricks.v1+cell": {
283 | "cellMetadata": {
284 | "byteLimit": 2048000,
285 | "rowLimit": 10000
286 | },
287 | "inputWidgets": {},
288 | "nuid": "091f0117-c47e-4c99-85e9-0dab4ca6a462",
289 | "showTitle": true,
290 | "title": "Post to Slack using Incoming Webhooks"
291 | }
292 | },
293 | "outputs": [],
294 | "source": [
295 | "# Sending the POST request to the Slack webhook URL\n",
296 | "response = requests.post(SLACK_WEBHOOK, json=message_payload)\n",
297 | "assert response.status_code == 200, \"Unable to post to slack\""
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 0,
303 | "metadata": {
304 | "application/vnd.databricks.v1+cell": {
305 | "cellMetadata": {},
306 | "inputWidgets": {},
307 | "nuid": "ae70af1e-12eb-4c34-b201-84eadf746212",
308 | "showTitle": false,
309 | "title": ""
310 | }
311 | },
312 | "outputs": [],
313 | "source": []
314 | }
315 | ],
316 | "metadata": {
317 | "application/vnd.databricks.v1+notebook": {
318 | "dashboards": [],
319 | "environmentMetadata": {
320 | "base_environment": "",
321 | "client": "1"
322 | },
323 | "language": "python",
324 | "notebookMetadata": {
325 | "mostRecentlyExecutedCommandWithImplicitDF": {
326 | "commandId": -1,
327 | "dataframes": [
328 | "_sqldf"
329 | ]
330 | },
331 | "pythonIndentUnit": 4
332 | },
333 | "notebookName": "databricks_cost_slack",
334 | "widgets": {
335 | "DAY_COUNT": {
336 | "currentValue": "1",
337 | "nuid": "7d1b9b75-b7db-4c39-ad5f-16c884db15c9",
338 | "typedWidgetInfo": {
339 | "autoCreated": false,
340 | "defaultValue": "2",
341 | "label": null,
342 | "name": "DAY_COUNT",
343 | "options": {
344 | "widgetDisplayType": "Text",
345 | "validationRegex": null
346 | },
347 | "parameterDataType": "String"
348 | },
349 | "widgetInfo": {
350 | "widgetType": "text",
351 | "defaultValue": "2",
352 | "label": null,
353 | "name": "DAY_COUNT",
354 | "options": {
355 | "widgetType": "text",
356 | "autoCreated": null,
357 | "validationRegex": null
358 | }
359 | }
360 | },
361 | "SLACK_WEBHOOK": {
362 | "currentValue": "",
363 | "nuid": "9a846535-f013-40e2-835c-8f8a29da9807",
364 | "typedWidgetInfo": {
365 | "autoCreated": false,
366 | "defaultValue": "",
367 | "label": null,
368 | "name": "SLACK_WEBHOOK",
369 | "options": {
370 | "widgetDisplayType": "Text",
371 | "validationRegex": null
372 | },
373 | "parameterDataType": "String"
374 | },
375 | "widgetInfo": {
376 | "widgetType": "text",
377 | "defaultValue": "",
378 | "label": null,
379 | "name": "SLACK_WEBHOOK",
380 | "options": {
381 | "widgetType": "text",
382 | "autoCreated": null,
383 | "validationRegex": null
384 | }
385 | }
386 | }
387 | }
388 | }
389 | },
390 | "nbformat": 4,
391 | "nbformat_minor": 0
392 | }
393 |
--------------------------------------------------------------------------------
/notebooks/databricks_daily_cost_slack/databricks_daily_cost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/databricks_daily_cost_slack/databricks_daily_cost.png
--------------------------------------------------------------------------------
/notebooks/delta_docs_pydantic/README.md:
--------------------------------------------------------------------------------
1 | # Update Metadata for Delta Lake using Pydantic Data Models 📑
2 |
3 |
4 |
5 |
6 |
7 |
8 | ## Introduction
9 |
10 | This notebook updates the metadata of an existing Delta Lake table using the table's equivalent Pydantic data model. This is especially useful if data flowing into Delta comes from an application that already has data classes or data models defining the schema of the raw data.
11 |
12 | > Note that updating Delta table metadata depends on the Pydantic models already being defined with `tags` and a `description` per `Field`.
13 |
14 | ## Use Cases
15 |
16 | The Delta Lake table metadata updater is helpful in the following use cases:
17 |
18 | 1. **Metadata Enrichment**: Enhance the quality of your data by adding descriptions and tags to your table columns.
19 |
20 | 2. **Automated Documentation**: Save time and effort by automatically generating metadata based on your Pydantic data models.
21 |
22 | 3. **Consistency and Quality**: Ensure consistent metadata across your Delta Lake tables.
23 |
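For illustration, here is a minimal sketch of the idea. The model, table name, and field metadata below are made up, and the Pydantic v1-style `Field`/`__fields__` API is assumed (matching the `pydantic < 1.11` requirement in the notebook); the notebook itself applies the resulting statements through Spark SQL.

```python
from pydantic import BaseModel, Field


class Restaurant(BaseModel):
    """Hypothetical data model whose field metadata documents a Delta table."""

    name: str = Field(..., description="Trading name of the restaurant", tags=["core"])
    city: str = Field(..., description="City the restaurant operates in", tags=["geo"])


# Build column COMMENT statements from the model's field metadata (Pydantic v1 API).
table = "my_catalog.my_schema.restaurants"  # placeholder table name
for column, model_field in Restaurant.__fields__.items():
    comment = (model_field.field_info.description or "").replace("'", "''")
    print(f"ALTER TABLE {table} ALTER COLUMN {column} COMMENT '{comment}'")
    # On Databricks, run each statement via spark.sql(...); column tags additionally
    # require Unity Catalog and its ALTER COLUMN ... SET TAGS syntax.
```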
24 | ---
25 | See more details in the notebook (ipynb)
26 |
--------------------------------------------------------------------------------
/notebooks/delta_docs_pydantic/delta_docs_pydantic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Update Delta Lake Documentation with Pydantic Data Models 📑"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {
13 | "application/vnd.databricks.v1+cell": {
14 | "cellMetadata": {},
15 | "inputWidgets": {},
16 | "nuid": "7807f803-ee1d-4e2c-9ddb-9284a3df60d6",
17 | "showTitle": false,
18 | "title": ""
19 | }
20 | },
21 | "source": [
22 | "## Requirements\n",
23 | "\n",
24 | "### Databricks\n",
25 | "* A Databricks Workspace & Workspace Access Token\n",
26 | "* At least one runnable cluster within the workspace\n",
27 | "* Workspace attached to a metastore for Delta Lake\n",
28 | "* Access to one or more Pydantic model classes, or instances of those classes\n",
29 | " \n",
30 | "### Packages\n",
31 | "`pandas` for data manipulation and `pydantic` for data modeling.\n",
32 | "\n",
33 | "* `pandas < 2.0`\n",
34 | "* `pydantic < 1.11`\n",
35 | "\n",
36 | "### Delta Table\n",
37 | "The table whose column description and tags you want to write / update needs to already exist in your delta lake\n",
38 | "\n",
39 | "### Infra\n",
40 | "A cluster is required to be running on the Databricks workspace from where the Delta lake will be accessed. This cluster will behave as an intermediary to accept connections and data from outside Databricks and add the data into Delta lake.\n",
41 | "\n",
42 | "> In order to add data to Unity catalog, the cluster must be configured to access Unity Catalog.\n",
43 | "\n",
44 | ""
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {
50 | "application/vnd.databricks.v1+cell": {
51 | "cellMetadata": {},
52 | "inputWidgets": {},
53 | "nuid": "256cfca3-dd69-4b03-b0d3-07504f2ed67a",
54 | "showTitle": false,
55 | "title": ""
56 | }
57 | },
58 | "source": [
59 | "## Imports"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {
66 | "application/vnd.databricks.v1+cell": {
67 | "cellMetadata": {
68 | "byteLimit": 2048000,
69 | "rowLimit": 10000
70 | },
71 | "inputWidgets": {},
72 | "nuid": "77998633-02bb-4a93-9e3b-f59273ce2d50",
73 | "showTitle": false,
74 | "title": ""
75 | }
76 | },
77 | "outputs": [],
78 | "source": [
79 | "from pydantic import BaseModel\n",
80 | "import pandas as pd"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {
86 | "application/vnd.databricks.v1+cell": {
87 | "cellMetadata": {},
88 | "inputWidgets": {},
89 | "nuid": "318bb8c0-456a-41a1-950e-bbd7ab706a12",
90 | "showTitle": false,
91 | "title": ""
92 | }
93 | },
94 | "source": [
95 | "## Inputs"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {
102 | "application/vnd.databricks.v1+cell": {
103 | "cellMetadata": {
104 | "byteLimit": 2048000,
105 | "rowLimit": 10000
106 | },
107 | "inputWidgets": {},
108 | "nuid": "46c7d8e8-0cd5-46a1-9e92-a49afc97da08",
109 | "showTitle": false,
110 | "title": ""
111 | }
112 | },
113 | "outputs": [],
114 | "source": [
115 | "dbutils.widgets.removeAll()\n",
116 | "\n",
117 | "dbutils.widgets.text(\"catalog\", \"\")\n",
118 | "catalog: str = getArgument(\"catalog\")\n",
119 | "\n",
120 | "dbutils.widgets.text(\"schema\", \"\")\n",
121 | "schema: str = getArgument(\"schema\")\n",
122 | "\n",
123 | "dbutils.widgets.text(\"table\", \"\")\n",
124 | "table: str = getArgument(\"table\")"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "\n",
132 | "\n",
133 | "## Steps 📊\n",
134 | "\n",
135 | "### 1. Input Pydantic Data Model 📝\n",
136 | "\n",
137 | "Initialize your pydantic data model which inherits from pydantic `BaseModel` where you have declared all the column descriptions and tags.\n",
138 | "\n",
139 | "### 2. Convert the Pydantic data model to a dataframe 🚀\n",
140 | "\n",
141 | "Next we convert the data model into a dataframe containing the relevant fields, making it easier to retrieve the needed data.\n",
142 | "\n",
143 | "\n",
144 | "### 3. Update Delta Lake Table 🔄\n",
145 | "\n",
146 | "Once you are satisfied with the inferred metadata, apply the updates to your Delta Lake table, and it will be enriched with the new descriptions and tags."
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {
152 | "application/vnd.databricks.v1+cell": {
153 | "cellMetadata": {},
154 | "inputWidgets": {},
155 | "nuid": "e1357b26-0eb2-4206-877b-b33e4d253d93",
156 | "showTitle": false,
157 | "title": ""
158 | }
159 | },
160 | "source": [
161 | "## Create your pydantic data model class\n",
162 | "#### Example:\n",
163 | "```python\n",
164 | "class pydantic_data_model(BaseModel):\n",
165 | " column_1: str = Field(\n",
166 | " ...,\n",
167 | " title=\"Column One\",\n",
168 | " description=\"The is column one\",\n",
169 | " tags=[\"test_tag_1\"],\n",
170 | " )\n",
171 | " column_2: str = Field(\n",
172 | " ...,\n",
173 | " title=\"Column Two\",\n",
174 | " description=\"The is column two\",\n",
175 | " tags=[\"test_tag_2\"],\n",
176 | " )\n",
177 | " column_3: str = Field(\n",
178 | " ...,\n",
179 | " title=\"Column Three\",\n",
180 | " description=\"The is column three\",\n",
181 | " tags=[\"test_tag_3\"],\n",
182 | " )\n",
183 | "\n",
184 | "```"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "application/vnd.databricks.v1+cell": {
192 | "cellMetadata": {
193 | "byteLimit": 2048000,
194 | "rowLimit": 10000
195 | },
196 | "inputWidgets": {},
197 | "nuid": "bdda71da-98a8-432b-8653-21a62a91be56",
198 | "showTitle": false,
199 | "title": ""
200 | }
201 | },
202 | "outputs": [],
203 | "source": [
204 | "#initialize your pydantic datamodel class here with the class name as pydantic_date_model which inherits from BaseModel\n",
205 | "pydantic_data_model = None"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {
211 | "application/vnd.databricks.v1+cell": {
212 | "cellMetadata": {},
213 | "inputWidgets": {},
214 | "nuid": "008093c5-05e9-4ad4-aca6-533513d97ed4",
215 | "showTitle": false,
216 | "title": ""
217 | }
218 | },
219 | "source": [
220 | "## Parse the data model \n",
221 | "#### Convert the declared data model to a data frame containing the needed info"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {
228 | "application/vnd.databricks.v1+cell": {
229 | "cellMetadata": {},
230 | "inputWidgets": {},
231 | "nuid": "d8f2e2b1-b113-455e-9dcd-b41169fc384f",
232 | "showTitle": false,
233 | "title": ""
234 | }
235 | },
236 | "outputs": [],
237 | "source": [
238 | "def create_data_dictionary(model: type[BaseModel]) -> pd.DataFrame:\n",
239 | " \"\"\"Describe the fields of a pydantic model as a pandas DataFrame.\n",
240 | "\n",
241 | " Args:\n",
242 | " model (Type[BaseModel]): A pydantic model.\n",
243 | "\n",
244 | " Returns:\n",
245 | " pd.DataFrame: A pandas DataFrame describing the model.\n",
246 | " \"\"\"\n",
247 | " return pd.DataFrame(\n",
248 | " [\n",
249 | " {\n",
250 | " \"field_name\": field,\n",
251 | " \"field_title\": field_mf.field_info.title,\n",
252 | " \"python_type\": field_type\n",
253 | " if \"Workspace Migration ✈️
2 |
3 |
4 |
5 |
6 | ## Introduction
7 |
8 | This notebook migrates clusters and workflows from one workspace to another using the Databricks REST API. It works by fetching the current cluster and workflow configs and then using them to recreate the same resources in a new workspace.
9 |
10 | ## Use Cases
11 |
12 | Areas where such a notebook may be helpful:
13 |
14 | 1. **Migrating clusters and workflows to a new workspace**: This is the obvious use case, and the notebook would be particularly useful for large or complex workspaces, where migrating everything manually would be time-consuming and error-prone.
15 | 2. **Creating a new workspace from scratch**: The notebook could be used to quickly create a new workspace with the same clusters and workflows as an existing workspace. This could be useful for creating a development or staging environment, or for creating a new workspace for a specific project or team.
16 |
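As a rough sketch of the job half of that flow (hostnames and tokens below are placeholders; the endpoints are the standard Jobs API calls used by the notebook):

```python
import requests

OLD_HOST, OLD_TOKEN = "https://old-workspace.cloud.databricks.com", "<old token>"
NEW_HOST, NEW_TOKEN = "https://new-workspace.cloud.databricks.com", "<new token>"

# 1. Fetch job configs (including task and job-cluster definitions) from the old workspace.
#    (Pagination omitted for brevity; the notebook handles it.)
jobs = requests.get(
    f"{OLD_HOST}/api/2.1/jobs/list",
    headers={"Authorization": f"Bearer {OLD_TOKEN}"},
    params={"limit": 100, "expand_tasks": "true"},
).json().get("jobs", [])

# 2. Parse: keep only the job settings and strip anything workspace-specific.
settings = [job["settings"] for job in jobs]

# 3. Recreate each job in the new workspace.
for job_settings in settings:
    requests.post(
        f"{NEW_HOST}/api/2.1/jobs/create",
        headers={"Authorization": f"Bearer {NEW_TOKEN}"},
        json=job_settings,
    ).raise_for_status()
```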
17 | ---
18 | See more details in the notebook (ipynb)
19 |
--------------------------------------------------------------------------------
/notebooks/migrate_workspace/migrate_workspace.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "d08046ea-5af6-4d2e-9bfb-483adbd72f55",
10 | "showTitle": false,
11 | "title": ""
12 | }
13 | },
14 | "source": [
15 | "# Workspace Migration ✈️\n",
16 | "\n",
17 | "## Requirements\n",
18 | "\n",
19 | "### Databricks\n",
20 | "\n",
21 | "* Two Databricks Workspaces & Workspace Access Tokens for the same\n",
22 | "* At least one runnable cluster within any workspace\n",
23 | "\n",
24 | "> Note: The word `job` and `wokflow` is used interchangeably throughout "
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {
31 | "application/vnd.databricks.v1+cell": {
32 | "cellMetadata": {
33 | "byteLimit": 2048000,
34 | "rowLimit": 10000
35 | },
36 | "inputWidgets": {},
37 | "nuid": "aae8da7c-d1f1-4f52-ac7e-28c1b1e686e2",
38 | "showTitle": false,
39 | "title": ""
40 | }
41 | },
42 | "outputs": [],
43 | "source": [
44 | "import json\n",
45 | "\n",
46 | "import requests\n",
47 | "from typing import Optional, Callable"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {
53 | "application/vnd.databricks.v1+cell": {
54 | "cellMetadata": {
55 | "byteLimit": 2048000,
56 | "rowLimit": 10000
57 | },
58 | "inputWidgets": {},
59 | "nuid": "bef603a0-00df-4319-b7eb-0ff8f3bbbeb9",
60 | "showTitle": false,
61 | "title": ""
62 | }
63 | },
64 | "source": [
65 | "## Steps 📊\n",
66 | "\n",
67 | "\n",
68 | "### 1. Fetch workflow / cluster configurations 📬\n",
69 | "\n",
70 | "We fetch all the workflows/clusters present in your workspace, each fetched workflow config will also contain the individual task config present in the workflow and their respective job cluster configs. \n",
71 | "\n",
72 | "### 2. Parse Information 🧩\n",
73 | "\n",
74 | "In this step we parse the obtained config info. The main thing to keep in mind is that the cluster config contains some fields which are populated after the cluster is initialized but will be fetched anyway from step 1, we need to remove this field or else when we use the same config to create the workflow later it will throw an error. You can also add any custom logic here. For example: You can include webhook notification ID to be associated with a workflow you like, You can also associate an existing all-purpose-compute to a workflow that you want, etc. \n",
75 | "\n",
76 | "### 3. Create new workflow / config 👶🏽\n",
77 | "\n",
78 | "Using the parsed info we create workflows/clusters in the new workspace.\n"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {
84 | "application/vnd.databricks.v1+cell": {
85 | "cellMetadata": {},
86 | "inputWidgets": {},
87 | "nuid": "4f517d5d-b817-4e75-85d6-5de5b317bbf9",
88 | "showTitle": false,
89 | "title": ""
90 | }
91 | },
92 | "source": [
93 | "### Set up workspace urls and access tokens\n"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "application/vnd.databricks.v1+cell": {
101 | "cellMetadata": {
102 | "byteLimit": 2048000,
103 | "rowLimit": 10000
104 | },
105 | "inputWidgets": {},
106 | "nuid": "d2688c9e-f89c-4b7b-b84c-913016163080",
107 | "showTitle": false,
108 | "title": ""
109 | }
110 | },
111 | "outputs": [],
112 | "source": [
113 | "dbutils.widgets.removeAll()\n",
114 | "\n",
115 | "dbutils.widgets.text(\"old_workspace_url\", \"\")\n",
116 | "old_workspace_url: str = getArgument(\"old_workspace_url\")\n",
117 | "\n",
118 | "dbutils.widgets.text(\"old_workspace_token\", \"\")\n",
119 | "old_workspace_token: str = getArgument(\"old_workspace_token\")\n",
120 | "\n",
121 | "dbutils.widgets.text(\"new_workspace_url\", \"\")\n",
122 | "new_workspace_url: str = getArgument(\"new_workspace_url\")\n",
123 | "\n",
124 | "dbutils.widgets.text(\"new_workspace_token\", \"\")\n",
125 | "new_workspace_token: str = getArgument(\"new_workspace_token\")\n",
126 | "\n",
127 | "\n",
128 | "query_params = {\n",
129 | " \"LIST_JOBS_LIMIT\": 100, # max limit\n",
130 | " \"EXPAND_TASKS\": \"true\", # provides the complete config info for each job\n",
131 | "}"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "application/vnd.databricks.v1+cell": {
139 | "cellMetadata": {
140 | "byteLimit": 2048000,
141 | "rowLimit": 10000
142 | },
143 | "inputWidgets": {},
144 | "nuid": "3dae11b4-39ad-4229-9c06-de60b96a770f",
145 | "showTitle": false,
146 | "title": ""
147 | }
148 | },
149 | "outputs": [],
150 | "source": [
151 | "def paginate(\n",
152 | " can_paginate: bool,\n",
153 | " next_page_token: Optional[str],\n",
154 | " url: str,\n",
155 | " workspace_token: str,\n",
156 | " function_to_call: Callable,\n",
157 | ") -> None:\n",
158 | " \"\"\"\n",
159 | " Paginates to the next page if possible\n",
160 | " input:\n",
161 | " can_paginate [bool]: Boolean info about wheather there is additional info.\n",
162 | " next_page_token [str]: Token needed in url query param to paginate to next page.\n",
163 | " url [str]: Url used to list the needed info.\n",
164 | " function_to_call [Callable]: Function that gets called with the paginated url to paginate further.\n",
165 | " output:\n",
166 | " None\n",
167 | " \"\"\"\n",
168 | "\n",
169 | " if next_page_token and can_paginate:\n",
170 | " if \"&page_token\" in url:\n",
171 | " url = f\"{url[:url.find('&page_token')]}&page_token={next_page_token}\"\n",
172 | " else:\n",
173 | " url = f\"{url}&page_token={next_page_token}\"\n",
174 | "\n",
175 | " function_to_call(url, workspace_token)\n",
176 | " else:\n",
177 | " return"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {
183 | "application/vnd.databricks.v1+cell": {
184 | "cellMetadata": {},
185 | "inputWidgets": {},
186 | "nuid": "3a11f70f-5c0d-41b6-92f7-74684f5a606a",
187 | "showTitle": false,
188 | "title": ""
189 | }
190 | },
191 | "source": [
192 | "## List Clusters \n",
193 | "#### Fetches all clusters in current workspace and its respective configs\n",
194 | "API Docs \n"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "application/vnd.databricks.v1+cell": {
202 | "cellMetadata": {
203 | "byteLimit": 2048000,
204 | "rowLimit": 10000
205 | },
206 | "inputWidgets": {},
207 | "nuid": "9dece1e7-42d3-437a-a375-f56cc87b9074",
208 | "showTitle": false,
209 | "title": ""
210 | }
211 | },
212 | "outputs": [],
213 | "source": [
214 | "def getAllClusters(list_clusters_url: str, workspace_token: str) -> None:\n",
215 | " \"\"\"\n",
216 | " Fetches all the clusters and metadata about them.\n",
217 | " input:\n",
218 | " list_clusters_url [str]: Databricks API used to fetch all the clusters.\n",
219 | " workspace_token [str]: Databricks workspace access token.\n",
220 | " output:\n",
221 | " None\n",
222 | " \"\"\"\n",
223 | "\n",
224 | " response = requests.get(\n",
225 | " list_clusters_url,\n",
226 | " headers={\"Authorization\": f\"Bearer {workspace_token}\"},\n",
227 | " )\n",
228 | " assert response.status_code == 200\n",
229 | "\n",
230 | " response_data = response.json()\n",
231 | "\n",
232 | " for cluster_info in response_data.get(\"clusters\", []):\n",
233 | " clusters.append(cluster_info)\n",
234 | "\n",
235 | " paginate(\n",
236 | " response_data.get(\"has_more\", False),\n",
237 | " response_data.get(\"next_page_token\"),\n",
238 | " list_clusters_url,\n",
239 | " workspace_token,\n",
240 | " getAllClusters,\n",
241 | " )\n",
242 | "\n",
243 | "\n",
244 | "clusters = [] # holds all cluster' info\n",
245 | "List_clusters_url = str(old_workspace_url + \"/api/2.0/clusters/list\")\n",
246 | "getAllClusters(List_clusters_url, old_workspace_token)"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {
252 | "application/vnd.databricks.v1+cell": {
253 | "cellMetadata": {},
254 | "inputWidgets": {},
255 | "nuid": "804512f8-b360-4ca3-8d85-6f63a8ae235d",
256 | "showTitle": false,
257 | "title": ""
258 | }
259 | },
260 | "source": [
261 | "## Filter and Parse info"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {
268 | "application/vnd.databricks.v1+cell": {
269 | "cellMetadata": {
270 | "byteLimit": 2048000,
271 | "rowLimit": 10000
272 | },
273 | "inputWidgets": {},
274 | "nuid": "8134e80d-8207-45ca-adab-9314791403ce",
275 | "showTitle": false,
276 | "title": ""
277 | }
278 | },
279 | "outputs": [],
280 | "source": [
281 | "def filterClusters(cluster_info: dict) -> bool:\n",
282 | " \"\"\"Filter clusters based on custom logic\"\"\"\n",
283 | " return True\n",
284 | "\n",
285 | "\n",
286 | "def parseClusters(cluster_info: dict) -> dict:\n",
287 | " \"\"\"Modefies the cluster config.\n",
288 | " input:\n",
289 | " cluster_info [dict]: Dict containing all the config info about the cluster.\n",
290 | " output:\n",
291 | " dict : parsed result in accordance with the `create cluster` api payload.\"\"\"\n",
292 | " if cluster_info.get(\"aws_attributes\"):\n",
293 | " cluster_info.pop(\"aws_attributes\")\n",
294 | " if cluster_info.get(\"cluster_id\"):\n",
295 | " cluster_info.pop(\"cluster_id\")\n",
296 | "\n",
297 | " # add more custom parsing logic if needed\n",
298 | " return cluster_info\n",
299 | "\n",
300 | "\n",
301 | "filtered_clusters = []\n",
302 | "\n",
303 | "# filter\n",
304 | "for cluster_info in clusters:\n",
305 | " if filterClusters(cluster_info):\n",
306 | " filtered_clusters.append(cluster_info)\n",
307 | "\n",
308 | "# parse\n",
309 | "for idx in range(len(filtered_clusters)):\n",
310 | " cluster_info = filtered_clusters[idx]\n",
311 | " parsed_cluster_info = parseClusters(cluster_info)\n",
312 | " filtered_clusters[idx] = parsed_cluster_info\n",
313 | "\n",
314 | "clusters = filtered_clusters"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {
320 | "application/vnd.databricks.v1+cell": {
321 | "cellMetadata": {
322 | "byteLimit": 2048000,
323 | "rowLimit": 10000
324 | },
325 | "inputWidgets": {},
326 | "nuid": "1f5dc0e5-e714-43bb-a014-7e55a301119d",
327 | "showTitle": false,
328 | "title": ""
329 | }
330 | },
331 | "source": [
332 | "## Create new cluster\n",
333 | "#### Use the parsed info as payload to create clusters in the new workspace\n",
334 | "API Docs \n",
335 | "\n"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {
342 | "application/vnd.databricks.v1+cell": {
343 | "cellMetadata": {},
344 | "inputWidgets": {},
345 | "nuid": "97b639a3-6bd6-4132-9c3b-4802ca5cf73b",
346 | "showTitle": false,
347 | "title": ""
348 | }
349 | },
350 | "outputs": [],
351 | "source": [
352 | "for cluster_info in clusters:\n",
353 | " response = requests.post(\n",
354 | " f\"{new_workspace_url}/api/2.0/clusters/create\",\n",
355 | " headers={\n",
356 | " \"Content-Type\": \"application/json\",\n",
357 | " \"Authorization\": f\"Bearer {new_workspace_token}\",\n",
358 | " },\n",
359 | " data=json.dumps(cluster_info),\n",
360 | " )\n",
361 | " assert response.status_code in {\n",
362 | " 200,\n",
363 | " 201,\n",
364 | " }"
365 | ]
366 | },
367 | {
368 | "cell_type": "markdown",
369 | "metadata": {
370 | "application/vnd.databricks.v1+cell": {
371 | "cellMetadata": {},
372 | "inputWidgets": {},
373 | "nuid": "1383ed56-7a81-4e91-ae46-a45f59ee65d9",
374 | "showTitle": false,
375 | "title": ""
376 | }
377 | },
378 | "source": [
379 | "## List Workflows \n",
380 | "#### Fetches all workflows in current workspace and its respective configs\n",
381 | "API Docs \n"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {
388 | "application/vnd.databricks.v1+cell": {
389 | "cellMetadata": {},
390 | "inputWidgets": {},
391 | "nuid": "3ed71bea-8300-4c9a-9601-ed22e50b406c",
392 | "showTitle": false,
393 | "title": ""
394 | }
395 | },
396 | "outputs": [],
397 | "source": [
398 | "def getAllJobs(list_jobs_url: str, workspace_token: str) -> None:\n",
399 | " \"\"\"\n",
400 | " Fetches all the jobs and metadata about them.\n",
401 | " input:\n",
402 | " lists_jobs_url [str]: Databricks API used to fetch all the jobs.\n",
403 | " workspace_token [str]: Databricks workspace access token.\n",
404 | " output:\n",
405 | " None\n",
406 | " \"\"\"\n",
407 | "\n",
408 | " response = requests.get(\n",
409 | " list_jobs_url,\n",
410 | " headers={\"Authorization\": f\"Bearer {workspace_token}\"},\n",
411 | " )\n",
412 | " assert response.status_code == 200\n",
413 | "\n",
414 | " response_data = response.json()\n",
415 | "\n",
416 | " for job in response_data.get(\"jobs\", []):\n",
417 | " jobs.append(job.get(\"settings\"))\n",
418 | "\n",
419 | " paginate(\n",
420 | " response_data.get(\"has_more\", False),\n",
421 | " response_data.get(\"next_page_token\"),\n",
422 | " list_jobs_url,\n",
423 | " workspace_token,\n",
424 | " getAllJobs,\n",
425 | " )\n",
426 | "\n",
427 | "\n",
428 | "jobs = [] # holds all jobs' info\n",
429 | "List_jobs_url = str(\n",
430 | " old_workspace_url\n",
431 | " + \"/api/2.1/jobs/list?\"\n",
432 | " + f\"limit={query_params.get('LIST_JOBS_LIMIT')}\"\n",
433 | " + f\"&expand_tasks={query_params.get('EXPAND_TASKS')}\"\n",
434 | ")\n",
435 | "getAllJobs(List_jobs_url, old_workspace_token)"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "metadata": {
441 | "application/vnd.databricks.v1+cell": {
442 | "cellMetadata": {},
443 | "inputWidgets": {},
444 | "nuid": "26d699ee-6076-49aa-bd54-8a5d918936b3",
445 | "showTitle": false,
446 | "title": ""
447 | }
448 | },
449 | "source": [
450 | "## Filter and Parse info\n",
451 | "#### Some of the parsing we can do \n",
452 | "1. You can add new webhook notif ID \n",
453 | "2. Tag an existing all-prupose compute to the workflow \n",
454 | "3. Tag an existing task if the new task (from the workflow) depends on it"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": null,
460 | "metadata": {
461 | "application/vnd.databricks.v1+cell": {
462 | "cellMetadata": {},
463 | "inputWidgets": {},
464 | "nuid": "7f8252ae-94e8-4513-9e60-a4b4b62d2054",
465 | "showTitle": false,
466 | "title": ""
467 | }
468 | },
469 | "outputs": [],
470 | "source": [
471 | "def filterWorkflows(workflow_info: dict) -> bool:\n",
472 | " \"\"\"Filter Workflow based on custom logic\"\"\"\n",
473 | " return True\n",
474 | "\n",
475 | "\n",
476 | "def parseWorkflows(workflow_info: dict) -> dict:\n",
477 | " \"\"\"Modefies the workflow config.\n",
478 | " input:\n",
479 | " workflow_info [dict]: Dict containing all the config info about the workflow.\n",
480 | " output:\n",
481 | " dict : parsed result in accordance with the `create job` api payload.\"\"\"\n",
482 | " for cluster_info in workflow_info.get(\n",
483 | " \"job_clusters\", []\n",
484 | " ): # below parsing is same for cluster config payload too.\n",
485 | " if \"aws_attributes\" in cluster_info.get(\"new_cluster\"):\n",
486 | " cluster_info.get(\"new_cluster\").pop(\"aws_attributes\")\n",
487 | "\n",
488 | " # add more custom parsing logic if needed\n",
489 | " return workflow_info\n",
490 | "\n",
491 | "\n",
492 | "filtered_jobs = []\n",
493 | "\n",
494 | "# filter\n",
495 | "for workflow_info in jobs:\n",
496 | " if filterWorkflows(workflow_info):\n",
497 | " filtered_jobs.append(workflow_info)\n",
498 | "\n",
499 | "# parse\n",
500 | "for idx in range(len(filtered_jobs)):\n",
501 | " workflow_info = filtered_jobs[idx]\n",
502 | " parsed_workflow_info = parseWorkflows(workflow_info)\n",
503 | " filtered_jobs[idx] = parsed_workflow_info\n",
504 | "\n",
505 | "jobs = filtered_jobs"
506 | ]
507 | },
508 | {
509 | "cell_type": "markdown",
510 | "metadata": {
511 | "application/vnd.databricks.v1+cell": {
512 | "cellMetadata": {
513 | "byteLimit": 2048000,
514 | "rowLimit": 10000
515 | },
516 | "inputWidgets": {},
517 | "nuid": "61459da2-bf71-48eb-9bde-0be70462aa6d",
518 | "showTitle": false,
519 | "title": ""
520 | }
521 | },
522 | "source": [
523 | "## Create Workflow\n",
524 | "#### Use the parsed info to create workflow in new workspace\n",
525 | "API Docs \n"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": null,
531 | "metadata": {
532 | "application/vnd.databricks.v1+cell": {
533 | "cellMetadata": {},
534 | "inputWidgets": {},
535 | "nuid": "106ec485-e5a3-4dff-a5a9-6e01a68e68b1",
536 | "showTitle": false,
537 | "title": ""
538 | }
539 | },
540 | "outputs": [],
541 | "source": [
542 | "for workflow_info in jobs:\n",
543 | " response = requests.post(\n",
544 | " url=f\"{new_workspace_url}/api/2.1/jobs/create\",\n",
545 | " headers={\n",
546 | " \"Content-Type\": \"application/json\",\n",
547 | " \"Authorization\": f\"Bearer {new_workspace_token}\",\n",
548 | " },\n",
549 | " data=json.dumps(workflow_info),\n",
550 | " )\n",
551 | " assert response.status_code in {\n",
552 | " 200,\n",
553 | " 201,\n",
554 | " }"
555 | ]
556 | }
557 | ],
558 | "metadata": {
559 | "application/vnd.databricks.v1+notebook": {
560 | "dashboards": [],
561 | "language": "python",
562 | "notebookMetadata": {
563 | "pythonIndentUnit": 4
564 | },
565 | "notebookName": "migrate_workspace",
566 | "widgets": {
567 | "new_workspace_token": {
568 | "currentValue": "",
569 | "nuid": "47394cf3-4b2e-427e-ab85-7fe7998f33de",
570 | "widgetInfo": {
571 | "defaultValue": "",
572 | "label": null,
573 | "name": "new_workspace_token",
574 | "options": {
575 | "validationRegex": null,
576 | "widgetType": "text"
577 | },
578 | "widgetType": "text"
579 | }
580 | },
581 | "new_workspace_url": {
582 | "currentValue": "",
583 | "nuid": "efdcc97f-e245-4c68-bca9-992c5489cc0d",
584 | "widgetInfo": {
585 | "defaultValue": "",
586 | "label": null,
587 | "name": "new_workspace_url",
588 | "options": {
589 | "validationRegex": null,
590 | "widgetType": "text"
591 | },
592 | "widgetType": "text"
593 | }
594 | },
595 | "old_workspace_token": {
596 | "currentValue": "",
597 | "nuid": "f0561168-26a1-434e-af57-8b2405d96362",
598 | "widgetInfo": {
599 | "defaultValue": "",
600 | "label": null,
601 | "name": "old_workspace_token",
602 | "options": {
603 | "validationRegex": null,
604 | "widgetType": "text"
605 | },
606 | "widgetType": "text"
607 | }
608 | },
609 | "old_workspace_url": {
610 | "currentValue": "",
611 | "nuid": "1dfaf69b-5d6b-4782-bf25-d94605ef9848",
612 | "widgetInfo": {
613 | "defaultValue": "",
614 | "label": null,
615 | "name": "old_workspace_url",
616 | "options": {
617 | "validationRegex": null,
618 | "widgetType": "text"
619 | },
620 | "widgetType": "text"
621 | }
622 | }
623 | }
624 | },
625 | "language_info": {
626 | "name": "python"
627 | }
628 | },
629 | "nbformat": 4,
630 | "nbformat_minor": 0
631 | }
632 |
--------------------------------------------------------------------------------
/notebooks/pandas_delta/README.md:
--------------------------------------------------------------------------------
1 | # Delta Lake I/O with Pandas DataFrames
2 |
3 | 
4 | 
5 | 
6 |
7 | ## Use-Cases
8 |
9 | * Read and write Delta Lake tables into Pandas DataFrames.
10 | * Access Schemas and Catalogs as Pandas DataFrames
11 | * Access Delta lake from external services for table reads and writes
12 |
13 | ## Structure
14 |
15 | A running cluster on a Databricks workspace is required to mediate access and I/O between external pandas DataFrames and Delta Lake.
16 |
17 |
18 |
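For orientation, a minimal sketch of the approach (hostname, HTTP path, token, catalog, and table name below are placeholders, and the exact connection URL format depends on the `databricks-sql-connector` and SQLAlchemy versions pinned in the notebook):

```python
import pandas as pd
from sqlalchemy import create_engine

# Placeholders: copy these from your workspace and cluster configuration.
HOST = "dbc-xxxx.cloud.databricks.com"
HTTP_PATH = "sql/protocolv1/o/<workspace-id>/<cluster-id>"
TOKEN = "<workspace access token>"

engine = create_engine(
    f"databricks://token:{TOKEN}@{HOST}"
    f"?http_path={HTTP_PATH}&catalog=main&schema=default"
)

# Write a DataFrame to a Delta table, then read it back.
df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
df.to_sql("pandas_delta_demo", engine, if_exists="replace", index=False)
print(pd.read_sql("SELECT * FROM pandas_delta_demo", engine))
```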
19 | See more details in the notebook (`ipynb`)
20 |
--------------------------------------------------------------------------------
/notebooks/pandas_delta/assets/databricks_sql_python.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/pandas_delta/assets/databricks_sql_python.png
--------------------------------------------------------------------------------
/notebooks/pandas_delta/assets/unity_catalog_cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/pandas_delta/assets/unity_catalog_cluster.png
--------------------------------------------------------------------------------
/notebooks/pandas_delta/pandas_delta.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {
8 | "byteLimit": 2048000,
9 | "rowLimit": 10000
10 | },
11 | "inputWidgets": {},
12 | "nuid": "122e3c8d-e602-407a-abb7-a5b521ef7057",
13 | "showTitle": false,
14 | "title": ""
15 | }
16 | },
17 | "source": [
18 | "\n",
19 | "# Delta Lake I/O with Pandas Dataframes outside Databricks Environment\n",
20 | "\n",
21 | "\n",
22 | "\n",
23 | "\n",
24 | "\n",
25 | "[](https://colab.research.google.com/github/dotlas/databricks_helpers/blob/main/notebooks/pandas_delta/pandas_delta.ipynb)\n",
26 | "\n",
27 | "\n",
28 | "In this notebook, we showcase some utility functions built on top of existing third-party open source libraries in Python to read or write Pandas Dataframes **from within or outside a Databricks environment into Delta lake on Databricks**. The Delta lake can exist on [Unity Catalog](https://www.databricks.com/product/unity-catalog), or simply be the `hive_metastore` default. \n",
29 | "\n",
30 | "## Requirements\n",
31 | "\n",
32 | "### Databricks\n",
33 | "* A Databricks Workspace & Workspace Access Token\n",
34 | "* At least one runnable cluster within the workspace\n",
35 | "* Workspace attached to a metastore for Delta Lake\n",
36 | "\n",
37 | "### Packages\n",
38 | "\n",
39 | "This process heavily relies on [databricks-sql-python](https://github.com/databricks/databricks-sql-python) library which provides us with a [SQLAlchemy](https://sqlalche.me/) interface to write data. `databricks-sql-python` is an open source Python package maintained by Databricks, and `SQLAlchemy` is used since it is the default ORM wrapper used by the Pandas library\n",
40 | "\n",
41 | "\n",
42 | "* `databricks-sql-connector`\n",
43 | "* `sqlalchemy == 1.4.41`\n",
44 | "* `pandas < 2.0`\n",
45 | "\n",
46 | "### Infra\n",
47 | "\n",
48 | "A cluster is required to be running on the Databricks workspace from where the Delta lake will be accessed. This cluster will behave as an intermediary to accept connections and data from outside Databricks and add the data into Delta lake. \n",
49 | "\n",
50 | "> In order to add data to Unity catalog, the cluster must be configured to access `Unity Catalog`\n",
51 | "\n",
52 | ""
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "application/vnd.databricks.v1+cell": {
60 | "cellMetadata": {},
61 | "inputWidgets": {},
62 | "nuid": "86830a84-762c-43c8-98e2-b4b46daf4b80",
63 | "showTitle": false,
64 | "title": ""
65 | }
66 | },
67 | "outputs": [],
68 | "source": [
69 | "pip install pandas databricks-sql-connector sqlalchemy==1.4.41 -q"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {
76 | "application/vnd.databricks.v1+cell": {
77 | "cellMetadata": {
78 | "byteLimit": 2048000,
79 | "rowLimit": 10000
80 | },
81 | "inputWidgets": {},
82 | "nuid": "88bed7fd-8efa-4b41-bfc0-49d4f1b5f62a",
83 | "showTitle": false,
84 | "title": ""
85 | }
86 | },
87 | "outputs": [],
88 | "source": [
89 | "import os\n",
90 | "\n",
91 | "import pandas as pd\n",
92 | "from sqlalchemy import types as sql_types\n",
93 | "from sqlalchemy import create_engine\n",
94 | "from sqlalchemy.engine import Engine\n",
95 | "\n",
96 | "# databricks imports\n",
97 | "from databricks import sql as databricks_sql"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {
103 | "application/vnd.databricks.v1+cell": {
104 | "cellMetadata": {},
105 | "inputWidgets": {},
106 | "nuid": "434bb481-e475-4241-9da6-667817a1480e",
107 | "showTitle": false,
108 | "title": ""
109 | }
110 | },
111 | "source": [
112 | "\n",
113 | "### Setup User Inputs\n",
114 | "\n",
115 | "When running this on Databricks, `CLUSTER HTTP PATH` and `WORKSPACE HOSTNAME` can be inferred. When running outside Databricks, you need to start a cluster, and then get these values, copy them over to this notebook when it's run externally and use those as parameters\n",
116 | "\n",
117 | "Use `HTTP_PATH` from within the Cluster configuration page for `CLUSTER HTTP PATH` variable like so:\n",
118 | "\n",
119 | ""
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {
125 | "application/vnd.databricks.v1+cell": {
126 | "cellMetadata": {},
127 | "inputWidgets": {},
128 | "nuid": "1c35434f-ceb4-424d-b550-06d593681818",
129 | "showTitle": false,
130 | "title": ""
131 | }
132 | },
133 | "source": [
134 | "\n",
135 | "**Fill up the values for the 3 parameters within the cell below when running this notebook outside a Databricks environment**"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {
142 | "application/vnd.databricks.v1+cell": {
143 | "cellMetadata": {
144 | "byteLimit": 2048000,
145 | "rowLimit": 10000
146 | },
147 | "inputWidgets": {},
148 | "nuid": "18996dc5-875a-4347-816b-770b8e385597",
149 | "showTitle": false,
150 | "title": ""
151 | }
152 | },
153 | "outputs": [],
154 | "source": [
155 | "# Check if notebook is running inside databricks environment\n",
156 | "DATABRICKS_ENV = any(\"SPARK\" in k for k in os.environ)\n",
157 | "\n",
158 | "if DATABRICKS_ENV:\n",
159 | " dbutils.widgets.removeAll()\n",
160 | " dbutils.widgets.text(\"WORKSPACE ACCESS TOKEN\", \"\")\n",
161 | " dbutils.widgets.text(\"WORKSPACE HOSTNAME\", \"\")\n",
162 | " dbutils.widgets.text(\"CLUSTER HTTP PATH\", \"\")\n",
163 | "\n",
164 | "# INPUT VALUES HERE\n",
165 | "\n",
166 | "# The workspace access token. Usually of the form *******\n",
167 | "databricks_workspace_access_token: str = (\n",
168 | " getArgument(\"WORKSPACE ACCESS TOKEN\")\n",
169 | " if DATABRICKS_ENV\n",
170 | " else \" \"\n",
171 | ")\n",
172 | "\n",
173 | "# server hostname like dbc-xxxx.cloud.databricks.com\n",
174 | "# do not prefix with https:// or add a / at the end\n",
175 | "databricks_server_hostname: str = (\n",
176 | " getArgument(\"WORKSPACE HOSTNAME\")\n",
177 | " if DATABRICKS_ENV\n",
178 | " else \" \"\n",
179 | ")\n",
180 | "\n",
181 | "# the http path from the cluster configuration -> JDBC/ODBC tab\n",
182 | "databricks_cluster_jdbc_http_path: str = (\n",
183 | " getArgument(\"CLUSTER HTTP PATH\")\n",
184 | " if DATABRICKS_ENV\n",
185 | " else \" \"\n",
186 | ")"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {
192 | "application/vnd.databricks.v1+cell": {
193 | "cellMetadata": {},
194 | "inputWidgets": {},
195 | "nuid": "1bc3bd88-ceb0-4a2c-8276-79ca5f6ce471",
196 | "showTitle": false,
197 | "title": ""
198 | }
199 | },
200 | "source": [
201 | "\n",
202 | "### Infer & Assert Inputs"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": null,
208 | "metadata": {
209 | "application/vnd.databricks.v1+cell": {
210 | "cellMetadata": {
211 | "byteLimit": 2048000,
212 | "rowLimit": 10000
213 | },
214 | "inputWidgets": {},
215 | "nuid": "66eae1b2-0c3b-4be2-a1d8-b5a29a601f93",
216 | "showTitle": false,
217 | "title": ""
218 | }
219 | },
220 | "outputs": [],
221 | "source": [
222 | "if DATABRICKS_ENV:\n",
223 | " # if notebook is running on databricks environment, then infer parameters\n",
224 | " if not databricks_cluster_jdbc_http_path:\n",
225 | " # spark works without imports within databricks environment\n",
226 | " cluster_id: str = spark.conf.get(\n",
227 | " \"spark.databricks.clusterUsageTags.clusterId\",\n",
228 | " ) # type: ignore\n",
229 | " workspace_id: str = spark.conf.get(\n",
230 | " \"spark.databricks.clusterUsageTags.clusterOwnerOrgId\",\n",
231 | " ) # type: ignore\n",
232 | " databricks_cluster_jdbc_http_path = (\n",
233 | " f\"sql/protocolv1/o/{workspace_id}/{cluster_id}\"\n",
234 | " )\n",
235 | "\n",
236 | " if not databricks_server_hostname:\n",
237 | " databricks_server_hostname = spark.conf.get(\"spark.databricks.workspaceUrl\")\n",
238 | "\n",
239 | "assert databricks_workspace_access_token, \"Databricks Workspace Access Token Missing\"\n",
240 | "assert databricks_server_hostname, \"Databricks Hostname Missing\"\n",
241 | "assert databricks_cluster_jdbc_http_path, \"Cluster JDBC path Missing\""
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {
247 | "application/vnd.databricks.v1+cell": {
248 | "cellMetadata": {},
249 | "inputWidgets": {},
250 | "nuid": "bf44e023-6416-42ec-9b7e-cbe1e771518a",
251 | "showTitle": false,
252 | "title": ""
253 | }
254 | },
255 | "source": [
256 | "\n",
257 | "### Setup Connection\n",
258 | "\n",
259 | "We will create a SQLAlchemy engine using the credentials required to connect to the cluster and workspace"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {
266 | "application/vnd.databricks.v1+cell": {
267 | "cellMetadata": {
268 | "byteLimit": 2048000,
269 | "rowLimit": 10000
270 | },
271 | "inputWidgets": {},
272 | "nuid": "65a854e9-a983-49c9-9189-86508b083fe1",
273 | "showTitle": false,
274 | "title": ""
275 | }
276 | },
277 | "outputs": [],
278 | "source": [
279 | "databricks_sqlalchemy_url: str = (\n",
280 | " \"databricks://token:\"\n",
281 | " + databricks_workspace_access_token\n",
282 | " + \"@\"\n",
283 | " + databricks_server_hostname\n",
284 | " + \"?http_path=\"\n",
285 | " + databricks_cluster_jdbc_http_path\n",
286 | ")\n",
287 | "\n",
288 | "databricks_alch_engine: Engine = create_engine(databricks_sqlalchemy_url)"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {
294 | "application/vnd.databricks.v1+cell": {
295 | "cellMetadata": {},
296 | "inputWidgets": {},
297 | "nuid": "42369035-2002-40f4-b95d-767c0879c29d",
298 | "showTitle": false,
299 | "title": ""
300 | }
301 | },
302 | "source": [
303 | "\n",
304 | "Verify that the connection works by listing catalogs on Databricks"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {
311 | "application/vnd.databricks.v1+cell": {
312 | "cellMetadata": {
313 | "byteLimit": 2048000,
314 | "rowLimit": 10000
315 | },
316 | "inputWidgets": {},
317 | "nuid": "1decf93d-be28-4a59-bec7-e11a2ea19b38",
318 | "showTitle": false,
319 | "title": ""
320 | }
321 | },
322 | "outputs": [],
323 | "source": [
324 | "catalogs = pd.read_sql(\"show catalogs\", databricks_alch_engine)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {
330 | "application/vnd.databricks.v1+cell": {
331 | "cellMetadata": {},
332 | "inputWidgets": {},
333 | "nuid": "59dc2781-b6ae-4725-ab2f-cd65f951cc47",
334 | "showTitle": false,
335 | "title": ""
336 | }
337 | },
338 | "source": [
339 | "\n",
340 | "### Run Queries"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {
347 | "application/vnd.databricks.v1+cell": {
348 | "cellMetadata": {
349 | "byteLimit": 2048000,
350 | "rowLimit": 10000
351 | },
352 | "inputWidgets": {},
353 | "nuid": "b61812d3-395e-480f-8a62-20b06ab6e797",
354 | "showTitle": false,
355 | "title": ""
356 | }
357 | },
358 | "outputs": [],
359 | "source": [
360 | "catalog_name: str = \"samples\"\n",
361 | "schema_name: str = \"nyctaxi\"\n",
362 | "table_name: str = \"trips\""
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": null,
368 | "metadata": {
369 | "application/vnd.databricks.v1+cell": {
370 | "cellMetadata": {
371 | "byteLimit": 2048000,
372 | "rowLimit": 10000
373 | },
374 | "inputWidgets": {},
375 | "nuid": "665b0c87-f1a5-4496-868a-22df48f29c7d",
376 | "showTitle": false,
377 | "title": ""
378 | }
379 | },
380 | "outputs": [],
381 | "source": [
382 | "df: pd.DataFrame = pd.read_sql(\n",
383 | " f\"SELECT * FROM {catalog_name}.{schema_name}.{table_name} limit 100\",\n",
384 | " databricks_alch_engine,\n",
385 | ")\n",
386 | "\n",
387 | "df.head()"
388 | ]
389 | }
390 | ],
391 | "metadata": {
392 | "application/vnd.databricks.v1+notebook": {
393 | "dashboards": [],
394 | "language": "python",
395 | "notebookMetadata": {
396 | "pythonIndentUnit": 4
397 | },
398 | "notebookName": "pandas_delta",
399 | "widgets": {
400 | "CLUSTER HTTP PATH": {
401 | "currentValue": "",
402 | "nuid": "9b33f01f-e642-41f9-bd6c-f06013c3d6c2",
403 | "widgetInfo": {
404 | "defaultValue": "",
405 | "label": null,
406 | "name": "CLUSTER HTTP PATH",
407 | "options": {
408 | "validationRegex": null,
409 | "widgetType": "text"
410 | },
411 | "widgetType": "text"
412 | }
413 | },
414 | "WORKSPACE ACCESS TOKEN": {
415 | "currentValue": "",
416 | "nuid": "4ef011e5-b2e1-4ffc-8fb8-b09907d809a5",
417 | "widgetInfo": {
418 | "defaultValue": "",
419 | "label": null,
420 | "name": "WORKSPACE ACCESS TOKEN",
421 | "options": {
422 | "validationRegex": null,
423 | "widgetType": "text"
424 | },
425 | "widgetType": "text"
426 | }
427 | },
428 | "WORKSPACE HOSTNAME": {
429 | "currentValue": "",
430 | "nuid": "c0a80a2b-f1b1-433c-8c44-633a3786b835",
431 | "widgetInfo": {
432 | "defaultValue": "",
433 | "label": null,
434 | "name": "WORKSPACE HOSTNAME",
435 | "options": {
436 | "validationRegex": null,
437 | "widgetType": "text"
438 | },
439 | "widgetType": "text"
440 | }
441 | }
442 | }
443 | },
444 | "language_info": {
445 | "name": "python"
446 | }
447 | },
448 | "nbformat": 4,
449 | "nbformat_minor": 0
450 | }
451 |
--------------------------------------------------------------------------------
/notebooks/update_job_cluster/README.md:
--------------------------------------------------------------------------------
1 | Update Workflows and Clusters ♻️
2 |
3 |
4 |
5 |
6 | ## Introduction
7 |
8 | This notebook updates clusters and workflows in the current workspace. It works by fetching the existing cluster / workflow configurations, modifying them, and then applying the updated configurations back to the workspace.
9 |
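10 | In essence, the notebook drives the [Databricks Python SDK](https://github.com/databricks/databricks-sdk-py) along the lines of the sketch below (a minimal sketch; the notebook itself adds input validation, init script handling, job cluster updates, and error reporting):
11 | 
12 | ```python
13 | from databricks.sdk import WorkspaceClient
14 | from databricks.sdk.service.compute import UpdateClusterResource
15 | 
16 | # Placeholders: supply your own workspace host and access token
17 | ws = WorkspaceClient(host="<workspace-host>", token="<workspace-token>")
18 | 
19 | for cluster in ws.clusters.list():
20 |     # Only the fields named in update_mask are modified
21 |     ws.clusters.update(
22 |         cluster_id=cluster.cluster_id,
23 |         update_mask="spark_version",
24 |         cluster=UpdateClusterResource(spark_version="15.4.x-scala2.12"),  # illustrative runtime key
25 |     )
26 | ```
27 | 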
10 | ## Use Cases
11 |
12 | Areas where such a notebook may be helpful:
13 |
14 | 1. **Cluster management**: The notebook could be used to automate the process of updating clusters, such as changing the cluster size, node type, or Spark version. This could be useful for organizations that need to scale their clusters up or down dynamically, or that need to keep their clusters up to date with the latest Spark releases.
15 | 2. **Workflow management**: The notebook could be used to automate the process of updating workflows, such as adding or removing tasks, changing the order of tasks, or updating the parameters of tasks. This could be useful for organizations that need to make changes to their workflows on a regular basis, or that need to deploy new workflows to production quickly and reliably.
16 |
17 | ---
18 | See more details in the notebook (ipynb)
19 |
--------------------------------------------------------------------------------
/notebooks/update_job_cluster/update_job_cluster.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {
8 | "byteLimit": 2048000,
9 | "rowLimit": 10000
10 | },
11 | "inputWidgets": {},
12 | "nuid": "d08046ea-5af6-4d2e-9bfb-483adbd72f55",
13 | "showTitle": false,
14 | "tableResultSettingsMap": {},
15 | "title": ""
16 | }
17 | },
18 | "source": [
19 | "# Update Clusters & Jobs ♻️\n",
20 | "\n",
21 | "## Requirements\n",
22 | "### Databricks\n",
23 | "* A Databricks Workspace & Workspace Access Token\n",
24 | "* At least one runnable cluster within the workspace\n"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 0,
30 | "metadata": {
31 | "application/vnd.databricks.v1+cell": {
32 | "cellMetadata": {
33 | "byteLimit": 2048000,
34 | "rowLimit": 10000
35 | },
36 | "inputWidgets": {},
37 | "nuid": "5905cc9b-8d77-442e-b915-b0d9cf8825c4",
38 | "showTitle": true,
39 | "tableResultSettingsMap": {},
40 | "title": "Update SDK to latest version"
41 | }
42 | },
43 | "outputs": [],
44 | "source": [
45 | "!pip install --upgrade databricks-sdk -q\n",
46 | "!pip install loguru -q"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 0,
52 | "metadata": {
53 | "application/vnd.databricks.v1+cell": {
54 | "cellMetadata": {
55 | "byteLimit": 2048000,
56 | "rowLimit": 10000
57 | },
58 | "inputWidgets": {},
59 | "nuid": "8ca906cb-450b-4806-8717-94603a670cef",
60 | "showTitle": true,
61 | "tableResultSettingsMap": {},
62 | "title": "Restart Python"
63 | }
64 | },
65 | "outputs": [],
66 | "source": [
67 | "dbutils.library.restartPython()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 0,
73 | "metadata": {
74 | "application/vnd.databricks.v1+cell": {
75 | "cellMetadata": {
76 | "byteLimit": 2048000,
77 | "rowLimit": 10000
78 | },
79 | "inputWidgets": {},
80 | "nuid": "aae8da7c-d1f1-4f52-ac7e-28c1b1e686e2",
81 | "showTitle": true,
82 | "tableResultSettingsMap": {},
83 | "title": "Imports"
84 | }
85 | },
86 | "outputs": [],
87 | "source": [
88 | "from pathlib import Path\n",
89 | "import re\n",
90 | "\n",
91 | "import pandas as pd\n",
92 | "from loguru import logger\n",
93 | "\n",
94 | "from databricks.sdk import WorkspaceClient\n",
95 | "from databricks.sdk.service.compute import (\n",
96 | " ClusterDetails,\n",
97 | " UpdateClusterResource,\n",
98 | " ListClustersFilterBy,\n",
99 | " ClusterSource,\n",
100 | " InitScriptInfo,\n",
101 | ")\n",
102 | "from databricks.sdk.service.jobs import Job, JobSettings, BaseJob, JobCluster"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {
108 | "application/vnd.databricks.v1+cell": {
109 | "cellMetadata": {
110 | "byteLimit": 2048000,
111 | "rowLimit": 10000
112 | },
113 | "inputWidgets": {},
114 | "nuid": "5ff0f6ad-d92a-4ce2-94f5-a7d3d572d8e4",
115 | "showTitle": false,
116 | "tableResultSettingsMap": {},
117 | "title": ""
118 | }
119 | },
120 | "source": [
121 | "## Setup\n",
122 | "\n",
123 | "| Parameter Name | Description | Allowed Values |\n",
124 | "| ------------------------------- | ------------------------------------------------------------------------------------------------------------------ | ----------------------------------- |\n",
125 | "| `workspace_host` | The **domain** of the Databricks workspace. | `str` |\n",
126 | "| `workspace_token` | The **token** for accessing the Databricks Workspace API | `str` |\n",
127 | "| `desired_runtime_version` | The desired **Databricks Runtime Version** for the updated clusters/job clusters. | `str` [Eg: `\"15.4\"`] |\n",
128 | "| `init_scripts_dir` | Path to the common **directory with init scripts** on a Unity Catalog **Volume** | `str` |\n",
129 | "| `cluster_init_script_files` | **Filenames** for the scripts to be used when initializing the **clusters**. Use `,` commas to separate files. | `str` [Eg: `\"S-154.sh, RE-154.sh\"`] |\n",
130 | "| `job_cluster_init_script_files` | **Filenames** for the scripts to be used when initializing the **job clusters**. Use `,` commas to separate files. | `str` [Eg: `\"S-154.sh, RE-154.sh\"`] |\n"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 0,
136 | "metadata": {
137 | "application/vnd.databricks.v1+cell": {
138 | "cellMetadata": {
139 | "byteLimit": 2048000,
140 | "rowLimit": 10000
141 | },
142 | "inputWidgets": {},
143 | "nuid": "8551cf68-7418-4596-853a-364ba45e9059",
144 | "showTitle": true,
145 | "tableResultSettingsMap": {},
146 | "title": "Setup Widgets"
147 | }
148 | },
149 | "outputs": [],
150 | "source": [
151 | "dbutils.widgets.removeAll()\n",
152 | "\n",
153 | "dbutils.widgets.text(\"workspace_host\", \"\")\n",
154 | "workspace_host: str = getArgument(\"workspace_host\")\n",
155 | "\n",
156 | "dbutils.widgets.text(\"workspace_token\", \"\")\n",
157 | "workspace_token: str = getArgument(\"workspace_token\")\n",
158 | "\n",
159 | "dbutils.widgets.text(\"desired_runtime_version\", \"\")\n",
160 | "desired_runtime_version: str = getArgument(\"desired_runtime_version\")\n",
161 | "\n",
162 | "dbutils.widgets.text(\"init_scripts_dir\", \"\")\n",
163 | "# Validate if directory exists and normalize the path\n",
164 | "init_scripts_dir: str = str(Path(getArgument(\"init_scripts_dir\")).resolve(strict=True))\n",
165 | "\n",
166 | "dbutils.widgets.text(\"cluster_init_script_files\", \"\")\n",
167 | "cluster_init_script_files: list[str] = [\n",
168 | " filename.strip() for filename in getArgument(\"cluster_init_script_files\").split(\",\")\n",
169 | "]\n",
170 | "# Validate if files exist and are not empty\n",
171 | "assert all(\n",
172 | " (Path(init_scripts_dir) / file_name).exists()\n",
173 | " for file_name in cluster_init_script_files\n",
174 | "), \"One or more cluster init script files do not exist\"\n",
175 | "\n",
176 | "dbutils.widgets.text(\"job_cluster_init_script_files\", \"\")\n",
177 | "job_cluster_init_script_files: list[str] = [\n",
178 | " filename.strip()\n",
179 | " for filename in getArgument(\"job_cluster_init_script_files\").split(\",\")\n",
180 | "]\n",
181 | "# Validate if files exist and are not empty\n",
182 | "assert all(\n",
183 | " (Path(init_scripts_dir) / file_name).exists()\n",
184 | " for file_name in job_cluster_init_script_files\n",
185 | "), \"One or more job cluster init script files do not exist\"\n",
186 | "\n",
187 | "assert all(\n",
188 | " [\n",
189 | " workspace_host,\n",
190 | " workspace_token,\n",
191 | " desired_runtime_version,\n",
192 | " init_scripts_dir,\n",
193 | " cluster_init_script_files,\n",
194 | " job_cluster_init_script_files,\n",
195 | " ]\n",
196 | "), \"One or more required parameters for notebook functioning are missing\"\n",
197 | "\n",
198 | "logger.info(f\"{workspace_host=}\")\n",
199 | "logger.info(f\"{desired_runtime_version=}\")\n",
200 | "logger.info(f\"{init_scripts_dir=}\")\n",
201 | "logger.info(f\"{cluster_init_script_files=}\")\n",
202 | "logger.info(f\"{job_cluster_init_script_files=}\")"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 0,
208 | "metadata": {
209 | "application/vnd.databricks.v1+cell": {
210 | "cellMetadata": {
211 | "byteLimit": 2048000,
212 | "rowLimit": 10000
213 | },
214 | "inputWidgets": {},
215 | "nuid": "aa39e27a-0385-4b2b-a319-eeed62f96483",
216 | "showTitle": true,
217 | "tableResultSettingsMap": {},
218 | "title": "Setup the workspace client"
219 | }
220 | },
221 | "outputs": [],
222 | "source": [
223 | "ws = WorkspaceClient(host=workspace_host, token=workspace_token)\n",
224 | "logger.info(f\"{ws.get_workspace_id()=}\")"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 0,
230 | "metadata": {
231 | "application/vnd.databricks.v1+cell": {
232 | "cellMetadata": {
233 | "byteLimit": 2048000,
234 | "rowLimit": 10000
235 | },
236 | "inputWidgets": {},
237 | "nuid": "c9a7849d-1c94-4faf-afa1-8444c9213365",
238 | "showTitle": true,
239 | "tableResultSettingsMap": {},
240 | "title": "Validate Input Runtime"
241 | }
242 | },
243 | "outputs": [],
244 | "source": [
245 | "valid_workspace_versions: list[str] = sorted(\n",
246 | " list(\n",
247 | " set(\n",
248 | " [\n",
249 | " version_tuple.name.split(\" \")[0]\n",
250 | " for version_tuple in ws.clusters.spark_versions().versions\n",
251 | " ]\n",
252 | " )\n",
253 | " )\n",
254 | ")\n",
255 | "\n",
256 | "logger.info(f\"{len(valid_workspace_versions)=:,}\")\n",
257 | "\n",
258 | "assert (\n",
259 | " desired_runtime_version in valid_workspace_versions\n",
260 | "), f\"Invalid {desired_runtime_version=}\""
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {
266 | "application/vnd.databricks.v1+cell": {
267 | "cellMetadata": {
268 | "byteLimit": 2048000,
269 | "rowLimit": 10000
270 | },
271 | "inputWidgets": {},
272 | "nuid": "995e3a0e-0aca-4cac-bd6e-baa24078dff1",
273 | "showTitle": false,
274 | "tableResultSettingsMap": {},
275 | "title": ""
276 | }
277 | },
278 | "source": [
279 | "## Init Scripts"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 0,
285 | "metadata": {
286 | "application/vnd.databricks.v1+cell": {
287 | "cellMetadata": {
288 | "byteLimit": 2048000,
289 | "rowLimit": 10000
290 | },
291 | "inputWidgets": {},
292 | "nuid": "5aaedbf9-3102-406f-9c3b-870e24542702",
293 | "showTitle": true,
294 | "tableResultSettingsMap": {},
295 | "title": "Define Init scripts"
296 | }
297 | },
298 | "outputs": [],
299 | "source": [
300 | "def make_init_scripts(init_script_files: list[str]):\n",
301 | " return [\n",
302 | " InitScriptInfo.from_dict(\n",
303 | " {\n",
304 | " \"volumes\": {\n",
305 | " \"destination\": str(Path(init_scripts_dir) / file_name),\n",
306 | " }\n",
307 | " }\n",
308 | " )\n",
309 | " for file_name in init_script_files\n",
310 | " ]\n",
311 | "\n",
312 | "\n",
313 | "cluster_init_scripts = make_init_scripts(cluster_init_script_files)\n",
314 | "job_cluster_init_scripts = make_init_scripts(job_cluster_init_script_files)\n",
315 | "\n",
316 | "logger.info(f\"{cluster_init_scripts=}\")\n",
317 | "logger.info(f\"{job_cluster_init_scripts=}\")"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {
323 | "application/vnd.databricks.v1+cell": {
324 | "cellMetadata": {
325 | "byteLimit": 2048000,
326 | "rowLimit": 10000
327 | },
328 | "inputWidgets": {},
329 | "nuid": "38b92429-d1b1-48e5-a305-29c3007c3f03",
330 | "showTitle": false,
331 | "tableResultSettingsMap": {},
332 | "title": ""
333 | }
334 | },
335 | "source": [
336 | "## Clusters\n",
337 | "\n",
338 | "According to the SDK and REST API documentation:\n",
339 | "\n",
340 | "- Clusters created as a result of a job cannot be updated via this endpoint. Only those created either via the `UI` or `API` can be changed.\n",
341 | "- Those clusters that are `RUNNING` will be `TERMINATED` at the time of update and restart with the new configuration."
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 0,
347 | "metadata": {
348 | "application/vnd.databricks.v1+cell": {
349 | "cellMetadata": {
350 | "byteLimit": 2048000,
351 | "rowLimit": 10000
352 | },
353 | "inputWidgets": {},
354 | "nuid": "18545303-a26d-43e5-b500-39c9ea9c1975",
355 | "showTitle": true,
356 | "tableResultSettingsMap": {},
357 | "title": "List all clusters"
358 | }
359 | },
360 | "outputs": [],
361 | "source": [
362 | "clusters = list(\n",
363 | " ws.clusters.list(\n",
364 | " filter_by=ListClustersFilterBy(\n",
365 | " cluster_sources=[ClusterSource.API, ClusterSource.UI]\n",
366 | " )\n",
367 | " )\n",
368 | ")\n",
369 | "\n",
370 | "logger.info(f\"Found {len(clusters)} clusters\")"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 0,
376 | "metadata": {
377 | "application/vnd.databricks.v1+cell": {
378 | "cellMetadata": {
379 | "byteLimit": 2048000,
380 | "rowLimit": 10000
381 | },
382 | "inputWidgets": {},
383 | "nuid": "69aece1f-8c0c-496a-aa1a-cbc613c69c70",
384 | "showTitle": true,
385 | "tableResultSettingsMap": {},
386 | "title": "Display clusters as table"
387 | }
388 | },
389 | "outputs": [],
390 | "source": [
391 | "pd.DataFrame([cluster.as_dict() for cluster in clusters])"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": 0,
397 | "metadata": {
398 | "application/vnd.databricks.v1+cell": {
399 | "cellMetadata": {
400 | "byteLimit": 2048000,
401 | "rowLimit": 10000
402 | },
403 | "inputWidgets": {},
404 | "nuid": "a6ebd085-4694-409a-96f1-a89eaf42119c",
405 | "showTitle": true,
406 | "tableResultSettingsMap": {},
407 | "title": "Save cluster update parameters"
408 | }
409 | },
410 | "outputs": [],
411 | "source": [
412 | "# A dictionary which maps each cluster ID to parameters for the cluster update method\n",
413 | "cluster_updates = {}"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {
419 | "application/vnd.databricks.v1+cell": {
420 | "cellMetadata": {
421 | "byteLimit": 2048000,
422 | "rowLimit": 10000
423 | },
424 | "inputWidgets": {},
425 | "nuid": "8c860c4d-9a70-42d7-92fb-cb664ac80a7a",
426 | "showTitle": false,
427 | "tableResultSettingsMap": {},
428 | "title": ""
429 | }
430 | },
431 | "source": [
432 | "### Updating the Databricks Runtime Version\n",
433 | "\n",
434 | "The runtime version is the `cluster.spark_version` field."
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 0,
440 | "metadata": {
441 | "application/vnd.databricks.v1+cell": {
442 | "cellMetadata": {
443 | "byteLimit": 2048000,
444 | "rowLimit": 10000
445 | },
446 | "inputWidgets": {},
447 | "nuid": "fd0f7b31-e0a8-471c-b395-ccf5297e87a6",
448 | "showTitle": true,
449 | "tableResultSettingsMap": {},
450 | "title": "Function for retrieving the new runtime version"
451 | }
452 | },
453 | "outputs": [],
454 | "source": [
455 | "valid_versions = set(\n",
456 | " pd.DataFrame(\n",
457 | " [version.as_dict() for version in ws.clusters.spark_versions().versions]\n",
458 | " )[\"key\"].tolist()\n",
459 | ")\n",
460 | "\n",
461 | "\n",
462 | "def get_updated_spark_version_key(\n",
463 | " spark_version_key: str, desired_runtime_version: str\n",
464 | ") -> str:\n",
465 | " new_spark_version = re.sub(\n",
466 | " r\"^\\d{2}\\.\\d\", desired_runtime_version, spark_version_key\n",
467 | " )\n",
468 | "\n",
469 | " if new_spark_version not in valid_versions:\n",
470 | " raise ValueError(f\"Could not validate version '{new_spark_version}'\")\n",
471 | "\n",
472 | " return new_spark_version\n",
473 | "\n",
474 | "\n",
475 | "assert (\n",
476 | " get_updated_spark_version_key(\"11.3.x-photon-scala2.12\", \"15.4\")\n",
477 | " == \"15.4.x-photon-scala2.12\"\n",
478 | ")"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 0,
484 | "metadata": {
485 | "application/vnd.databricks.v1+cell": {
486 | "cellMetadata": {
487 | "byteLimit": 2048000,
488 | "rowLimit": 10000
489 | },
490 | "inputWidgets": {},
491 | "nuid": "efb3415b-6f28-4500-a480-29b43a2373c6",
492 | "showTitle": true,
493 | "tableResultSettingsMap": {},
494 | "title": "Given a cluster, define the params for updating it's runtime version"
495 | }
496 | },
497 | "outputs": [],
498 | "source": [
499 | "def update_cluster_spark_version(cluster: ClusterDetails):\n",
500 | " cluster_updates[cluster.cluster_id] = {\n",
501 | " **(cluster_updates.get(cluster.cluster_id) or {}),\n",
502 | " \"spark_version\": get_updated_spark_version_key(\n",
503 | " cluster.spark_version, desired_runtime_version\n",
504 | " ),\n",
505 | " }\n",
506 | "\n",
507 | "\n",
508 | "for cluster in clusters:\n",
509 | " update_cluster_spark_version(cluster)"
510 | ]
511 | },
512 | {
513 | "cell_type": "markdown",
514 | "metadata": {
515 | "application/vnd.databricks.v1+cell": {
516 | "cellMetadata": {
517 | "byteLimit": 2048000,
518 | "rowLimit": 10000
519 | },
520 | "inputWidgets": {},
521 | "nuid": "9d802ce3-3004-4d6a-980c-a46c1ed81cbb",
522 | "showTitle": false,
523 | "tableResultSettingsMap": {},
524 | "title": ""
525 | }
526 | },
527 | "source": [
528 | "### Update the Init Scripts"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": 0,
534 | "metadata": {
535 | "application/vnd.databricks.v1+cell": {
536 | "cellMetadata": {
537 | "byteLimit": 2048000,
538 | "rowLimit": 10000
539 | },
540 | "inputWidgets": {},
541 | "nuid": "2d188fd2-b3ae-4409-8870-3ad1dbf6dca9",
542 | "showTitle": true,
543 | "tableResultSettingsMap": {},
544 | "title": "Given a cluster, update it so that it uses specific init scripts"
545 | }
546 | },
547 | "outputs": [],
548 | "source": [
549 | "def update_cluster_init_scripts(\n",
550 | " cluster: ClusterDetails, init_scripts: list[InitScriptInfo]\n",
551 | "):\n",
552 | " cluster_updates[cluster.cluster_id] = {\n",
553 | " **(cluster_updates.get(cluster.cluster_id) or {}),\n",
554 | " \"init_scripts\": init_scripts,\n",
555 | " }\n",
556 | "\n",
557 | "\n",
558 | "for cluster in clusters:\n",
559 | " update_cluster_init_scripts(cluster, cluster_init_scripts)"
560 | ]
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "metadata": {
565 | "application/vnd.databricks.v1+cell": {
566 | "cellMetadata": {
567 | "byteLimit": 2048000,
568 | "rowLimit": 10000
569 | },
570 | "inputWidgets": {},
571 | "nuid": "d1980665-fc7c-4341-b78f-c4c8b425ce42",
572 | "showTitle": false,
573 | "tableResultSettingsMap": {},
574 | "title": ""
575 | }
576 | },
577 | "source": [
578 | "### Execute the updates"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": 0,
584 | "metadata": {
585 | "application/vnd.databricks.v1+cell": {
586 | "cellMetadata": {
587 | "byteLimit": 2048000,
588 | "rowLimit": 10000
589 | },
590 | "inputWidgets": {},
591 | "nuid": "4c32de4a-0967-495a-bce7-cd45ba442eb2",
592 | "showTitle": true,
593 | "tableResultSettingsMap": {},
594 | "title": "Use the SDK to make the cluster updates"
595 | }
596 | },
597 | "outputs": [],
598 | "source": [
599 | "clusters_to_update = clusters\n",
600 | "names_for_clusters_that_failed_update = []\n",
601 | "\n",
602 | "for cluster in clusters_to_update:\n",
603 | " cluster_id = cluster.cluster_id\n",
604 | "\n",
605 | " # Do not update the cluster which is running this notebook\n",
606 | " # because it will force a restart\n",
607 | " if cluster_id == spark.conf.get(\"spark.databricks.clusterUsageTags.clusterId\"):\n",
608 | " logger.info(\n",
609 | " f\"Skipping cluster: '{cluster.cluster_name}', because it is running this notebook\"\n",
610 | " )\n",
611 | " continue\n",
612 | "\n",
613 | " updates = cluster_updates.get(cluster_id)\n",
614 | "\n",
615 | " if updates is None:\n",
616 | " continue\n",
617 | "\n",
618 | " update_mask = \",\".join(updates.keys())\n",
619 | "\n",
620 | " try:\n",
621 | " ws.clusters.update(\n",
622 | " cluster_id=cluster_id,\n",
623 | " update_mask=update_mask,\n",
624 | " cluster=UpdateClusterResource(**updates),\n",
625 | " )\n",
626 | " logger.info(f\"Updated cluster: '{cluster.cluster_name}'\")\n",
627 | " except Exception as e:\n",
628 | " logger.error(f\"Failed to update cluster: '{cluster.cluster_name}'\")\n",
629 | " logger.error(e)\n",
630 | " names_for_clusters_that_failed_update.append(cluster.cluster_name)\n",
631 | "\n",
632 | "\n",
633 | "cluster_update_failures = len(names_for_clusters_that_failed_update)\n",
634 | "cluster_count = len(clusters_to_update)\n",
635 | "\n",
636 | "if cluster_update_failures > 0:\n",
637 | " cluster_update_failure_message = (\n",
638 | " f\"Failed to update {cluster_update_failures} of {cluster_count} cluster(s)\"\n",
639 | " )\n",
640 | " if cluster_update_failures / len(clusters) >= 0.25:\n",
641 | " raise Exception(cluster_update_failure_message)\n",
642 | "\n",
643 | " logger.warning(cluster_update_failure_message)\n",
644 | "else:\n",
645 | " logger.info(f\"Updated all {cluster_count} cluster(s)\")"
646 | ]
647 | },
648 | {
649 | "cell_type": "markdown",
650 | "metadata": {
651 | "application/vnd.databricks.v1+cell": {
652 | "cellMetadata": {
653 | "byteLimit": 2048000,
654 | "rowLimit": 10000
655 | },
656 | "inputWidgets": {},
657 | "nuid": "9ed4b505-873b-4211-8d62-5caff2de6e21",
658 | "showTitle": false,
659 | "tableResultSettingsMap": {},
660 | "title": ""
661 | }
662 | },
663 | "source": [
664 | "## Jobs"
665 | ]
666 | },
667 | {
668 | "cell_type": "code",
669 | "execution_count": 0,
670 | "metadata": {
671 | "application/vnd.databricks.v1+cell": {
672 | "cellMetadata": {
673 | "byteLimit": 2048000,
674 | "rowLimit": 10000
675 | },
676 | "inputWidgets": {},
677 | "nuid": "d5eb9996-eb42-408e-8851-13985e039ca4",
678 | "showTitle": true,
679 | "tableResultSettingsMap": {},
680 | "title": "List all jobs"
681 | }
682 | },
683 | "outputs": [],
684 | "source": [
685 | "jobs = list(ws.jobs.list(expand_tasks=True))\n",
686 | "logger.info(f\"Found {len(jobs)} jobs\")"
687 | ]
688 | },
689 | {
690 | "cell_type": "code",
691 | "execution_count": 0,
692 | "metadata": {
693 | "application/vnd.databricks.v1+cell": {
694 | "cellMetadata": {
695 | "byteLimit": 2048000,
696 | "rowLimit": 10000
697 | },
698 | "inputWidgets": {},
699 | "nuid": "ada4fa8c-43ea-45fb-bb0a-6df15bb2a7a3",
700 | "showTitle": true,
701 | "tableResultSettingsMap": {},
702 | "title": "Display jobs as a table"
703 | }
704 | },
705 | "outputs": [],
706 | "source": [
707 | "pd.DataFrame([job.as_dict() for job in jobs])"
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": 0,
713 | "metadata": {
714 | "application/vnd.databricks.v1+cell": {
715 | "cellMetadata": {
716 | "byteLimit": 2048000,
717 | "rowLimit": 10000
718 | },
719 | "inputWidgets": {},
720 | "nuid": "939f4dee-2999-4a05-b247-dbfb6f097b20",
721 | "showTitle": true,
722 | "tableResultSettingsMap": {},
723 | "title": "Creates a new job with its job clusters on the updated runtime version"
724 | }
725 | },
726 | "outputs": [],
727 | "source": [
728 | "def update_job_clusters_spark_version(job: Job | BaseJob) -> Job | BaseJob:\n",
729 | " job_clusters = []\n",
730 | " for jc in job.settings.job_clusters:\n",
731 | " njc = jc.__class__.from_dict(jc.as_dict())\n",
732 | " njc.new_cluster.spark_version = get_updated_spark_version_key(\n",
733 | " njc.new_cluster.spark_version, desired_runtime_version\n",
734 | " )\n",
735 | " job_clusters.append(njc)\n",
736 | "\n",
737 | " new_job = job.__class__.from_dict(job.as_dict())\n",
738 | " new_job.settings.job_clusters = job_clusters\n",
739 | " return new_job"
740 | ]
741 | },
742 | {
743 | "cell_type": "code",
744 | "execution_count": 0,
745 | "metadata": {
746 | "application/vnd.databricks.v1+cell": {
747 | "cellMetadata": {
748 | "byteLimit": 2048000,
749 | "rowLimit": 10000
750 | },
751 | "inputWidgets": {},
752 | "nuid": "2c5b17e3-d89e-4a8e-a7b8-5ed714c4a7ef",
753 | "showTitle": true,
754 | "tableResultSettingsMap": {},
755 | "title": "Creates a new job with its job clusters using the specified init scripts"
756 | }
757 | },
758 | "outputs": [],
759 | "source": [
760 | "def update_job_clusters_init_scripts(\n",
761 | " job: Job | BaseJob, init_scripts: list[InitScriptInfo]\n",
762 | ") -> Job | BaseJob:\n",
763 | " job_clusters = []\n",
764 | " for jc in job.settings.job_clusters:\n",
765 | " njc = jc.__class__.from_dict(jc.as_dict())\n",
766 | " njc.new_cluster.init_scripts = init_scripts\n",
767 | " job_clusters.append(njc)\n",
768 | "\n",
769 | " new_job = job.__class__.from_dict(job.as_dict())\n",
770 | " new_job.settings.job_clusters = job_clusters\n",
771 | " return new_job"
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": 0,
777 | "metadata": {
778 | "application/vnd.databricks.v1+cell": {
779 | "cellMetadata": {
780 | "byteLimit": 2048000,
781 | "rowLimit": 10000
782 | },
783 | "inputWidgets": {},
784 | "nuid": "80a63f0b-abce-4a5b-8ad5-a151f9ff35a6",
785 | "showTitle": true,
786 | "tableResultSettingsMap": {},
787 | "title": "Update all jobs"
788 | }
789 | },
790 | "outputs": [],
791 | "source": [
792 | "names_for_jobs_that_failed_update = []\n",
793 | "\n",
794 | "jobs_to_update = jobs\n",
795 | "\n",
796 | "for job in jobs_to_update:\n",
797 | " njob = update_job_clusters_spark_version(job)\n",
798 | " njob = update_job_clusters_init_scripts(njob, job_cluster_init_scripts)\n",
799 | "\n",
800 | " new_settings = njob.settings.as_dict()\n",
801 | " new_settings = {\n",
802 | " k: v for k, v in new_settings.items() if k in (\"job_clusters\", \"init_scripts\")\n",
803 | " }\n",
804 | " new_settings = JobSettings.from_dict(new_settings)\n",
805 | "\n",
806 | " try:\n",
807 | " ws.jobs.update(job_id=job.job_id, new_settings=njob.settings)\n",
808 | " logger.info(f\"Updated job: '{job.settings.name}'\")\n",
809 | " except Exception as e:\n",
810 | " logger.error(f\"Failed to update job: '{job.settings.name}'\")\n",
811 | " logger.error(e)\n",
812 | " names_for_jobs_that_failed_update.append(job.settings.name)\n",
813 | "\n",
814 | "job_update_failures = len(names_for_jobs_that_failed_update)\n",
815 | "job_count = len(jobs_to_update)\n",
816 | "\n",
817 | "if job_update_failures > 0:\n",
818 | " job_update_failure_message = (\n",
819 | " f\"Failed to update {job_update_failures} of {job_count} job(s)\"\n",
820 | " )\n",
821 | " if job_update_failures / len(jobs) >= 0.25:\n",
822 | " raise Exception(job_update_failure_message)\n",
823 | "\n",
824 | " logger.warning(job_update_failure_message)\n",
825 | "else:\n",
826 | " logger.info(f\"Updated all {job_count} job(s)\")"
827 | ]
828 | }
829 | ],
830 | "metadata": {
831 | "application/vnd.databricks.v1+notebook": {
832 | "computePreferences": null,
833 | "dashboards": [],
834 | "environmentMetadata": null,
835 | "language": "python",
836 | "notebookMetadata": {
837 | "pythonIndentUnit": 4
838 | },
839 | "notebookName": "update_job_cluster",
840 | "widgets": {
841 | "cluster_init_script_files": {
842 | "currentValue": "",
843 | "nuid": "57203fa8-3437-4dad-a7b2-ae12d82f7612",
844 | "typedWidgetInfo": {
845 | "autoCreated": false,
846 | "defaultValue": "",
847 | "label": null,
848 | "name": "cluster_init_script_files",
849 | "options": {
850 | "widgetDisplayType": "Text",
851 | "validationRegex": null
852 | },
853 | "parameterDataType": "String"
854 | },
855 | "widgetInfo": {
856 | "widgetType": "text",
857 | "defaultValue": "",
858 | "label": null,
859 | "name": "cluster_init_script_files",
860 | "options": {
861 | "widgetType": "text",
862 | "autoCreated": null,
863 | "validationRegex": null
864 | }
865 | }
866 | },
867 | "desired_runtime_version": {
868 | "currentValue": "",
869 | "nuid": "37292731-03a3-4422-9f9b-80d4b9fa8e0d",
870 | "typedWidgetInfo": {
871 | "autoCreated": false,
872 | "defaultValue": "",
873 | "label": null,
874 | "name": "desired_runtime_version",
875 | "options": {
876 | "widgetDisplayType": "Text",
877 | "validationRegex": null
878 | },
879 | "parameterDataType": "String"
880 | },
881 | "widgetInfo": {
882 | "widgetType": "text",
883 | "defaultValue": "",
884 | "label": null,
885 | "name": "desired_runtime_version",
886 | "options": {
887 | "widgetType": "text",
888 | "autoCreated": null,
889 | "validationRegex": null
890 | }
891 | }
892 | },
893 | "init_scripts_dir": {
894 | "currentValue": "",
895 | "nuid": "77a17daf-1ac2-4821-8842-a395e102a92c",
896 | "typedWidgetInfo": {
897 | "autoCreated": false,
898 | "defaultValue": "",
899 | "label": null,
900 | "name": "init_scripts_dir",
901 | "options": {
902 | "widgetDisplayType": "Text",
903 | "validationRegex": null
904 | },
905 | "parameterDataType": "String"
906 | },
907 | "widgetInfo": {
908 | "widgetType": "text",
909 | "defaultValue": "",
910 | "label": null,
911 | "name": "init_scripts_dir",
912 | "options": {
913 | "widgetType": "text",
914 | "autoCreated": null,
915 | "validationRegex": null
916 | }
917 | }
918 | },
919 | "job_cluster_init_script_files": {
920 | "currentValue": "",
921 | "nuid": "5017ecfc-acc5-46d4-83aa-44e7276c4ddf",
922 | "typedWidgetInfo": {
923 | "autoCreated": false,
924 | "defaultValue": "",
925 | "label": null,
926 | "name": "job_cluster_init_script_files",
927 | "options": {
928 | "widgetDisplayType": "Text",
929 | "validationRegex": null
930 | },
931 | "parameterDataType": "String"
932 | },
933 | "widgetInfo": {
934 | "widgetType": "text",
935 | "defaultValue": "",
936 | "label": null,
937 | "name": "job_cluster_init_script_files",
938 | "options": {
939 | "widgetType": "text",
940 | "autoCreated": null,
941 | "validationRegex": null
942 | }
943 | }
944 | },
945 | "workspace_host": {
946 | "currentValue": "",
947 | "nuid": "300637f4-1b1a-41b0-8507-13c8df6f5e65",
948 | "typedWidgetInfo": {
949 | "autoCreated": false,
950 | "defaultValue": "",
951 | "label": null,
952 | "name": "workspace_host",
953 | "options": {
954 | "widgetDisplayType": "Text",
955 | "validationRegex": null
956 | },
957 | "parameterDataType": "String"
958 | },
959 | "widgetInfo": {
960 | "widgetType": "text",
961 | "defaultValue": "",
962 | "label": null,
963 | "name": "workspace_host",
964 | "options": {
965 | "widgetType": "text",
966 | "autoCreated": null,
967 | "validationRegex": null
968 | }
969 | }
970 | },
971 | "workspace_token": {
972 | "currentValue": "",
973 | "nuid": "96fd30d3-9fe9-4666-a928-ac1afcdde420",
974 | "typedWidgetInfo": {
975 | "autoCreated": false,
976 | "defaultValue": "",
977 | "label": null,
978 | "name": "workspace_token",
979 | "options": {
980 | "widgetDisplayType": "Text",
981 | "validationRegex": null
982 | },
983 | "parameterDataType": "String"
984 | },
985 | "widgetInfo": {
986 | "widgetType": "text",
987 | "defaultValue": "",
988 | "label": null,
989 | "name": "workspace_token",
990 | "options": {
991 | "widgetType": "text",
992 | "autoCreated": null,
993 | "validationRegex": null
994 | }
995 | }
996 | }
997 | }
998 | },
999 | "language_info": {
1000 | "name": "python"
1001 | }
1002 | },
1003 | "nbformat": 4,
1004 | "nbformat_minor": 0
1005 | }
1006 |
--------------------------------------------------------------------------------
/notebooks/workflow_calendar/README.md:
--------------------------------------------------------------------------------
1 | Workflow Calendar 📆
2 |
3 |
4 |
5 |
6 |
7 | ## Introduction
8 |
9 | This notebook visualizes workflow schedules and their respective runs, showing when workflows are scheduled, when they actually ran, and how long each run took.
10 |
11 | ## Use Cases
12 |
13 | This notebook is useful for the below cases:
14 |
15 | 1. **Performance Monitoring**: Keep an eye on how long your runs are taking and identify potential bottlenecks.
16 | 2. **Scheduling Insights**: Understand when your workflows are scheduled and when the first run occurred. Resolve timing conflicts and ensure that your workflows are running as expected.
17 | 3. **Historical Analysis**: Analyze the historical data of your runs, making it easier to identify trends and patterns.
18 | 4. **Resource Allocation**: Optimize your resource allocation based on past performance.
19 | 5. **Troubleshooting**: Quickly identify runs that failed or took longer than expected.
20 |
21 | ---
22 |
23 |
24 |
25 | ---
26 |
27 | See more details in the notebook (ipynb)
28 |
--------------------------------------------------------------------------------
/notebooks/workflow_calendar/assets/example_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/workflow_calendar/assets/example_viz.png
--------------------------------------------------------------------------------
/notebooks/workflow_calendar/workflow_calender.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "27fd11cc-a11d-4c9a-9a74-e44f8763a4ce",
10 | "showTitle": false,
11 | "title": ""
12 | }
13 | },
14 | "source": [
15 | "# Workflow Calendar 📆\n",
16 | "## Requirements\n",
17 | "### Databricks\n",
18 | "* A Databricks Workspace & Workspace Access Token\n",
19 | "* At least one runnable cluster within the workspace\n",
20 | "* At least one scheduled job in Databricks workflows\n",
21 | "\n",
22 | "### Packages\n",
23 | "This process relies on a package called `cron-schedule-triggers` which is used to infer the cron-schedule expression. `pandas` for data manipulation and `plotly` for visualization.\n",
24 | "* cron-schedule-triggers \n",
25 | "* pandas \n",
26 | "* plotly \n",
27 | "\n"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "application/vnd.databricks.v1+cell": {
35 | "cellMetadata": {},
36 | "inputWidgets": {},
37 | "nuid": "6d4292f1-19fa-44ce-aeb4-3b108c69e659",
38 | "showTitle": false,
39 | "title": ""
40 | }
41 | },
42 | "outputs": [],
43 | "source": [
44 | "pip install cron-schedule-triggers -q"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {
50 | "application/vnd.databricks.v1+cell": {
51 | "cellMetadata": {},
52 | "inputWidgets": {},
53 | "nuid": "760300fa-6924-47b1-b471-40cc1c990670",
54 | "showTitle": false,
55 | "title": ""
56 | }
57 | },
58 | "source": [
59 | "## Imports"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {
66 | "application/vnd.databricks.v1+cell": {
67 | "cellMetadata": {
68 | "byteLimit": 2048000,
69 | "rowLimit": 10000
70 | },
71 | "inputWidgets": {},
72 | "nuid": "89018793-50c2-432c-8979-8e287a048ccc",
73 | "showTitle": false,
74 | "title": ""
75 | }
76 | },
77 | "outputs": [],
78 | "source": [
79 | "import requests\n",
80 | "from typing import Optional, Callable\n",
81 | "import pandas as pd\n",
82 | "import datetime\n",
83 | "import re\n",
84 | "\n",
85 | "from cstriggers.core.trigger import QuartzCron\n",
86 | "from datetime import timedelta\n",
87 | "import plotly.express as px\n",
88 | "\n",
89 | "\n",
90 | "import plotly.graph_objects as go\n",
91 | "import plotly.figure_factory as ff"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {
97 | "application/vnd.databricks.v1+cell": {
98 | "cellMetadata": {},
99 | "inputWidgets": {},
100 | "nuid": "764722d6-53d7-4d0e-93f6-4aa2ad4154d8",
101 | "showTitle": false,
102 | "title": ""
103 | }
104 | },
105 | "source": [
106 | "## Input Data\n",
107 | "\n",
108 | "> Provide the date values in `YYYY-MM-DD` format"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {
115 | "application/vnd.databricks.v1+cell": {
116 | "cellMetadata": {
117 | "byteLimit": 2048000,
118 | "rowLimit": 10000
119 | },
120 | "inputWidgets": {},
121 | "nuid": "249ef86e-0d94-47af-a9a1-6eea8f273919",
122 | "showTitle": false,
123 | "title": ""
124 | }
125 | },
126 | "outputs": [],
127 | "source": [
128 | "dbutils.widgets.removeAll()\n",
129 | "\n",
130 | "dbutils.widgets.text(\"start_date\", \"2023-10-01\")\n",
131 | "\n",
132 | "start_date: datetime.datetime = datetime.datetime.strptime(\n",
133 | " getArgument(\"start_date\"), \"%Y-%m-%d\"\n",
134 | ")\n",
135 | "\n",
136 | "dbutils.widgets.text(\"end_date\", \"2023-11-05\")\n",
137 | "\n",
138 | "end_date: datetime.datetime = datetime.datetime.strptime(\n",
139 | " getArgument(\"end_date\"), \"%Y-%m-%d\"\n",
140 | ")\n",
141 | "\n",
142 | "dbutils.widgets.text(\"databricks_url\", \"\")\n",
143 | "databricks_url: str = getArgument(\"databricks_url\")\n",
144 | "\n",
145 | "dbutils.widgets.text(\"databricks_workspace_token\", \"\")\n",
146 | "databricks_workspace_token: str = getArgument(\"databricks_workspace_token\")\n",
147 | "\n",
148 | "headers: dict = {\"Authorization\": f\"Bearer {databricks_workspace_token}\"}"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {
155 | "application/vnd.databricks.v1+cell": {
156 | "cellMetadata": {
157 | "byteLimit": 2048000,
158 | "rowLimit": 10000
159 | },
160 | "inputWidgets": {},
161 | "nuid": "9c5b390d-b903-4cec-a8b9-f8855f297f12",
162 | "showTitle": false,
163 | "title": ""
164 | }
165 | },
166 | "outputs": [],
167 | "source": [
168 | "query_params: dict = {\n",
169 | " \"LIST_JOBS_LIMIT\": 100, # max limit\n",
170 | " \"LIST_RUNS_LIMIT\": 25, # max limit\n",
171 | " \"EXPAND_RUNS\": \"true\",\n",
172 | " \"EXPAND_TASKS\": \"true\",\n",
173 | "}"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {
180 | "application/vnd.databricks.v1+cell": {
181 | "cellMetadata": {
182 | "byteLimit": 2048000,
183 | "rowLimit": 10000
184 | },
185 | "inputWidgets": {},
186 | "nuid": "131b84b3-735e-46bf-9c0e-5494c55606f9",
187 | "showTitle": false,
188 | "title": ""
189 | }
190 | },
191 | "outputs": [],
192 | "source": [
193 | "def paginate(\n",
194 | " can_paginate: bool,\n",
195 | " next_page_token: Optional[str],\n",
196 | " url: str,\n",
197 | " workspace_token: str,\n",
198 | " function_to_call: Callable,\n",
199 | ") -> None:\n",
200 | " \"\"\"\n",
201 | " Paginates to the next page if possible\n",
202 | " input:\n",
203 | " can_paginate [bool]: Boolean info about wheather there is additional info.\n",
204 | " next_page_token [str]: Token needed in url query param to paginate to next page.\n",
205 | " url [str]: Url used to list the needed info.\n",
206 | " workspace_token[str]: Databricks workspace token from the widget, needed for authorization.\n",
207 | " function_to_call [Callable]: Function that gets called with the paginated url to paginate further.\n",
208 | " output:\n",
209 | " None\n",
210 | " \"\"\"\n",
211 | "\n",
212 | " if next_page_token and can_paginate:\n",
213 | " if \"&page_token\" in url:\n",
214 | " url = f\"{url[:url.find('&page_token')]}&page_token={next_page_token}\"\n",
215 | " else:\n",
216 | " url = f\"{url}&page_token={next_page_token}\"\n",
217 | "\n",
218 | " function_to_call(url, workspace_token)\n",
219 | " else:\n",
220 | " return"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {
226 | "application/vnd.databricks.v1+cell": {
227 | "cellMetadata": {},
228 | "inputWidgets": {},
229 | "nuid": "6d032e6a-8afa-4130-8f0c-d37f0c69f0ff",
230 | "showTitle": false,
231 | "title": ""
232 | }
233 | },
234 | "source": [
235 | "## Steps 📊\n",
236 | "\n",
237 | "### 1. Fetch Workflows and Runs 🏃♂️\n",
238 | "\n",
239 | "This notebook begins by fetching all the [workflows](https://docs.databricks.com/api/workspace/jobs/list) in your Databricks workspace. It also retrieves information about the [runs](https://docs.databricks.com/api/workspace/runs/list) that have occurred within a specified date range, which is provided by the user.\n",
240 | "\n",
241 | "### 2. Parse the fetched info 🧩\n",
242 | "Workflows have a schedule which is defined using a `quartz_cron-expression` using which we generate the dates of next runs.\n",
243 | "\n",
244 | "### 3. Visualizations 📈\n",
245 | "\n",
246 | "The notebook provides three insightful visualizations:\n",
247 | "\n",
248 | "- **First Scheduled Run of All Workflows**: Visualizes the first scheduled run of each workflow since the start date.\n",
249 | "\n",
250 | "- **Scheduled Runs Between Start and End Date**: Shows all scheduled runs that occurred within the specified date range.\n",
251 | "\n",
252 | "- **All Runs Since Start Date with Time Taken**: Displays all runs that have occurred since the start date, plotting them along with their execution time for performance analysis.\n",
253 | "\n"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {
259 | "application/vnd.databricks.v1+cell": {
260 | "cellMetadata": {},
261 | "inputWidgets": {},
262 | "nuid": "6f8a1915-b729-448f-a87f-536f0e8e01ef",
263 | "showTitle": false,
264 | "title": ""
265 | }
266 | },
267 | "source": [
268 | "## List workflows \n",
269 | "#### Fetches all workflows in current workspace and its respective configs\n",
270 | "API Docs \n"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {
277 | "application/vnd.databricks.v1+cell": {
278 | "cellMetadata": {
279 | "byteLimit": 2048000,
280 | "rowLimit": 10000
281 | },
282 | "inputWidgets": {},
283 | "nuid": "07dcd0ca-de2a-474b-be0b-906ebb6d9bd0",
284 | "showTitle": false,
285 | "title": ""
286 | }
287 | },
288 | "outputs": [],
289 | "source": [
290 | "def getAllJobs(list_jobs_url: str, workspace_token: str) -> None:\n",
291 | " \"\"\"\n",
292 | " Fetches all the jobs and metadata about them.\n",
293 | " input:\n",
294 | " lists_jobs_url [str]: Databricks API used to fetch all the jobs.\n",
295 | " workspace_token[str]: Databricks workspace token from the widget, needed for authorization.\n",
296 | " output:\n",
297 | " None\n",
298 | " \"\"\"\n",
299 | "\n",
300 | " response = requests.get(\n",
301 | " list_jobs_url,\n",
302 | " headers=headers,\n",
303 | " )\n",
304 | " assert response.status_code == 200\n",
305 | "\n",
306 | " response_data = response.json()\n",
307 | "\n",
308 | " for job in response_data.get(\"jobs\", []):\n",
309 | " if job.get(\"settings\", {}).get(\"schedule\"):\n",
310 | " jobs[job.get(\"job_id\")] = {\n",
311 | " \"name\": job.get(\"settings\", {}).get(\"name\"),\n",
312 | " \"quartz_cron_expression\": job.get(\"settings\", {})\n",
313 | " .get(\"schedule\", {})\n",
314 | " .get(\"quartz_cron_expression\")\n",
315 | " .lower(),\n",
316 | " }\n",
317 | "\n",
318 | " paginate(\n",
319 | " response_data.get(\"has_more\", False),\n",
320 | " response_data.get(\"next_page_token\"),\n",
321 | " list_jobs_url,\n",
322 | " workspace_token,\n",
323 | " getAllJobs,\n",
324 | " )\n",
325 | "\n",
326 | "\n",
327 | "jobs = {} # holds all jobs' info\n",
328 | "\n",
329 | "list_jobs_url: str = (\n",
330 | " databricks_url\n",
331 | " + \"/api/2.1/jobs/list\"\n",
332 | " + f\"?limit={query_params.get('LIST_JOBS_LIMIT')}\"\n",
333 | " + f\"&expand_tasks={query_params['EXPAND_TASKS']}\"\n",
334 | ")\n",
335 | "\n",
336 | "getAllJobs(list_jobs_url, databricks_workspace_token)"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {
342 | "application/vnd.databricks.v1+cell": {
343 | "cellMetadata": {},
344 | "inputWidgets": {},
345 | "nuid": "e7996e9d-3e42-4be9-9d8e-5d814d1887f3",
346 | "showTitle": false,
347 | "title": ""
348 | }
349 | },
350 | "source": [
351 | "## Parse the fetched data\n",
352 | "#### Infer the cron expression and calculate the next run. \n",
353 | "#### Additionally you can also categorize workflows based on the title, as this category is what determines the colour of the plotted workflow."
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {
360 | "application/vnd.databricks.v1+cell": {
361 | "cellMetadata": {
362 | "byteLimit": 2048000,
363 | "rowLimit": 10000
364 | },
365 | "inputWidgets": {},
366 | "nuid": "fcfbf534-33c4-4f04-9157-002ba0858386",
367 | "showTitle": false,
368 | "title": ""
369 | }
370 | },
371 | "outputs": [],
372 | "source": [
373 | "def categorizeWorkflow(workflow_title: str) -> str:\n",
374 | " \"\"\"You can add custom grouping logic. as this will be used to\n",
375 | " group the workflows, as they will be coloured based on their categories\n",
376 | " in the plot.\n",
377 | " input:\n",
378 | " workflow_title : str\n",
379 | " output:\n",
380 | " category : str\n",
381 | " \"\"\"\n",
382 | "\n",
383 | " category = workflow_title # add custom logic to categorize the workflow\n",
384 | " return category\n",
385 | "\n",
386 | "\n",
387 | "for job_id, job_info in jobs.items():\n",
388 | " cron_expression = job_info[\"quartz_cron_expression\"]\n",
389 | "\n",
390 | " cron_obj = QuartzCron(\n",
391 | " schedule_string=cron_expression,\n",
392 | " start_date=start_date, # This is the start date based on which the next scheduled run is generated. You can change it as per your needs.\n",
393 | " )\n",
394 | "\n",
395 | " next_scheduled_run = cron_obj.next_trigger(isoformat=False)\n",
396 | " # print(next_scheduled_run)\n",
397 | " jobs[job_id][\"next_scheduled_run\"] = next_scheduled_run\n",
398 | " jobs[job_id][\"workflow_category\"] = categorizeWorkflow(jobs[job_id][\"name\"])"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {
404 | "application/vnd.databricks.v1+cell": {
405 | "cellMetadata": {},
406 | "inputWidgets": {},
407 | "nuid": "083dcc0f-f5fd-4a3b-b1ea-71c8cda66492",
408 | "showTitle": false,
409 | "title": ""
410 | }
411 | },
412 | "source": [
413 | "## Jitter workflows\n",
414 | "#### Sometimes workflows maybe scheduled too close to each other, this causes them to be too close to each other in the visualization, thus we jitter the workflows slighlty so as to obtain a neat visualization."
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {
421 | "application/vnd.databricks.v1+cell": {
422 | "cellMetadata": {
423 | "byteLimit": 2048000,
424 | "rowLimit": 10000
425 | },
426 | "inputWidgets": {},
427 | "nuid": "92f15423-f544-46f1-b040-9d1e6b96b895",
428 | "showTitle": false,
429 | "title": ""
430 | }
431 | },
432 | "outputs": [],
433 | "source": [
434 | "def jitterPoints(df: pd.DataFrame) -> pd.DataFrame:\n",
435 | " \"\"\"If two workflow's have schedules too close to each other\n",
436 | " then this function moves them a bit away from each other\n",
437 | " so that the visualization is neat\"\"\"\n",
438 | " # Initialize a flag to keep track of whether any adjustments were made\n",
439 | " adjusted = True\n",
440 | " max_iterations = 2 # Set a maximum number of iterations, increase if you have a lot of conflicting workflow schedules.\n",
441 | " jitter_minutes = 10 # adjust based on need\n",
442 | "\n",
443 | " iteration = 0\n",
444 | " while adjusted and iteration < max_iterations:\n",
445 | " adjusted = False\n",
446 | "\n",
447 | " for i in range(1, len(df)):\n",
448 | " diff = df[\"start_datetime\"].iloc[i] - df[\"start_datetime\"].iloc[i - 1]\n",
449 | "\n",
450 | " if diff <= timedelta(minutes=10):\n",
451 | " # Adjust the start time of the current event\n",
452 | " df[\"start_datetime\"].iloc[i] = df[\"start_datetime\"].iloc[\n",
453 | " i - 1\n",
454 | " ] + timedelta(minutes=jitter_minutes)\n",
455 | " adjusted = True\n",
456 | "\n",
457 | " iteration += 1\n",
458 | " return df"
459 | ]
460 | },
461 | {
462 | "cell_type": "markdown",
463 | "metadata": {
464 | "application/vnd.databricks.v1+cell": {
465 | "cellMetadata": {},
466 | "inputWidgets": {},
467 | "nuid": "3b24f2a5-adb0-49f2-aeb2-6f5dde794e09",
468 | "showTitle": false,
469 | "title": ""
470 | }
471 | },
472 | "source": [
473 | "## Helper Function\n",
474 | "#### Used to generate X axis tick values"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "metadata": {
481 | "application/vnd.databricks.v1+cell": {
482 | "cellMetadata": {},
483 | "inputWidgets": {},
484 | "nuid": "e9042ec1-ea36-4265-a286-b12fd44f49e6",
485 | "showTitle": false,
486 | "title": ""
487 | }
488 | },
489 | "outputs": [],
490 | "source": [
491 | "def generateXAxisTickTexts() -> list:\n",
492 | " \"\"\"Helper function used to generate x axis tick values\"\"\"\n",
493 | " temp = list(range(1, 13)) + list(range(1, 13)) # 12 hour clock entries\n",
494 | " temp = temp[-1:] + temp[:-1] # right shifting\n",
495 | " for idx in range(len(temp)): # filling the AM/PM value as its a 12 hour format\n",
496 | " if idx < len(temp) // 2:\n",
497 | " temp[idx] = f\"{temp[idx]} AM\"\n",
498 | " else:\n",
499 | " temp[idx] = f\"{temp[idx]} PM\"\n",
500 | " return temp"
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {
506 | "application/vnd.databricks.v1+cell": {
507 | "cellMetadata": {},
508 | "inputWidgets": {},
509 | "nuid": "ead81d1a-8a7e-45ff-a444-22fc0ae1cc1d",
510 | "showTitle": false,
511 | "title": ""
512 | }
513 | },
514 | "source": [
515 | "## Plot the all the result\n"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "metadata": {
522 | "application/vnd.databricks.v1+cell": {
523 | "cellMetadata": {
524 | "byteLimit": 2048000,
525 | "rowLimit": 10000
526 | },
527 | "inputWidgets": {},
528 | "nuid": "4d3b675a-9a3f-41ec-8551-2900c7616383",
529 | "showTitle": false,
530 | "title": ""
531 | }
532 | },
533 | "outputs": [],
534 | "source": [
535 | "# Adjust the plot dimensions here\n",
536 | "PLOT_HEIGHT = 700\n",
537 | "PLOT_WIDTH = 2000\n",
538 | "POINT_SIZE = 15\n",
539 | "\n",
540 | "events = [\n",
541 | " {\n",
542 | " \"name\": job_info[\"name\"],\n",
543 | " \"start_datetime\": job_info[\"next_scheduled_run\"],\n",
544 | " \"workflow_category\": job_info[\"workflow_category\"],\n",
545 | " }\n",
546 | " for job_info in jobs.values()\n",
547 | "]\n",
548 | "\n",
549 | "df = pd.DataFrame(events)\n",
550 | "\n",
551 | "df[\"start_datetime\"] = pd.to_datetime(df[\"start_datetime\"])\n",
552 | "\n",
553 | "# Sort DataFrame by 'start_datetime'\n",
554 | "df.sort_values(by=\"start_datetime\", inplace=True)\n",
555 | "\n",
556 | "# jitter closeby points\n",
557 | "df = jitterPoints(df)\n",
558 | "\n",
559 | "\n",
560 | "# Increase the size of all points by adjusting the marker size\n",
561 | "point_size = POINT_SIZE # Adjust the size as needed\n",
562 | "\n",
563 | "# Create an interactive scatter plot using Plotly Express\n",
564 | "fig = px.scatter(\n",
565 | " df,\n",
566 | " x=df[\"start_datetime\"].dt.hour\n",
567 | " + df[\"start_datetime\"].dt.minute / 60\n",
568 | " + df[\"start_datetime\"].dt.second / 3600,\n",
569 | " y=df[\"start_datetime\"].dt.strftime(\"%Y/%m/%d\"),\n",
570 | " # y= df['start_datetime'].dt.strftime('%d-%m-%y'),\n",
571 | " color=\"workflow_category\", # Color points by 'workflow_cateogry' column\n",
572 | " hover_name=\"name\", # Display event name on hover\n",
573 | " labels={\"x\": \"Time of Day (12-hour format)\", \"y\": \"Date\"},\n",
574 | " title=f\"Workflow's first run since {start_date.strftime('%Y-%m-%d')}\",\n",
575 | " template=\"plotly_white\",\n",
576 | ")\n",
577 | "\n",
578 | "\n",
579 | "# Customize the appearance of the plot\n",
580 | "fig.update_layout(\n",
581 | " xaxis=dict(\n",
582 | " tickmode=\"array\",\n",
583 | " tickvals=list(range(1, 25)),\n",
584 | " ticktext=generateXAxisTickTexts(),\n",
585 | " ),\n",
586 | " yaxis=dict(\n",
587 | " tickmode=\"array\",\n",
588 | " tickvals=list(\n",
589 | " range(\n",
590 | " 0,\n",
591 | " int((df[\"start_datetime\"].iloc[-1] - df[\"start_datetime\"].iloc[0]).days)\n",
592 | " + 10,\n",
593 | " )\n",
594 | " ),\n",
595 | " ),\n",
596 | " showlegend=True,\n",
597 | " legend_title_text=\"Workflow Category\",\n",
598 | " height=PLOT_HEIGHT, # Height of the plot\n",
599 | " width=PLOT_WIDTH, # Width of the plot\n",
600 | ")\n",
601 | "\n",
602 | "# Increase the marker size for all points\n",
603 | "fig.update_traces(marker=dict(size=point_size))\n",
604 | "\n",
605 | "# Show the interactive plot\n",
606 | "fig.show()"
607 | ]
608 | },
609 | {
610 | "cell_type": "markdown",
611 | "metadata": {
612 | "application/vnd.databricks.v1+cell": {
613 | "cellMetadata": {},
614 | "inputWidgets": {},
615 | "nuid": "a22a7529-72a6-4d56-b183-535e834cb2b6",
616 | "showTitle": false,
617 | "title": ""
618 | }
619 | },
620 | "source": [
621 | "## Calculate all the scheduled runs \n",
622 | "#### using `start_date` and `end_data` we calculate all the scheduled runs within the data range\n",
623 | "#### Using `cron-schedule-triggers` we calculate all the next scheduled runs since the mentioned `start_date` "
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": null,
629 | "metadata": {
630 | "application/vnd.databricks.v1+cell": {
631 | "cellMetadata": {
632 | "byteLimit": 2048000,
633 | "rowLimit": 10000
634 | },
635 | "inputWidgets": {},
636 | "nuid": "801365b4-09b4-4654-bd1d-018f3b38ae4b",
637 | "showTitle": false,
638 | "title": ""
639 | }
640 | },
641 | "outputs": [],
642 | "source": [
643 | "all_scheduled_runs = []\n",
644 | "for job_id, job_info in jobs.items():\n",
645 | " cron_expression = job_info[\"quartz_cron_expression\"]\n",
646 | "\n",
647 | " cron_obj = QuartzCron(\n",
648 | " schedule_string=cron_expression,\n",
649 | " start_date=start_date,\n",
650 | " )\n",
651 | "\n",
652 | " next_scheduled_run = cron_obj.next_trigger(isoformat=False)\n",
653 | " runs = []\n",
654 | " while next_scheduled_run <= end_date:\n",
655 | " runs.append(next_scheduled_run)\n",
656 | " next_scheduled_run = cron_obj.next_trigger(isoformat=False)\n",
657 | "\n",
658 | " for run in runs:\n",
659 | " all_scheduled_runs.append(\n",
660 | " {\n",
661 | " \"name\": jobs[job_id][\"name\"],\n",
662 | " \"start_datetime\": run,\n",
663 | " \"workflow_category\": jobs[job_id][\"workflow_category\"],\n",
664 | " }\n",
665 | " )"
666 | ]
667 | },
668 | {
669 | "cell_type": "markdown",
670 | "metadata": {
671 | "application/vnd.databricks.v1+cell": {
672 | "cellMetadata": {},
673 | "inputWidgets": {},
674 | "nuid": "4bdc6843-0f7b-472c-a1db-7398497399ea",
675 | "showTitle": false,
676 | "title": ""
677 | }
678 | },
679 | "source": [
680 | "## Plot the result"
681 | ]
682 | },
683 | {
684 | "cell_type": "code",
685 | "execution_count": null,
686 | "metadata": {
687 | "application/vnd.databricks.v1+cell": {
688 | "cellMetadata": {
689 | "byteLimit": 2048000,
690 | "rowLimit": 10000
691 | },
692 | "inputWidgets": {},
693 | "nuid": "da04d5ee-cf9d-441c-a1d2-95bab6a46eed",
694 | "showTitle": false,
695 | "title": ""
696 | }
697 | },
698 | "outputs": [],
699 | "source": [
700 | "# Adjust the plot dimensions here\n",
701 | "PLOT_HEIGHT = 700\n",
702 | "PLOT_WIDTH = 2000\n",
703 | "POINT_SIZE = 15\n",
704 | "\n",
705 | "\n",
706 | "df = pd.DataFrame(all_scheduled_runs)\n",
707 | "\n",
708 | "df[\"start_datetime\"] = pd.to_datetime(df[\"start_datetime\"])\n",
709 | "\n",
710 | "# Sort DataFrame by 'start_datetime'\n",
711 | "df.sort_values(by=\"start_datetime\", inplace=True)\n",
712 | "\n",
713 | "# jitter closeby points\n",
714 | "df = jitterPoints(df)\n",
715 | "\n",
716 | "# Increase the size of all points by adjusting the marker size\n",
717 | "point_size = POINT_SIZE # Adjust the size as needed\n",
718 | "\n",
719 | "# Create an interactive scatter plot using Plotly Express\n",
720 | "fig = px.scatter(\n",
721 | " df,\n",
722 | " x=df[\"start_datetime\"].dt.hour\n",
723 | " + df[\"start_datetime\"].dt.minute / 60\n",
724 | " + df[\"start_datetime\"].dt.second / 3600,\n",
725 | " y=df[\"start_datetime\"].dt.strftime(\"%Y/%m/%d\"),\n",
726 | " color=\"workflow_category\", # Color points by 'workflow_category' column\n",
727 | " hover_name=\"name\", # Display event name on hover\n",
728 | " labels={\"x\": \"Time of Day (12-hour format)\", \"y\": \"Date\"},\n",
729 | " title=f\"All Workflow runs scheduled from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\",\n",
730 | " template=\"plotly_white\",\n",
731 | ")\n",
732 | "\n",
733 | "# Customize the appearance of the plot\n",
734 | "fig.update_layout(\n",
735 | " xaxis=dict(\n",
736 | " tickmode=\"array\",\n",
737 | " tickvals=list(range(1, 25)),\n",
738 | " ticktext=generateXAxisTickTexts(),\n",
739 | " ),\n",
740 | " yaxis=dict(\n",
741 | " tickmode=\"array\",\n",
742 | " tickvals=list(\n",
743 | " range(\n",
744 | " 0,\n",
745 | " int((df[\"start_datetime\"].iloc[-1] - df[\"start_datetime\"].iloc[0]).days)\n",
746 | " + 10,\n",
747 | " )\n",
748 | " ),\n",
749 | " ),\n",
750 | " showlegend=True,\n",
751 | " legend_title_text=\"Workflow category\",\n",
752 | " height=PLOT_HEIGHT, # Height of the plot\n",
753 | " width=PLOT_WIDTH, # Width of the plot\n",
754 | ")\n",
755 | "\n",
756 | "# Increase the marker size for all points\n",
757 | "fig.update_traces(marker=dict(size=point_size))\n",
758 | "\n",
759 | "# Show the interactive plot\n",
760 | "fig.show()"
761 | ]
762 | },
763 | {
764 | "cell_type": "markdown",
765 | "metadata": {
766 | "application/vnd.databricks.v1+cell": {
767 | "cellMetadata": {},
768 | "inputWidgets": {},
769 | "nuid": "bb306f7b-fa82-4949-a4de-b1f77e35d015",
770 | "showTitle": false,
771 | "title": ""
772 | }
773 | },
774 | "source": [
775 | "## List workflow runs\n",
776 | "#### Fetch all workflow runs that have taken place since the mentioned start date. Making sure to parse the necessary info\n",
777 | "API Docs \n",
778 | "\n",
779 | "\n"
780 | ]
781 | },
782 | {
783 | "cell_type": "code",
784 | "execution_count": null,
785 | "metadata": {
786 | "application/vnd.databricks.v1+cell": {
787 | "cellMetadata": {
788 | "byteLimit": 2048000,
789 | "rowLimit": 10000
790 | },
791 | "inputWidgets": {},
792 | "nuid": "26076b8c-d7d9-4ac4-b398-2f1a587801e8",
793 | "showTitle": false,
794 | "title": ""
795 | }
796 | },
797 | "outputs": [],
798 | "source": [
799 | "all_runs_info = []\n",
800 | "\n",
801 | "\n",
802 | "def getAllRuns(list_runs_url: int, workspace_token: str) -> None:\n",
803 | " \"\"\"\n",
804 | " Fetches all the run and metadata about a given workflow.\n",
805 | " input:\n",
806 | " lists_jobs_url [str]: Databricks API used to fetch all the runs belonging to a given job.\n",
807 | " workspace_token[str]: Databricks workspace token from the widget, needed for authorization.\n",
808 | " output:\n",
809 | " None\n",
810 | " \"\"\"\n",
811 | "\n",
812 | " response = requests.get(\n",
813 | " list_runs_url,\n",
814 | " headers=headers,\n",
815 | " )\n",
816 | " assert response.status_code == 200\n",
817 | "\n",
818 | " response_data = response.json()\n",
819 | " pattern = r\"job_id=([\\w-]+)\"\n",
820 | " matched = re.search(pattern, list_runs_url)\n",
821 | " job_id = int(matched.group(1))\n",
822 | "\n",
823 | " if \"runs\" in response_data:\n",
824 | " for run_info in response_data[\"runs\"]:\n",
825 | " if (\n",
826 | " \"start_time\" in run_info\n",
827 | " and \"end_time\" in run_info\n",
828 | " and run_info[\"end_time\"]\n",
829 | " ):\n",
830 | " all_runs_info.append(\n",
831 | " {\n",
832 | " \"Task\": jobs[job_id][\"name\"],\n",
833 | " \"Start\": datetime.datetime.fromtimestamp(\n",
834 | " run_info[\"start_time\"] / 1000\n",
835 | " ),\n",
836 | " \"Finish\": datetime.datetime.fromtimestamp(\n",
837 | " run_info[\"end_time\"] / 1000\n",
838 | " ),\n",
839 | " \"Duration\": (\n",
840 | " datetime.datetime.fromtimestamp(run_info[\"end_time\"] / 1000)\n",
841 | " - datetime.datetime.fromtimestamp(\n",
842 | " run_info[\"start_time\"] / 1000\n",
843 | " )\n",
844 | " ).total_seconds()\n",
845 | " / 3600,\n",
846 | " \"workflow_category\": jobs[job_id][\"workflow_category\"],\n",
847 | " }\n",
848 | " )\n",
849 | "\n",
850 | " paginate(\n",
851 | " response_data.get(\"has_more\", False),\n",
852 | " response_data.get(\"next_page_token\"),\n",
853 | " list_runs_url,\n",
854 | " workspace_token,\n",
855 | " getAllRuns,\n",
856 | " )\n",
857 | "\n",
858 | "\n",
859 | "job_ids = list(jobs.keys())\n",
860 | "\n",
861 | "list_runs_urls = [\n",
862 | " databricks_url\n",
863 | " + \"/api/2.1/jobs/runs/list\"\n",
864 | " + f\"?job_id={job_id}\"\n",
865 | " + f\"&limit={query_params.get('LIST_RUNS_LIMIT')}\"\n",
866 | " + f\"&expand_tasks={query_params.get('EXPAND_RUNS')}\"\n",
867 | " + f\"&start_time_from={start_date.timestamp()*1000}\"\n",
868 | " for job_id in job_ids\n",
869 | "]\n",
870 | "\n",
871 | "for url in list_runs_urls:\n",
872 | " getAllRuns(url, databricks_workspace_token)"
873 | ]
874 | },
875 | {
876 | "cell_type": "markdown",
877 | "metadata": {
878 | "application/vnd.databricks.v1+cell": {
879 | "cellMetadata": {},
880 | "inputWidgets": {},
881 | "nuid": "d88250c7-78d4-4555-97ff-fcccb17aabc0",
882 | "showTitle": false,
883 | "title": ""
884 | }
885 | },
886 | "source": [
887 | "## Plot the result"
888 | ]
889 | },
890 | {
891 | "cell_type": "code",
892 | "execution_count": null,
893 | "metadata": {
894 | "application/vnd.databricks.v1+cell": {
895 | "cellMetadata": {
896 | "byteLimit": 2048000,
897 | "rowLimit": 10000
898 | },
899 | "inputWidgets": {},
900 | "nuid": "73b8e1b9-01d4-4a76-a746-2e5cdd67315b",
901 | "showTitle": false,
902 | "title": ""
903 | }
904 | },
905 | "outputs": [],
906 | "source": [
907 | "# Adjust accordingly\n",
908 | "PLOT_HEIGHT = 1500\n",
909 | "PLOT_WIDTH = 2000\n",
910 | "\n",
911 | "runs_df = pd.DataFrame(all_runs_info)\n",
912 | "\n",
913 | "runs_df[\"Start\"] = pd.to_datetime(runs_df[\"Start\"])\n",
914 | "runs_df[\"Finish\"] = pd.to_datetime(runs_df[\"Finish\"])\n",
915 | "\n",
916 | "runs_df[\"Duration\"] = (\n",
917 | " runs_df[\"Finish\"] - runs_df[\"Start\"]\n",
918 | ").dt.total_seconds() / 3600 # Duration in hours\n",
919 | "\n",
920 | "# Create a new column 'Day' representing the day for each task\n",
921 | "runs_df[\"Day\"] = runs_df[\"Start\"].dt.date\n",
922 | "runs_df.head()\n",
923 | "\n",
924 | "# Extract task, start, and end dates\n",
925 | "tasks = runs_df[\"Task\"].tolist()\n",
926 | "start_dates = runs_df[\"Start\"].tolist()\n",
927 | "end_dates = runs_df[\"Finish\"].tolist()\n",
928 | "\n",
929 | "# Create the Gantt chart\n",
930 | "fig = ff.create_gantt(\n",
931 | " runs_df,\n",
932 | " title=\"Task Duration Gantt Chart\",\n",
933 | ")\n",
934 | "\n",
935 | "fig.update_layout(\n",
936 | " height=PLOT_HEIGHT,\n",
937 | " width=PLOT_WIDTH,\n",
938 | " plot_bgcolor=\"white\",\n",
939 | " paper_bgcolor=\"white\",\n",
940 | " yaxis=dict(showgrid=True, gridcolor=\"lightgray\"),\n",
941 | " xaxis=dict(showgrid=True, gridcolor=\"lightgray\"),\n",
942 | ")\n",
943 | "\n",
944 | "fig.show()"
945 | ]
946 | }
947 | ],
948 | "metadata": {
949 | "application/vnd.databricks.v1+notebook": {
950 | "dashboards": [
951 | {
952 | "elements": [
953 | {
954 | "dashboardResultIndex": 0,
955 | "elementNUID": "da04d5ee-cf9d-441c-a1d2-95bab6a46eed",
956 | "elementType": "command",
957 | "guid": "1806ca8a-35e7-4bde-b268-8ae24f5a9614",
958 | "options": null,
959 | "position": {
960 | "height": 8,
961 | "width": 24,
962 | "x": 0,
963 | "y": 8,
964 | "z": null
965 | },
966 | "resultIndex": null
967 | },
968 | {
969 | "dashboardResultIndex": 0,
970 | "elementNUID": "73b8e1b9-01d4-4a76-a746-2e5cdd67315b",
971 | "elementType": "command",
972 | "guid": "1c1a7f68-0a81-454d-b94b-6e00aa1fdda2",
973 | "options": null,
974 | "position": {
975 | "height": 17,
976 | "width": 24,
977 | "x": 0,
978 | "y": 16,
979 | "z": null
980 | },
981 | "resultIndex": null
982 | },
983 | {
984 | "dashboardResultIndex": 0,
985 | "elementNUID": "4d3b675a-9a3f-41ec-8551-2900c7616383",
986 | "elementType": "command",
987 | "guid": "3badc786-a3b5-43a9-83bc-61236ea1cd0d",
988 | "options": {
989 | "autoScaleImg": false,
990 | "scale": 0,
991 | "showRunButton": false,
992 | "showTitle": false,
993 | "titleAlign": "center"
994 | },
995 | "position": {
996 | "height": 8,
997 | "width": 24,
998 | "x": 0,
999 | "y": 0,
1000 | "z": null
1001 | },
1002 | "resultIndex": null
1003 | }
1004 | ],
1005 | "globalVars": {},
1006 | "guid": "",
1007 | "layoutOption": {
1008 | "grid": true,
1009 | "stack": true
1010 | },
1011 | "nuid": "89804740-c7b6-44b4-9c72-2e1c14be2084",
1012 | "origId": 3789653585954506,
1013 | "title": "Schedule Viz",
1014 | "version": "DashboardViewV1",
1015 | "width": 1440
1016 | }
1017 | ],
1018 | "language": "python",
1019 | "notebookMetadata": {
1020 | "mostRecentlyExecutedCommandWithImplicitDF": {
1021 | "commandId": 1634724413475231,
1022 | "dataframes": [
1023 | "_sqldf"
1024 | ]
1025 | },
1026 | "pythonIndentUnit": 4,
1027 | "widgetLayout": [
1028 | {
1029 | "breakBefore": false,
1030 | "name": "databricks_url",
1031 | "width": 229
1032 | },
1033 | {
1034 | "breakBefore": false,
1035 | "name": "databricks_workspace_token",
1036 | "width": 229
1037 | },
1038 | {
1039 | "breakBefore": false,
1040 | "name": "start_date",
1041 | "width": 229
1042 | },
1043 | {
1044 | "breakBefore": false,
1045 | "name": "end_date",
1046 | "width": 229
1047 | }
1048 | ]
1049 | },
1050 | "notebookName": "workflow_calender",
1051 | "widgets": {
1052 | "databricks_url": {
1053 | "currentValue": "",
1054 | "nuid": "1252ccd1-8501-4afb-96d1-fd2d12a60852",
1055 | "widgetInfo": {
1056 | "defaultValue": "",
1057 | "label": null,
1058 | "name": "databricks_url",
1059 | "options": {
1060 | "validationRegex": null,
1061 | "widgetType": "text"
1062 | },
1063 | "widgetType": "text"
1064 | }
1065 | },
1066 | "databricks_workspace_token": {
1067 | "currentValue": "",
1068 | "nuid": "7944ddb4-88e5-4041-8773-64bf5327fd25",
1069 | "widgetInfo": {
1070 | "defaultValue": "",
1071 | "label": null,
1072 | "name": "databricks_workspace_token",
1073 | "options": {
1074 | "validationRegex": null,
1075 | "widgetType": "text"
1076 | },
1077 | "widgetType": "text"
1078 | }
1079 | },
1080 | "end_date": {
1081 | "currentValue": "2023-10-14",
1082 | "nuid": "dc84215a-1528-4af8-83de-d407d7bcc6ad",
1083 | "widgetInfo": {
1084 | "defaultValue": "2023-11-05",
1085 | "label": null,
1086 | "name": "end_date",
1087 | "options": {
1088 | "validationRegex": null,
1089 | "widgetType": "text"
1090 | },
1091 | "widgetType": "text"
1092 | }
1093 | },
1094 | "start_date": {
1095 | "currentValue": "2023-10-08",
1096 | "nuid": "a254d69f-7ac4-4911-b323-5f60de54125b",
1097 | "widgetInfo": {
1098 | "defaultValue": "2023-10-01",
1099 | "label": null,
1100 | "name": "start_date",
1101 | "options": {
1102 | "validationRegex": null,
1103 | "widgetType": "text"
1104 | },
1105 | "widgetType": "text"
1106 | }
1107 | }
1108 | }
1109 | },
1110 | "kernelspec": {
1111 | "display_name": "Python 3",
1112 | "language": "python",
1113 | "name": "python3"
1114 | },
1115 | "language_info": {
1116 | "codemirror_mode": {
1117 | "name": "ipython",
1118 | "version": 3
1119 | },
1120 | "file_extension": ".py",
1121 | "mimetype": "text/x-python",
1122 | "name": "python",
1123 | "nbconvert_exporter": "python",
1124 | "pygments_lexer": "ipython3",
1125 | "version": "3.10.11"
1126 | }
1127 | },
1128 | "nbformat": 4,
1129 | "nbformat_minor": 0
1130 | }
1131 |
--------------------------------------------------------------------------------
/notebooks/workflow_config_exporter/README.md:
--------------------------------------------------------------------------------
1 | # Backup your Databricks Workflows 🗃
2 |
3 |
4 |
5 |
6 | ## Introduction
7 |
8 | This notebook fetches the jobs config from a workspace and writes it to disk, helping you keep a backup of your workflow config information. While such a backup can also be created using Terraform or other Infrastructure-as-Code providers, this approach produces it using the vanilla JSON of the Databricks REST API.
9 |
10 | ## Use Cases
11 |
12 | Areas where such a notebook may be helpful:
13 |
14 | 1. Backing up workflow config information so workflows can be restored via the Databricks REST API
15 | 2. Version control of workflow config information
16 |
17 | ---
18 |
19 | ---
20 | See more details in the notebook (ipynb)
21 |
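22 | ## Example
23 | 
24 | A minimal sketch of the core idea, assuming the Databricks Python SDK (`databricks-sdk`) and ambient workspace authentication; the output filename below is just an illustrative choice, and the notebook itself handles widgets, parsing, and `dbutils` paths:
25 | 
26 | ```python
27 | # Sketch: list all jobs in the workspace and write their settings to a JSON file.
28 | import json
29 | from datetime import datetime, timezone
30 | 
31 | from databricks.sdk import WorkspaceClient
32 | 
33 | w = WorkspaceClient()  # picks up DATABRICKS_HOST / DATABRICKS_TOKEN from the environment
34 | 
35 | # expand_tasks=True includes each job's task and job-cluster configs in its settings
36 | jobs = {job.job_id: job.settings.as_dict() for job in w.jobs.list(expand_tasks=True)}
37 | 
38 | backup_name = f"jobs_backup_{datetime.now(timezone.utc):%Y%m%d}.json"  # example filename
39 | with open(backup_name, "w") as f:
40 |     json.dump(jobs, f, indent=2)
41 | ```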
--------------------------------------------------------------------------------
/notebooks/workflow_config_exporter/assets/example_config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/workflow_config_exporter/assets/example_config.png
--------------------------------------------------------------------------------
/notebooks/workflow_config_exporter/workflow_config_exporter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {
8 | "byteLimit": 2048000,
9 | "rowLimit": 10000
10 | },
11 | "inputWidgets": {},
12 | "nuid": "4686ad81-9fd0-4c93-9a48-b11576dc4edf",
13 | "showTitle": false,
14 | "tableResultSettingsMap": {},
15 | "title": ""
16 | }
17 | },
18 | "source": [
19 | "# Backup your Databricks Workflows 🗃\n",
20 | "\n",
21 | "## Requirements\n",
22 | "\n",
23 | "### Databricks\n",
24 | "\n",
25 | "* At least one runnable cluster within the workspace\n",
26 | "\n",
27 | "\n",
28 | "### Parameters\n",
29 | "\n",
30 | "| Parameter Name | Parameter Description | Example Value |\n",
31 | "| --- | --- | --- |\n",
32 | "| `backup_file_path` | The file path (prefix) to the destination where the backup file will be stored. **Don't include filename in path**. | `s3://my-databricks-backups/jobs` |\n",
33 | "\n",
34 | "\n",
35 | "### Steps\n",
36 | "\n",
37 | "#### Fetch Job Configurations\n",
38 | "\n",
39 | "We fetch all the workflows present in your workspace, each fetched workflow config will also contain the individual task config present in the workflow and their respective job cluster configs. [Databricks API documentation](https://docs.databricks.com/api/workspace/jobs/list). \n",
40 | "\n",
41 | "#### Parse Information \n",
42 | "\n",
43 | "In this step we parse the obtained config info. The main thing to keep in mind is that the cluster config contains some fields which are populated after the cluster is initialized but will be fetched anyway from step 1, we need to remove this field or else when we use the same config to create the workflow later it will throw an error. You can also add any custom logic here. For example: You can include webhook notification ID to be associated with a workflow you like, You can also associate an existing all-purpose-compute to a workflow that you want, etc. \n",
44 | "\n",
45 | "#### Save Configuration to JSON 💾\n",
46 | "\n",
47 | "We later save the config to file, if you have a mounted s3 bucket or an azure data lake storage you can direcly specify the path as dbutils will take care of the rest. If you are running the notebook locally then you will need to change the code and use python's inbuilt `open` function to get the task done."
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {
53 | "application/vnd.databricks.v1+cell": {
54 | "cellMetadata": {
55 | "byteLimit": 2048000,
56 | "rowLimit": 10000
57 | },
58 | "inputWidgets": {},
59 | "nuid": "f8b80921-ff93-4b60-8b9d-ad26c4b909c8",
60 | "showTitle": false,
61 | "tableResultSettingsMap": {},
62 | "title": ""
63 | }
64 | },
65 | "source": [
66 | "### Imports"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 0,
72 | "metadata": {
73 | "application/vnd.databricks.v1+cell": {
74 | "cellMetadata": {
75 | "byteLimit": 2048000,
76 | "rowLimit": 10000
77 | },
78 | "inputWidgets": {},
79 | "nuid": "fb9a509f-a4c5-4d06-9d93-1a52c0be1322",
80 | "showTitle": false,
81 | "tableResultSettingsMap": {},
82 | "title": ""
83 | }
84 | },
85 | "outputs": [],
86 | "source": [
87 | "from collections import defaultdict\n",
88 | "from datetime import datetime\n",
89 | "import json\n",
90 | "import re\n",
91 | "from typing import Optional, Callable\n",
92 | "\n",
93 | "from databricks.sdk import WorkspaceClient\n",
94 | "from databricks.sdk.service.jobs import JobSettings"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {
100 | "application/vnd.databricks.v1+cell": {
101 | "cellMetadata": {
102 | "byteLimit": 2048000,
103 | "rowLimit": 10000
104 | },
105 | "inputWidgets": {},
106 | "nuid": "03f51bef-dc97-4b08-bf45-c49e11db1076",
107 | "showTitle": false,
108 | "tableResultSettingsMap": {},
109 | "title": ""
110 | }
111 | },
112 | "source": [
113 | "## Inputs\n"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 0,
119 | "metadata": {
120 | "application/vnd.databricks.v1+cell": {
121 | "cellMetadata": {
122 | "byteLimit": 2048000,
123 | "rowLimit": 10000
124 | },
125 | "inputWidgets": {},
126 | "nuid": "0ba8b199-65cc-4dfe-8926-dbc8f28a38b9",
127 | "showTitle": false,
128 | "tableResultSettingsMap": {},
129 | "title": ""
130 | }
131 | },
132 | "outputs": [],
133 | "source": [
134 | "dbutils.widgets.removeAll()\n",
135 | "dbutils.widgets.text(\"backup_file_path\", \"\")\n",
136 | "backup_file_path: str = getArgument(\"backup_file_path\")\n",
137 | "\n",
138 | "w = WorkspaceClient()\n",
139 | "\n",
140 | "query_params = {\n",
141 | " \"LIST_JOBS_LIMIT\": 100, # max limit\n",
142 | " \"EXPAND_TASKS\": \"true\", # provides the complete config info for each job\n",
143 | "}"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {
149 | "application/vnd.databricks.v1+cell": {
150 | "cellMetadata": {
151 | "byteLimit": 2048000,
152 | "rowLimit": 10000
153 | },
154 | "inputWidgets": {},
155 | "nuid": "004273ff-e821-415c-b57e-74eccd0b2253",
156 | "showTitle": false,
157 | "tableResultSettingsMap": {},
158 | "title": ""
159 | }
160 | },
161 | "source": [
162 | "## List workflows \n",
163 | "\n",
164 | "Fetches all workflows in current workspace and its respective configs"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 0,
170 | "metadata": {
171 | "application/vnd.databricks.v1+cell": {
172 | "cellMetadata": {
173 | "byteLimit": 2048000,
174 | "rowLimit": 10000
175 | },
176 | "inputWidgets": {},
177 | "nuid": "1b13f2e7-238b-4a11-9c78-acab6c09f479",
178 | "showTitle": false,
179 | "tableResultSettingsMap": {},
180 | "title": ""
181 | }
182 | },
183 | "outputs": [],
184 | "source": [
185 | "jobs: dict[int, dict] = {}\n",
186 | "\n",
187 | "# Use the SDK's built-in paginator\n",
188 | "for job in w.jobs.list(expand_tasks=query_params[\"EXPAND_TASKS\"], limit=query_params[\"LIST_JOBS_LIMIT\"]):\n",
189 | " jobs[job.job_id] = job.settings.as_dict()"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {
195 | "application/vnd.databricks.v1+cell": {
196 | "cellMetadata": {
197 | "byteLimit": 2048000,
198 | "rowLimit": 10000
199 | },
200 | "inputWidgets": {},
201 | "nuid": "9ac4ea31-c68f-4e86-9208-403ae6023b08",
202 | "showTitle": false,
203 | "tableResultSettingsMap": {},
204 | "title": ""
205 | }
206 | },
207 | "source": [
208 | "## Parse the fetched data\n",
209 | "\n",
210 | "This is needed because the cluster config info in each task contains some current workspace specific properties, which are populated after cluster initialization, thus it needs to be removed."
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 0,
216 | "metadata": {
217 | "application/vnd.databricks.v1+cell": {
218 | "cellMetadata": {
219 | "byteLimit": 2048000,
220 | "rowLimit": 10000
221 | },
222 | "inputWidgets": {},
223 | "nuid": "e48c33f2-3271-4f1b-a80e-f79ab33535c3",
224 | "showTitle": false,
225 | "tableResultSettingsMap": {},
226 | "title": ""
227 | }
228 | },
229 | "outputs": [],
230 | "source": [
231 | "def parse_jobs(job_info: JobSettings) -> dict:\n",
232 | " \"\"\"\n",
233 | " input:\n",
234 | " job_info [JobSettings]: JobSettings object from the SDK.\n",
235 | " output:\n",
236 | " dict : Parsed dictionary.\n",
237 | " \"\"\"\n",
238 | " job_dict = job_info.as_dict()\n",
239 | "\n",
240 | " for cluster_info in job_dict.get(\"job_clusters\", []):\n",
241 | " new_cluster = cluster_info.get(\"new_cluster\", {})\n",
242 | " if \"aws_attributes\" in new_cluster:\n",
243 | " new_cluster.pop(\"aws_attributes\")\n",
244 | "\n",
245 | " return job_dict"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 0,
251 | "metadata": {
252 | "application/vnd.databricks.v1+cell": {
253 | "cellMetadata": {
254 | "byteLimit": 2048000,
255 | "rowLimit": 10000
256 | },
257 | "inputWidgets": {},
258 | "nuid": "93df3fd2-b654-419f-a0cf-acac81aedd87",
259 | "showTitle": false,
260 | "tableResultSettingsMap": {},
261 | "title": ""
262 | }
263 | },
264 | "outputs": [],
265 | "source": [
266 | "for job_id, job_settings in jobs.items():\n",
267 | " parsed = parse_jobs(JobSettings.from_dict(job_settings))\n",
268 | " jobs[job_id] = parsed"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {
274 | "application/vnd.databricks.v1+cell": {
275 | "cellMetadata": {
276 | "byteLimit": 2048000,
277 | "rowLimit": 10000
278 | },
279 | "inputWidgets": {},
280 | "nuid": "84940d82-3c43-4af8-a5a8-54e81712dd31",
281 | "showTitle": false,
282 | "tableResultSettingsMap": {},
283 | "title": ""
284 | }
285 | },
286 | "source": [
287 | "\n",
288 | "## Backup Job Config\n",
289 | "\n",
290 | "Write the obtained config json to disk of your choice"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 0,
296 | "metadata": {
297 | "application/vnd.databricks.v1+cell": {
298 | "cellMetadata": {
299 | "byteLimit": 2048000,
300 | "rowLimit": 10000
301 | },
302 | "inputWidgets": {},
303 | "nuid": "80000619-68c6-4d1f-a234-6c459dc8463c",
304 | "showTitle": false,
305 | "tableResultSettingsMap": {},
306 | "title": ""
307 | }
308 | },
309 | "outputs": [],
310 | "source": [
311 | "assert len(jobs.keys()) > 1, \"No Jobs Found\""
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 0,
317 | "metadata": {
318 | "application/vnd.databricks.v1+cell": {
319 | "cellMetadata": {
320 | "byteLimit": 2048000,
321 | "rowLimit": 10000
322 | },
323 | "inputWidgets": {},
324 | "nuid": "fe85be21-6d6c-4857-bbf7-bfe52367f30c",
325 | "showTitle": false,
326 | "tableResultSettingsMap": {},
327 | "title": ""
328 | }
329 | },
330 | "outputs": [],
331 | "source": [
332 | "backup_file_path_modded: str = backup_file_path + \"/\" + str(datetime.utcnow().date()).replace(\"-\",\"\") + \".json\"\n",
333 | "backup_file_path_modded"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 0,
339 | "metadata": {
340 | "application/vnd.databricks.v1+cell": {
341 | "cellMetadata": {
342 | "byteLimit": 2048000,
343 | "rowLimit": 10000
344 | },
345 | "inputWidgets": {},
346 | "nuid": "14159c89-9c1d-4117-bcd6-b36766d869bf",
347 | "showTitle": false,
348 | "tableResultSettingsMap": {},
349 | "title": ""
350 | }
351 | },
352 | "outputs": [],
353 | "source": [
354 | "store_flag = None\n",
355 | "\n",
356 | "store_flag: bool = dbutils.fs.put(\n",
357 | " backup_file_path_modded, json.dumps(jobs), overwrite=False\n",
358 | ")\n",
359 | "\n",
360 | "if not store_flag or store_flag is None:\n",
361 | " raise ValueError(\"Unable to Write Jobs Backup\")"
362 | ]
363 | }
364 | ],
365 | "metadata": {
366 | "application/vnd.databricks.v1+notebook": {
367 | "computePreferences": null,
368 | "dashboards": [],
369 | "environmentMetadata": null,
370 | "inputWidgetPreferences": null,
371 | "language": "python",
372 | "notebookMetadata": {
373 | "pythonIndentUnit": 4
374 | },
375 | "notebookName": "workflow_config_exporter",
376 | "widgets": {
377 | "backup_file_path": {
378 | "currentValue": "s3://dotlas-databricks/jobs",
379 | "nuid": "cbe01358-1720-400b-b9a7-6a1642e1515a",
380 | "typedWidgetInfo": {
381 | "autoCreated": false,
382 | "defaultValue": "",
383 | "label": null,
384 | "name": "backup_file_path",
385 | "options": {
386 | "widgetDisplayType": "Text",
387 | "validationRegex": null
388 | },
389 | "parameterDataType": "String"
390 | },
391 | "widgetInfo": {
392 | "widgetType": "text",
393 | "defaultValue": "",
394 | "label": null,
395 | "name": "backup_file_path",
396 | "options": {
397 | "widgetType": "text",
398 | "autoCreated": null,
399 | "validationRegex": null
400 | }
401 | }
402 | }
403 | }
404 | },
405 | "kernelspec": {
406 | "display_name": "env",
407 | "language": "python",
408 | "name": "python3"
409 | },
410 | "language_info": {
411 | "name": "python"
412 | }
413 | },
414 | "nbformat": 4,
415 | "nbformat_minor": 0
416 | }
417 |
--------------------------------------------------------------------------------