├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md └── notebooks ├── databricks_daily_cost_slack ├── README.md ├── databricks_cost_slack.ipynb └── databricks_daily_cost.png ├── delta_docs_pydantic ├── README.md └── delta_docs_pydantic.ipynb ├── migrate_workspace ├── README.md └── migrate_workspace.ipynb ├── pandas_delta ├── README.md ├── assets │ ├── databricks_sql_python.png │ └── unity_catalog_cluster.png └── pandas_delta.ipynb ├── update_job_cluster ├── README.md └── update_job_cluster.ipynb ├── workflow_calendar ├── README.md ├── assets │ └── example_viz.png └── workflow_calender.ipynb └── workflow_config_exporter ├── README.md ├── assets └── example_config.png └── workflow_config_exporter.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `databricks_helpers` 👷🏾‍♀️ 2 | 3 | ## Contribution Guidelines 4 | 5 | - 🍴 **Fork the Repository:** Start by forking the repository to make your contributions. 6 | 7 | - 🌲 **Create a New Branch:** Always create a new branch for your contributions (`git checkout -b feature-branch`). 8 | 9 | - 🔐 **Handle Secrets Carefully:** Ensure that Databricks secrets, tokens, cluster configurations, or hostnames are never exposed publicly. 10 | 11 | - 📙 **Export as .ipynb:** Ensure that Databricks notebooks are exported as `.ipynb` files. 12 | 13 | - 📃 **Ensure Relevance:** Contributions should be directly related to Databricks, offering unique insights not found in official documentation or other common repositories. 14 | 15 | - 🔖 **Document Your Code:** Ensure your code is well-documented, explaining the purpose and functionality of your contribution. 16 | 17 | - 🧑🏻‍💻 **Follow Coding Conventions:** Ensure your code aligns with existing coding conventions for consistency and readability. Format your code cells. Our team can assist with this during PR review. 18 | 19 | - 🔨 **Test Your Code:** Ensure your code is thoroughly tested to maintain the repository's quality and reliability. 20 | 21 | - 🎫 **Submitting Issues:** Feel free to submit issues and enhancement requests, ensuring they are well-described and labeled. 22 | 23 | - 🤝 **Submitting Pull Requests:** Make sure your code is in a new branch and submit a pull request, ensuring it's well-described. 24 | 25 | Thank you for contributing to `databricks_helpers`! ❤️ 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Dotlas 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Databricks Helpers 🧱

[Badges: Databricks · Delta · Plotly · Pydantic]

Easy-to-use Databricks Notebooks for Admin Tasks.
Made with ❤️ by Dotlas Inc

15 | 16 | ## About 17 | 18 | This repository contains a directory of Databricks notebooks that assists with administrative tasks for Databricks, or otherwise helps as a supporting utility. 19 | 20 | For example, consider the following use-cases: 21 | 22 | * 📆 View a calendar of scheduled jobs to resolve conflicts in Databricks workflows. 23 | * 🐼 Upload a [Pandas](https://pypi.org/project/pandas) DataFrame to Delta Lake 24 | * 📑 Update Delta Lake table Documentation using [Pydantic](https://docs.pydantic.dev/latest/) Models 25 | * ➿ Migrate Jobs between Databricks workspaces 26 | * ⚙️ Mass-edit Job Clusters in Existing Jobs 27 | 28 | ## Directory 29 | 30 | | Notebook | Description | 31 | | --- | --- | 32 | | [Databricks Daily Cost to Slack](./notebooks/databricks_daily_cost_slack) | Schedule a daily cost summary of Databricks bills to notify you on Slack. | 33 | | [Workflow Calendar](./notebooks/workflow_calendar/README.md) | Visualize scheduled Jobs on a calendar, eyeball conflicts and view historic runs as a [Gantt](https://en.wikipedia.org/wiki/Gantt_chart) chart | 34 | | [Delta Docs with Pydantic](./notebooks/delta_docs_pydantic/README.md) | If you have pydantic models with fields containing `description` and `tags` that are used as data models, transfer these field descriptions to Delta lake columns as comments and tags. | 35 | | [Pandas to Delta](./notebooks/pandas_delta/README.md) | Use [databricks-sql-python](https://github.com/databricks/databricks-sql-python/) and [SQLAlchemy](https://pypi.org/project/sqlalchemy/) to upload a Pandas DataFrame to Delta Lake from outside a Databricks environment | 36 | | [Workspace Jobs Migration](./notebooks/migrate_workspace/README.md) | Migrate Workflows from one Databricks workspace to another | 37 | | [Job Cluster Update](./notebooks/update_job_cluster/README.md) | Use the Databricks API to mass-update Job and Task configs | 38 | | [Workflow Config Exporter](./notebooks/workflow_config_exporter/README.md) | Export existing workflow configuration and save it for future consumption | 39 | 40 | ## Discussions 41 | 42 | * Check out the launch discussion on this [LinkedIn Release Post](https://www.linkedin.com/feed/update/urn:li:activity:7119179773444030465), with a highlight from Databricks CEO, [Ali Ghodsi](https://www.linkedin.com/in/alighodsi). 43 | * Feel free to raise an issue on this repository to start a discussion about new features, bug fixes or enhancements. 44 | * See [CONTRIBUTING.md](./CONTRIBUTING.md) for guidelines when adding or modifying notebooks in this repository. 
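## Running these notebooks programmatically

Most of the notebooks in the directory above read their inputs from Databricks widgets, so once imported into a workspace they can be triggered on demand or on a schedule. The sketch below is illustrative only — the workspace URL, token, notebook path, cluster ID and widget values are placeholders, not values shipped with this repository — and uses the Jobs API one-time `runs/submit` endpoint:

```python
import requests

# Placeholders: substitute your own workspace host, token, cluster and notebook path.
HOST = "https://dbc-xxxx.cloud.databricks.com"
TOKEN = "<databricks-workspace-access-token>"

payload = {
    "run_name": "ad-hoc databricks_helpers run",
    "tasks": [
        {
            "task_key": "run_helper_notebook",
            "existing_cluster_id": "<cluster-id>",
            "notebook_task": {
                # Path where the notebook was imported in your workspace (hypothetical).
                "notebook_path": "/Shared/databricks_helpers/databricks_cost_slack",
                # Widget values the notebook reads with dbutils.widgets.get(...).
                "base_parameters": {"DAY_COUNT": "1", "SLACK_WEBHOOK": "<slack-webhook-url>"},
            },
        }
    ],
}

# Submit a one-time run; use POST /api/2.1/jobs/create instead to attach a cron schedule.
response = requests.post(
    f"{HOST}/api/2.1/jobs/runs/submit",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=payload,
)
response.raise_for_status()
print(response.json())  # contains the run_id of the submitted run
```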
45 | -------------------------------------------------------------------------------- /notebooks/databricks_daily_cost_slack/README.md: -------------------------------------------------------------------------------- 1 | # Databricks Daily Cost - Slack 2 | 3 | ![](./databricks_daily_cost.png) 4 | -------------------------------------------------------------------------------- /notebooks/databricks_daily_cost_slack/databricks_cost_slack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": { 8 | "byteLimit": 2048000, 9 | "rowLimit": 10000 10 | }, 11 | "inputWidgets": {}, 12 | "nuid": "2101dea2-6085-4982-aec2-b961bd745dd1", 13 | "showTitle": false, 14 | "title": "" 15 | } 16 | }, 17 | "source": [ 18 | "# Databricks Cost to Slack 🧱\n", 19 | "\n", 20 | "![](https://img.shields.io/badge/Slack-4A154B.svg?style=for-the-badge&logo=Slack&logoColor=white)\n", 21 | "![](https://img.shields.io/badge/Databricks-FF3621.svg?style=for-the-badge&logo=Databricks&logoColor=white)\n", 22 | "\n", 23 | "> Authors: [Eshwaran Venkat](github.com/cricksmaidiene)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 0, 29 | "metadata": { 30 | "application/vnd.databricks.v1+cell": { 31 | "cellMetadata": { 32 | "byteLimit": 2048000, 33 | "rowLimit": 10000 34 | }, 35 | "inputWidgets": {}, 36 | "nuid": "07c123a1-3fa9-47e6-ac0d-545959aaee6e", 37 | "showTitle": true, 38 | "title": "Install Packages" 39 | } 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "!pip install tabulate -q" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 0, 49 | "metadata": { 50 | "application/vnd.databricks.v1+cell": { 51 | "cellMetadata": { 52 | "byteLimit": 2048000, 53 | "rowLimit": 10000 54 | }, 55 | "inputWidgets": {}, 56 | "nuid": "3b6078c1-599f-4805-bc32-d92fa236258c", 57 | "showTitle": true, 58 | "title": "Imports" 59 | } 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "import warnings\n", 64 | "warnings.filterwarnings(\"ignore\")\n", 65 | "\n", 66 | "import pandas as pd\n", 67 | "import os\n", 68 | "import requests" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 0, 74 | "metadata": { 75 | "application/vnd.databricks.v1+cell": { 76 | "cellMetadata": { 77 | "byteLimit": 2048000, 78 | "rowLimit": 10000 79 | }, 80 | "inputWidgets": {}, 81 | "nuid": "960f5304-ba21-4664-a960-019843601c98", 82 | "showTitle": true, 83 | "title": "Declare and Consume Notebook Parameters" 84 | } 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "dbutils.widgets.removeAll()\n", 89 | "dbutils.widgets.text(\"DAY_COUNT\", \"2\")\n", 90 | "dbutils.widgets.text(\"SLACK_WEBHOOK\", \"\")\n", 91 | "\n", 92 | "N: int = int(dbutils.widgets.get(\"DAY_COUNT\")) # Example value for N days ago, adjust as needed\n", 93 | "SLACK_WEBHOOK = dbutils.widgets.get(\"SLACK_WEBHOOK\")\n", 94 | "\n", 95 | "assert all([N, SLACK_WEBHOOK]), \"One or more required parameters not set\"" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 0, 101 | "metadata": { 102 | "application/vnd.databricks.v1+cell": { 103 | "cellMetadata": { 104 | "byteLimit": 2048000, 105 | "rowLimit": 10000 106 | }, 107 | "inputWidgets": {}, 108 | "nuid": "10496414-7426-43ea-9b61-2eee1bc520c0", 109 | "showTitle": true, 110 | "title": "Declare Slack Message Payload Template" 111 | } 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "# The message payload using 
Block Kit for formatting\n", 116 | "message_payload = {\n", 117 | " \"blocks\": [\n", 118 | " {\n", 119 | " \"type\": \"section\",\n", 120 | " \"text\": {\n", 121 | " \"type\": \"mrkdwn\",\n", 122 | " \"text\": \"\"\n", 123 | " }\n", 124 | " },\n", 125 | " {\n", 126 | " \"type\": \"divider\"\n", 127 | " },\n", 128 | " {\n", 129 | " \"type\": \"section\",\n", 130 | " \"text\": {\n", 131 | " \"type\": \"mrkdwn\",\n", 132 | " \"text\": \"\"\n", 133 | " }\n", 134 | " }\n", 135 | " ]\n", 136 | "}" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 0, 142 | "metadata": { 143 | "application/vnd.databricks.v1+cell": { 144 | "cellMetadata": { 145 | "byteLimit": 2048000, 146 | "rowLimit": 10000 147 | }, 148 | "inputWidgets": {}, 149 | "nuid": "47bec7b8-8abd-4fd4-9c8c-b0867798d307", 150 | "showTitle": true, 151 | "title": "Read Billing Data" 152 | } 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "df = spark.sql(\n", 157 | " f\"\"\"\n", 158 | " SELECT *, identity_metadata.run_as as user FROM system.billing.usage \n", 159 | " WHERE usage_date >= date_trunc('day', NOW()) - interval '{N} day' \n", 160 | " AND usage_date < date_trunc('day', NOW()) - interval '{N-1} day'\n", 161 | " \"\"\"\n", 162 | ").toPandas()\n", 163 | "\n", 164 | "print(df.shape)\n", 165 | "df.head()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 0, 171 | "metadata": { 172 | "application/vnd.databricks.v1+cell": { 173 | "cellMetadata": { 174 | "byteLimit": 2048000, 175 | "implicitDf": true, 176 | "rowLimit": 10000 177 | }, 178 | "inputWidgets": {}, 179 | "nuid": "15612e57-f6e2-40ca-9c22-b02fee194135", 180 | "showTitle": true, 181 | "title": "Calculate cost by SKU" 182 | } 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "dbu_usd_prices = spark.sql(\n", 187 | " f\"\"\"\n", 188 | " SELECT sku_name, round(pricing.default, 2) as usd_price \n", 189 | " FROM system.billing.list_prices \n", 190 | " WHERE sku_name in (\n", 191 | " SELECT sku_name FROM system.billing.usage \n", 192 | " WHERE usage_date >= date_trunc('day', NOW()) - interval '{N} day' \n", 193 | " AND usage_date < date_trunc('day', NOW()) - interval '{N-1} day'\n", 194 | " )\n", 195 | " \"\"\"\n", 196 | ").toPandas()\n", 197 | "\n", 198 | "print(dbu_usd_prices.shape)\n", 199 | "dbu_usd_prices.head()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 0, 205 | "metadata": { 206 | "application/vnd.databricks.v1+cell": { 207 | "cellMetadata": { 208 | "byteLimit": 2048000, 209 | "rowLimit": 10000 210 | }, 211 | "inputWidgets": {}, 212 | "nuid": "103a8a1a-b2c4-4933-8e63-0543414257bc", 213 | "showTitle": true, 214 | "title": "USD Calculation" 215 | } 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "df[\"usd\"] = (\n", 220 | " df[\"sku_name\"].map(dbu_usd_prices.set_index(\"sku_name\").to_dict()[\"usd_price\"])\n", 221 | " * df[\"usage_quantity\"]\n", 222 | ")\n", 223 | "df[\"usd\"] = df[\"usd\"].astype(float).round(2)\n", 224 | "print(df[\"usd\"].info())" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 0, 230 | "metadata": { 231 | "application/vnd.databricks.v1+cell": { 232 | "cellMetadata": { 233 | "byteLimit": 2048000, 234 | "rowLimit": 10000 235 | }, 236 | "inputWidgets": {}, 237 | "nuid": "10f51852-a121-4c5f-900c-6079d0036cdd", 238 | "showTitle": true, 239 | "title": "Calculate Final Daily Bill" 240 | } 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "report = df.groupby([\"billing_origin_product\"])[[\"usd\"]].sum().reset_index()\n", 245 | 
"report.columns = [\"product\", \"cost\"]\n", 246 | "report['product'] = report['product'].str.replace(\"_\", \" \").str.title()\n", 247 | "report" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 0, 253 | "metadata": { 254 | "application/vnd.databricks.v1+cell": { 255 | "cellMetadata": { 256 | "byteLimit": 2048000, 257 | "rowLimit": 10000 258 | }, 259 | "inputWidgets": {}, 260 | "nuid": "d80ac4cc-4a32-48bb-b726-55f9ee7c119f", 261 | "showTitle": true, 262 | "title": "Prepare Messages" 263 | } 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "usage_date = df[\"usage_date\"].iloc[0].strftime(r\"%b %d\")\n", 268 | "day_cost = report[\"cost\"].sum()\n", 269 | "message_color = \"info\" if day_cost < 22 else \"error\"\n", 270 | "\n", 271 | "message_title = f\"🧱 *Databricks Cost* for *{usage_date}* is *${day_cost:,.2f}*\"\n", 272 | "message_log = f\"```{report.to_markdown(index=False)}```\"\n", 273 | "\n", 274 | "message_payload['blocks'][0]['text']['text'] = message_title\n", 275 | "message_payload['blocks'][2]['text']['text'] = message_log" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 0, 281 | "metadata": { 282 | "application/vnd.databricks.v1+cell": { 283 | "cellMetadata": { 284 | "byteLimit": 2048000, 285 | "rowLimit": 10000 286 | }, 287 | "inputWidgets": {}, 288 | "nuid": "091f0117-c47e-4c99-85e9-0dab4ca6a462", 289 | "showTitle": true, 290 | "title": "Post to Slack using Incoming Webhooks" 291 | } 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "# Sending the POST request to the Slack webhook URL\n", 296 | "response = requests.post(SLACK_WEBHOOK, json=message_payload)\n", 297 | "assert response.status_code == 200, \"Unable to post to slack\"" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 0, 303 | "metadata": { 304 | "application/vnd.databricks.v1+cell": { 305 | "cellMetadata": {}, 306 | "inputWidgets": {}, 307 | "nuid": "ae70af1e-12eb-4c34-b201-84eadf746212", 308 | "showTitle": false, 309 | "title": "" 310 | } 311 | }, 312 | "outputs": [], 313 | "source": [] 314 | } 315 | ], 316 | "metadata": { 317 | "application/vnd.databricks.v1+notebook": { 318 | "dashboards": [], 319 | "environmentMetadata": { 320 | "base_environment": "", 321 | "client": "1" 322 | }, 323 | "language": "python", 324 | "notebookMetadata": { 325 | "mostRecentlyExecutedCommandWithImplicitDF": { 326 | "commandId": -1, 327 | "dataframes": [ 328 | "_sqldf" 329 | ] 330 | }, 331 | "pythonIndentUnit": 4 332 | }, 333 | "notebookName": "databricks_cost_slack", 334 | "widgets": { 335 | "DAY_COUNT": { 336 | "currentValue": "1", 337 | "nuid": "7d1b9b75-b7db-4c39-ad5f-16c884db15c9", 338 | "typedWidgetInfo": { 339 | "autoCreated": false, 340 | "defaultValue": "2", 341 | "label": null, 342 | "name": "DAY_COUNT", 343 | "options": { 344 | "widgetDisplayType": "Text", 345 | "validationRegex": null 346 | }, 347 | "parameterDataType": "String" 348 | }, 349 | "widgetInfo": { 350 | "widgetType": "text", 351 | "defaultValue": "2", 352 | "label": null, 353 | "name": "DAY_COUNT", 354 | "options": { 355 | "widgetType": "text", 356 | "autoCreated": null, 357 | "validationRegex": null 358 | } 359 | } 360 | }, 361 | "SLACK_WEBHOOK": { 362 | "currentValue": "", 363 | "nuid": "9a846535-f013-40e2-835c-8f8a29da9807", 364 | "typedWidgetInfo": { 365 | "autoCreated": false, 366 | "defaultValue": "", 367 | "label": null, 368 | "name": "SLACK_WEBHOOK", 369 | "options": { 370 | "widgetDisplayType": "Text", 371 | "validationRegex": null 372 | }, 373 | 
"parameterDataType": "String" 374 | }, 375 | "widgetInfo": { 376 | "widgetType": "text", 377 | "defaultValue": "", 378 | "label": null, 379 | "name": "SLACK_WEBHOOK", 380 | "options": { 381 | "widgetType": "text", 382 | "autoCreated": null, 383 | "validationRegex": null 384 | } 385 | } 386 | } 387 | } 388 | } 389 | }, 390 | "nbformat": 4, 391 | "nbformat_minor": 0 392 | } 393 | -------------------------------------------------------------------------------- /notebooks/databricks_daily_cost_slack/databricks_daily_cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/databricks_daily_cost_slack/databricks_daily_cost.png -------------------------------------------------------------------------------- /notebooks/delta_docs_pydantic/README.md: -------------------------------------------------------------------------------- 1 |

# Update Metadata for Delta Lake using Pydantic Data Models 📑

[Badges: Databricks · Delta · Pydantic]

7 | 8 | ## Introduction 9 | 10 | This notebook is used to update metadata of an existing table in Delta Lake using the table's equivalent Pydantic data model. This is especially useful if you have data from an application that flows into Delta and happens to already have dataclasses or datamodels that define the schema of raw data. 11 | 12 | > Note that updating Delta table metadata is highly dependent on the Pydantic models already being pre-defined with `tags` and `description` per `Field` 13 | 14 | ## Use Cases 15 | 16 | The Delta Lake Table metadata updater is a helpful tool with below use cases: 17 | 18 | 1. **Metadata Enrichment**: Enhance the quality of your data by adding descriptions and tags to your table columns. 19 | 20 | 2. **Automated Documentation**: Save time and effort by automatically generating metadata based on your Pydantic data models. 21 | 22 | 3. **Consistency and Quality**: Ensure consistent metadata across your Delta Lake tables. 23 | 24 | --- 25 | See more details in the notebook (ipynb) 26 | -------------------------------------------------------------------------------- /notebooks/delta_docs_pydantic/delta_docs_pydantic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Update Delta Lake Documentation with Pydantic Data Models 📑" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "application/vnd.databricks.v1+cell": { 14 | "cellMetadata": {}, 15 | "inputWidgets": {}, 16 | "nuid": "7807f803-ee1d-4e2c-9ddb-9284a3df60d6", 17 | "showTitle": false, 18 | "title": "" 19 | } 20 | }, 21 | "source": [ 22 | "## Requirements\n", 23 | "\n", 24 | "### Databricks\n", 25 | "* A Databricks Workspace & Workspace Access Token\n", 26 | "* At least one runnable cluster within the workspace\n", 27 | "* Workspace attached to a metastore for Delta Lake\n", 28 | "* Access to one or more Pydantic model classes, or instances of those classes\n", 29 | " \n", 30 | "### Packages\n", 31 | "`pandas` for data manipulation and `pydantic` for data modeling.\n", 32 | "\n", 33 | "* `pandas < 2.0`\n", 34 | "* `pydantic < 1.11`\n", 35 | "\n", 36 | "### Delta Table\n", 37 | "The table whose column description and tags you want to write / update needs to already exist in your delta lake\n", 38 | "\n", 39 | "### Infra\n", 40 | "A cluster is required to be running on the Databricks workspace from where the Delta lake will be accessed. 
This cluster will behave as an intermediary to accept connections and data from outside Databricks and add the data into Delta lake.\n", 41 | "\n", 42 | "> In order to add data to Unity catalog, the cluster must be configured to access Unity Catalog.\n", 43 | "\n", 44 | "![](../pandas_delta/assets/unity_catalog_cluster.png)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "application/vnd.databricks.v1+cell": { 51 | "cellMetadata": {}, 52 | "inputWidgets": {}, 53 | "nuid": "256cfca3-dd69-4b03-b0d3-07504f2ed67a", 54 | "showTitle": false, 55 | "title": "" 56 | } 57 | }, 58 | "source": [ 59 | "## Imports" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "application/vnd.databricks.v1+cell": { 67 | "cellMetadata": { 68 | "byteLimit": 2048000, 69 | "rowLimit": 10000 70 | }, 71 | "inputWidgets": {}, 72 | "nuid": "77998633-02bb-4a93-9e3b-f59273ce2d50", 73 | "showTitle": false, 74 | "title": "" 75 | } 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "from pydantic import BaseModel\n", 80 | "import pandas as pd" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "application/vnd.databricks.v1+cell": { 87 | "cellMetadata": {}, 88 | "inputWidgets": {}, 89 | "nuid": "318bb8c0-456a-41a1-950e-bbd7ab706a12", 90 | "showTitle": false, 91 | "title": "" 92 | } 93 | }, 94 | "source": [ 95 | "## Inputs" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "application/vnd.databricks.v1+cell": { 103 | "cellMetadata": { 104 | "byteLimit": 2048000, 105 | "rowLimit": 10000 106 | }, 107 | "inputWidgets": {}, 108 | "nuid": "46c7d8e8-0cd5-46a1-9e92-a49afc97da08", 109 | "showTitle": false, 110 | "title": "" 111 | } 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "dbutils.widgets.removeAll()\n", 116 | "\n", 117 | "dbutils.widgets.text(\"catalog\", \"\")\n", 118 | "catalog: str = getArgument(\"catalog\")\n", 119 | "\n", 120 | "dbutils.widgets.text(\"schema\", \"\")\n", 121 | "schema: str = getArgument(\"schema\")\n", 122 | "\n", 123 | "dbutils.widgets.text(\"table\", \"\")\n", 124 | "table: str = getArgument(\"table\")" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "\n", 132 | "\n", 133 | "## Steps 📊\n", 134 | "\n", 135 | "### 1. Input Pydantic Data Model 📝\n", 136 | "\n", 137 | "Initialize your pydantic data model which inherits from pydantic `BaseModel` where you have declared all the column descriptions and tags.\n", 138 | "\n", 139 | "### 2. Convert the Pydantic data model to a dataframe 🚀\n", 140 | "\n", 141 | "Next we convert the data model into a dataframe containing the relevant fields, making it easier to retrieve the needed data.\n", 142 | "\n", 143 | "\n", 144 | "### 3. Update Delta Lake Table 🔄\n", 145 | "\n", 146 | "Once you are satisfied with the inferred metadata, apply the updates to your Delta Lake table, and it will be enriched with the new descriptions and tags." 
147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "application/vnd.databricks.v1+cell": { 153 | "cellMetadata": {}, 154 | "inputWidgets": {}, 155 | "nuid": "e1357b26-0eb2-4206-877b-b33e4d253d93", 156 | "showTitle": false, 157 | "title": "" 158 | } 159 | }, 160 | "source": [ 161 | "## Create your pydantic data model class\n", 162 | "#### Example:\n", 163 | "```python\n", 164 | "class pydantic_data_model(BaseModel):\n", 165 | " column_1: str = Field(\n", 166 | " ...,\n", 167 | " title=\"Column One\",\n", 168 | " description=\"The is column one\",\n", 169 | " tags=[\"test_tag_1\"],\n", 170 | " )\n", 171 | " column_2: str = Field(\n", 172 | " ...,\n", 173 | " title=\"Column Two\",\n", 174 | " description=\"The is column two\",\n", 175 | " tags=[\"test_tag_2\"],\n", 176 | " )\n", 177 | " column_3: str = Field(\n", 178 | " ...,\n", 179 | " title=\"Column Three\",\n", 180 | " description=\"The is column three\",\n", 181 | " tags=[\"test_tag_3\"],\n", 182 | " )\n", 183 | "\n", 184 | "```" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "application/vnd.databricks.v1+cell": { 192 | "cellMetadata": { 193 | "byteLimit": 2048000, 194 | "rowLimit": 10000 195 | }, 196 | "inputWidgets": {}, 197 | "nuid": "bdda71da-98a8-432b-8653-21a62a91be56", 198 | "showTitle": false, 199 | "title": "" 200 | } 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "#initialize your pydantic datamodel class here with the class name as pydantic_date_model which inherits from BaseModel\n", 205 | "pydantic_data_model = None" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "application/vnd.databricks.v1+cell": { 212 | "cellMetadata": {}, 213 | "inputWidgets": {}, 214 | "nuid": "008093c5-05e9-4ad4-aca6-533513d97ed4", 215 | "showTitle": false, 216 | "title": "" 217 | } 218 | }, 219 | "source": [ 220 | "## Parse the data model \n", 221 | "#### Convert the declared data model to a data frame containing the needed info" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "application/vnd.databricks.v1+cell": { 229 | "cellMetadata": {}, 230 | "inputWidgets": {}, 231 | "nuid": "d8f2e2b1-b113-455e-9dcd-b41169fc384f", 232 | "showTitle": false, 233 | "title": "" 234 | } 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "def create_data_dictionary(model: type[BaseModel]) -> pd.DataFrame:\n", 239 | " \"\"\"Describe the fields of a pydantic model as a pandas DataFrame.\n", 240 | "\n", 241 | " Args:\n", 242 | " model (Type[BaseModel]): A pydantic model.\n", 243 | "\n", 244 | " Returns:\n", 245 | " pd.DataFrame: A pandas DataFrame describing the model.\n", 246 | " \"\"\"\n", 247 | " return pd.DataFrame(\n", 248 | " [\n", 249 | " {\n", 250 | " \"field_name\": field,\n", 251 | " \"field_title\": field_mf.field_info.title,\n", 252 | " \"python_type\": field_type\n", 253 | " if \"Workspace Migration ✈️ 2 |

3 | [Badge: Databricks] 4 |

5 | 6 | ## Introduction 7 | 8 | This notebook is used to migrate clusters and workflows from one workspace to another using the Databricks REST API. It works by fetching the current cluster / workflow configs and then using it to create the same in a new workspace. 9 | 10 | ## Use Cases 11 | 12 | Areas where such a notebook may be helpful: 13 | 14 | 1. **Migrating clusters and workflows to a new workspace**: This is the obvious use case, and the notebook would be particularly useful for large or complex workspaces, where migrating everything manually would be time-consuming and error-prone. 15 | 2. **Creating a new workspace from scratch**: The notebook could be used to quickly create a new workspace with the same clusters and workflows as an existing workspace. This could be useful for creating a development or staging environment, or for creating a new workspace for a specific project or team. 16 | 17 | --- 18 | See more details in the notebook (ipynb) 19 | -------------------------------------------------------------------------------- /notebooks/migrate_workspace/migrate_workspace.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "d08046ea-5af6-4d2e-9bfb-483adbd72f55", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Workspace Migration ✈️\n", 16 | "\n", 17 | "## Requirements\n", 18 | "\n", 19 | "### Databricks\n", 20 | "\n", 21 | "* Two Databricks Workspaces & Workspace Access Tokens for the same\n", 22 | "* At least one runnable cluster within any workspace\n", 23 | "\n", 24 | "> Note: The word `job` and `wokflow` is used interchangeably throughout " 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "application/vnd.databricks.v1+cell": { 32 | "cellMetadata": { 33 | "byteLimit": 2048000, 34 | "rowLimit": 10000 35 | }, 36 | "inputWidgets": {}, 37 | "nuid": "aae8da7c-d1f1-4f52-ac7e-28c1b1e686e2", 38 | "showTitle": false, 39 | "title": "" 40 | } 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "import json\n", 45 | "\n", 46 | "import requests\n", 47 | "from typing import Optional, Callable" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "application/vnd.databricks.v1+cell": { 54 | "cellMetadata": { 55 | "byteLimit": 2048000, 56 | "rowLimit": 10000 57 | }, 58 | "inputWidgets": {}, 59 | "nuid": "bef603a0-00df-4319-b7eb-0ff8f3bbbeb9", 60 | "showTitle": false, 61 | "title": "" 62 | } 63 | }, 64 | "source": [ 65 | "## Steps 📊\n", 66 | "\n", 67 | "\n", 68 | "### 1. Fetch workflow / cluster configurations 📬\n", 69 | "\n", 70 | "We fetch all the workflows/clusters present in your workspace, each fetched workflow config will also contain the individual task config present in the workflow and their respective job cluster configs. \n", 71 | "\n", 72 | "### 2. Parse Information 🧩\n", 73 | "\n", 74 | "In this step we parse the obtained config info. The main thing to keep in mind is that the cluster config contains some fields which are populated after the cluster is initialized but will be fetched anyway from step 1, we need to remove this field or else when we use the same config to create the workflow later it will throw an error. You can also add any custom logic here. 
For example: You can include webhook notification ID to be associated with a workflow you like, You can also associate an existing all-purpose-compute to a workflow that you want, etc. \n", 75 | "\n", 76 | "### 3. Create new workflow / config 👶🏽\n", 77 | "\n", 78 | "Using the parsed info we create workflows/clusters in the new workspace.\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "application/vnd.databricks.v1+cell": { 85 | "cellMetadata": {}, 86 | "inputWidgets": {}, 87 | "nuid": "4f517d5d-b817-4e75-85d6-5de5b317bbf9", 88 | "showTitle": false, 89 | "title": "" 90 | } 91 | }, 92 | "source": [ 93 | "### Set up workspace urls and access tokens\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "application/vnd.databricks.v1+cell": { 101 | "cellMetadata": { 102 | "byteLimit": 2048000, 103 | "rowLimit": 10000 104 | }, 105 | "inputWidgets": {}, 106 | "nuid": "d2688c9e-f89c-4b7b-b84c-913016163080", 107 | "showTitle": false, 108 | "title": "" 109 | } 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "dbutils.widgets.removeAll()\n", 114 | "\n", 115 | "dbutils.widgets.text(\"old_workspace_url\", \"\")\n", 116 | "old_workspace_url: str = getArgument(\"old_workspace_url\")\n", 117 | "\n", 118 | "dbutils.widgets.text(\"old_workspace_token\", \"\")\n", 119 | "old_workspace_token: str = getArgument(\"old_workspace_token\")\n", 120 | "\n", 121 | "dbutils.widgets.text(\"new_workspace_url\", \"\")\n", 122 | "new_workspace_url: str = getArgument(\"new_workspace_url\")\n", 123 | "\n", 124 | "dbutils.widgets.text(\"new_workspace_token\", \"\")\n", 125 | "new_workspace_token: str = getArgument(\"new_workspace_token\")\n", 126 | "\n", 127 | "\n", 128 | "query_params = {\n", 129 | " \"LIST_JOBS_LIMIT\": 100, # max limit\n", 130 | " \"EXPAND_TASKS\": \"true\", # provides the complete config info for each job\n", 131 | "}" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "application/vnd.databricks.v1+cell": { 139 | "cellMetadata": { 140 | "byteLimit": 2048000, 141 | "rowLimit": 10000 142 | }, 143 | "inputWidgets": {}, 144 | "nuid": "3dae11b4-39ad-4229-9c06-de60b96a770f", 145 | "showTitle": false, 146 | "title": "" 147 | } 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "def paginate(\n", 152 | " can_paginate: bool,\n", 153 | " next_page_token: Optional[str],\n", 154 | " url: str,\n", 155 | " workspace_token: str,\n", 156 | " function_to_call: Callable,\n", 157 | ") -> None:\n", 158 | " \"\"\"\n", 159 | " Paginates to the next page if possible\n", 160 | " input:\n", 161 | " can_paginate [bool]: Boolean info about wheather there is additional info.\n", 162 | " next_page_token [str]: Token needed in url query param to paginate to next page.\n", 163 | " url [str]: Url used to list the needed info.\n", 164 | " function_to_call [Callable]: Function that gets called with the paginated url to paginate further.\n", 165 | " output:\n", 166 | " None\n", 167 | " \"\"\"\n", 168 | "\n", 169 | " if next_page_token and can_paginate:\n", 170 | " if \"&page_token\" in url:\n", 171 | " url = f\"{url[:url.find('&page_token')]}&page_token={next_page_token}\"\n", 172 | " else:\n", 173 | " url = f\"{url}&page_token={next_page_token}\"\n", 174 | "\n", 175 | " function_to_call(url, workspace_token)\n", 176 | " else:\n", 177 | " return" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "application/vnd.databricks.v1+cell": { 184 | "cellMetadata": 
{}, 185 | "inputWidgets": {}, 186 | "nuid": "3a11f70f-5c0d-41b6-92f7-74684f5a606a", 187 | "showTitle": false, 188 | "title": "" 189 | } 190 | }, 191 | "source": [ 192 | "## List Clusters \n", 193 | "#### Fetches all clusters in current workspace and its respective configs\n", 194 | "API Docs\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "application/vnd.databricks.v1+cell": { 202 | "cellMetadata": { 203 | "byteLimit": 2048000, 204 | "rowLimit": 10000 205 | }, 206 | "inputWidgets": {}, 207 | "nuid": "9dece1e7-42d3-437a-a375-f56cc87b9074", 208 | "showTitle": false, 209 | "title": "" 210 | } 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "def getAllClusters(list_clusters_url: str, workspace_token: str) -> None:\n", 215 | " \"\"\"\n", 216 | " Fetches all the clusters and metadata about them.\n", 217 | " input:\n", 218 | " list_clusters_url [str]: Databricks API used to fetch all the clusters.\n", 219 | " workspace_token [str]: Databricks workspace access token.\n", 220 | " output:\n", 221 | " None\n", 222 | " \"\"\"\n", 223 | "\n", 224 | " response = requests.get(\n", 225 | " list_clusters_url,\n", 226 | " headers={\"Authorization\": f\"Bearer {workspace_token}\"},\n", 227 | " )\n", 228 | " assert response.status_code == 200\n", 229 | "\n", 230 | " response_data = response.json()\n", 231 | "\n", 232 | " for cluster_info in response_data.get(\"clusters\", []):\n", 233 | " clusters.append(cluster_info)\n", 234 | "\n", 235 | " paginate(\n", 236 | " response_data.get(\"has_more\", False),\n", 237 | " response_data.get(\"next_page_token\"),\n", 238 | " list_clusters_url,\n", 239 | " workspace_token,\n", 240 | " getAllClusters,\n", 241 | " )\n", 242 | "\n", 243 | "\n", 244 | "clusters = [] # holds all cluster' info\n", 245 | "List_clusters_url = str(old_workspace_url + \"/api/2.0/clusters/list\")\n", 246 | "getAllClusters(List_clusters_url, old_workspace_token)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "application/vnd.databricks.v1+cell": { 253 | "cellMetadata": {}, 254 | "inputWidgets": {}, 255 | "nuid": "804512f8-b360-4ca3-8d85-6f63a8ae235d", 256 | "showTitle": false, 257 | "title": "" 258 | } 259 | }, 260 | "source": [ 261 | "## Filter and Parse info" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "application/vnd.databricks.v1+cell": { 269 | "cellMetadata": { 270 | "byteLimit": 2048000, 271 | "rowLimit": 10000 272 | }, 273 | "inputWidgets": {}, 274 | "nuid": "8134e80d-8207-45ca-adab-9314791403ce", 275 | "showTitle": false, 276 | "title": "" 277 | } 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "def filterClusters(cluster_info: dict) -> bool:\n", 282 | " \"\"\"Filter clusters based on custom logic\"\"\"\n", 283 | " return True\n", 284 | "\n", 285 | "\n", 286 | "def parseClusters(cluster_info: dict) -> dict:\n", 287 | " \"\"\"Modefies the cluster config.\n", 288 | " input:\n", 289 | " cluster_info [dict]: Dict containing all the config info about the cluster.\n", 290 | " output:\n", 291 | " dict : parsed result in accordance with the `create cluster` api payload.\"\"\"\n", 292 | " if cluster_info.get(\"aws_attributes\"):\n", 293 | " cluster_info.pop(\"aws_attributes\")\n", 294 | " if cluster_info.get(\"cluster_id\"):\n", 295 | " cluster_info.pop(\"cluster_id\")\n", 296 | "\n", 297 | " # add more custom parsing logic if needed\n", 298 | " return cluster_info\n", 299 | "\n", 300 | "\n", 301 | "filtered_clusters = 
[]\n", 302 | "\n", 303 | "# filter\n", 304 | "for cluster_info in clusters:\n", 305 | " if filterClusters(cluster_info):\n", 306 | " filtered_clusters.append(cluster_info)\n", 307 | "\n", 308 | "# parse\n", 309 | "for idx in range(len(filtered_clusters)):\n", 310 | " cluster_info = filtered_clusters[idx]\n", 311 | " parsed_cluster_info = parseClusters(cluster_info)\n", 312 | " filtered_clusters[idx] = parsed_cluster_info\n", 313 | "\n", 314 | "clusters = filtered_clusters" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": { 320 | "application/vnd.databricks.v1+cell": { 321 | "cellMetadata": { 322 | "byteLimit": 2048000, 323 | "rowLimit": 10000 324 | }, 325 | "inputWidgets": {}, 326 | "nuid": "1f5dc0e5-e714-43bb-a014-7e55a301119d", 327 | "showTitle": false, 328 | "title": "" 329 | } 330 | }, 331 | "source": [ 332 | "## Create new cluster\n", 333 | "#### Use the parsed info as payload to create clusters in the new workspace\n", 334 | "API Docs\n", 335 | "\n" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "application/vnd.databricks.v1+cell": { 343 | "cellMetadata": {}, 344 | "inputWidgets": {}, 345 | "nuid": "97b639a3-6bd6-4132-9c3b-4802ca5cf73b", 346 | "showTitle": false, 347 | "title": "" 348 | } 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "for cluster_info in clusters:\n", 353 | " response = requests.post(\n", 354 | " f\"{new_workspace_url}/api/2.0/clusters/create\",\n", 355 | " headers={\n", 356 | " \"Content-Type\": \"application/json\",\n", 357 | " \"Authorization\": f\"Bearer {new_workspace_token}\",\n", 358 | " },\n", 359 | " data=json.dumps(cluster_info),\n", 360 | " )\n", 361 | " assert response.status_code in {\n", 362 | " 200,\n", 363 | " 201,\n", 364 | " }" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "application/vnd.databricks.v1+cell": { 371 | "cellMetadata": {}, 372 | "inputWidgets": {}, 373 | "nuid": "1383ed56-7a81-4e91-ae46-a45f59ee65d9", 374 | "showTitle": false, 375 | "title": "" 376 | } 377 | }, 378 | "source": [ 379 | "## List Workflows \n", 380 | "#### Fetches all workflows in current workspace and its respective configs\n", 381 | "API Docs\n" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "application/vnd.databricks.v1+cell": { 389 | "cellMetadata": {}, 390 | "inputWidgets": {}, 391 | "nuid": "3ed71bea-8300-4c9a-9601-ed22e50b406c", 392 | "showTitle": false, 393 | "title": "" 394 | } 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "def getAllJobs(list_jobs_url: str, workspace_token: str) -> None:\n", 399 | " \"\"\"\n", 400 | " Fetches all the jobs and metadata about them.\n", 401 | " input:\n", 402 | " lists_jobs_url [str]: Databricks API used to fetch all the jobs.\n", 403 | " workspace_token [str]: Databricks workspace access token.\n", 404 | " output:\n", 405 | " None\n", 406 | " \"\"\"\n", 407 | "\n", 408 | " response = requests.get(\n", 409 | " list_jobs_url,\n", 410 | " headers={\"Authorization\": f\"Bearer {workspace_token}\"},\n", 411 | " )\n", 412 | " assert response.status_code == 200\n", 413 | "\n", 414 | " response_data = response.json()\n", 415 | "\n", 416 | " for job in response_data.get(\"jobs\", []):\n", 417 | " jobs.append(job.get(\"settings\"))\n", 418 | "\n", 419 | " paginate(\n", 420 | " response_data.get(\"has_more\", False),\n", 421 | " response_data.get(\"next_page_token\"),\n", 422 | " list_jobs_url,\n", 423 | " workspace_token,\n", 424 | " 
getAllJobs,\n", 425 | " )\n", 426 | "\n", 427 | "\n", 428 | "jobs = [] # holds all jobs' info\n", 429 | "List_jobs_url = str(\n", 430 | " old_workspace_url\n", 431 | " + \"/api/2.1/jobs/list?\"\n", 432 | " + f\"limit={query_params.get('LIST_JOBS_LIMIT')}\"\n", 433 | " + f\"&expand_tasks={query_params.get('EXPAND_TASKS')}\"\n", 434 | ")\n", 435 | "getAllJobs(List_jobs_url, old_workspace_token)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": { 441 | "application/vnd.databricks.v1+cell": { 442 | "cellMetadata": {}, 443 | "inputWidgets": {}, 444 | "nuid": "26d699ee-6076-49aa-bd54-8a5d918936b3", 445 | "showTitle": false, 446 | "title": "" 447 | } 448 | }, 449 | "source": [ 450 | "## Filter and Parse info\n", 451 | "#### Some of the parsing we can do \n", 452 | "1. You can add new webhook notif ID \n", 453 | "2. Tag an existing all-prupose compute to the workflow \n", 454 | "3. Tag an existing task if the new task (from the workflow) depends on it" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": { 461 | "application/vnd.databricks.v1+cell": { 462 | "cellMetadata": {}, 463 | "inputWidgets": {}, 464 | "nuid": "7f8252ae-94e8-4513-9e60-a4b4b62d2054", 465 | "showTitle": false, 466 | "title": "" 467 | } 468 | }, 469 | "outputs": [], 470 | "source": [ 471 | "def filterWorkflows(workflow_info: dict) -> bool:\n", 472 | " \"\"\"Filter Workflow based on custom logic\"\"\"\n", 473 | " return True\n", 474 | "\n", 475 | "\n", 476 | "def parseWorkflows(workflow_info: dict) -> dict:\n", 477 | " \"\"\"Modefies the workflow config.\n", 478 | " input:\n", 479 | " workflow_info [dict]: Dict containing all the config info about the workflow.\n", 480 | " output:\n", 481 | " dict : parsed result in accordance with the `create job` api payload.\"\"\"\n", 482 | " for cluster_info in workflow_info.get(\n", 483 | " \"job_clusters\", []\n", 484 | " ): # below parsing is same for cluster config payload too.\n", 485 | " if \"aws_attributes\" in cluster_info.get(\"new_cluster\"):\n", 486 | " cluster_info.get(\"new_cluster\").pop(\"aws_attributes\")\n", 487 | "\n", 488 | " # add more custom parsing logic if needed\n", 489 | " return workflow_info\n", 490 | "\n", 491 | "\n", 492 | "filtered_jobs = []\n", 493 | "\n", 494 | "# filter\n", 495 | "for workflow_info in jobs:\n", 496 | " if filterWorkflows(workflow_info):\n", 497 | " filtered_jobs.append(workflow_info)\n", 498 | "\n", 499 | "# parse\n", 500 | "for idx in range(len(filtered_jobs)):\n", 501 | " workflow_info = filtered_jobs[idx]\n", 502 | " parsed_workflow_info = parseWorkflows(workflow_info)\n", 503 | " filtered_jobs[idx] = parsed_workflow_info\n", 504 | "\n", 505 | "jobs = filtered_jobs" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": { 511 | "application/vnd.databricks.v1+cell": { 512 | "cellMetadata": { 513 | "byteLimit": 2048000, 514 | "rowLimit": 10000 515 | }, 516 | "inputWidgets": {}, 517 | "nuid": "61459da2-bf71-48eb-9bde-0be70462aa6d", 518 | "showTitle": false, 519 | "title": "" 520 | } 521 | }, 522 | "source": [ 523 | "## Create Workflow\n", 524 | "#### Use the parsed info to create workflow in new workspace\n", 525 | "API Docs\n" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": { 532 | "application/vnd.databricks.v1+cell": { 533 | "cellMetadata": {}, 534 | "inputWidgets": {}, 535 | "nuid": "106ec485-e5a3-4dff-a5a9-6e01a68e68b1", 536 | "showTitle": false, 537 | "title": "" 538 | } 539 | }, 540 | 
"outputs": [], 541 | "source": [ 542 | "for workflow_info in jobs:\n", 543 | " response = requests.post(\n", 544 | " url=f\"{new_workspace_url}/api/2.1/jobs/create\",\n", 545 | " headers={\n", 546 | " \"Content-Type\": \"application/json\",\n", 547 | " \"Authorization\": f\"Bearer {new_workspace_token}\",\n", 548 | " },\n", 549 | " data=json.dumps(workflow_info),\n", 550 | " )\n", 551 | " assert response.status_code in {\n", 552 | " 200,\n", 553 | " 201,\n", 554 | " }" 555 | ] 556 | } 557 | ], 558 | "metadata": { 559 | "application/vnd.databricks.v1+notebook": { 560 | "dashboards": [], 561 | "language": "python", 562 | "notebookMetadata": { 563 | "pythonIndentUnit": 4 564 | }, 565 | "notebookName": "migrate_workspace", 566 | "widgets": { 567 | "new_workspace_token": { 568 | "currentValue": "", 569 | "nuid": "47394cf3-4b2e-427e-ab85-7fe7998f33de", 570 | "widgetInfo": { 571 | "defaultValue": "", 572 | "label": null, 573 | "name": "new_workspace_token", 574 | "options": { 575 | "validationRegex": null, 576 | "widgetType": "text" 577 | }, 578 | "widgetType": "text" 579 | } 580 | }, 581 | "new_workspace_url": { 582 | "currentValue": "", 583 | "nuid": "efdcc97f-e245-4c68-bca9-992c5489cc0d", 584 | "widgetInfo": { 585 | "defaultValue": "", 586 | "label": null, 587 | "name": "new_workspace_url", 588 | "options": { 589 | "validationRegex": null, 590 | "widgetType": "text" 591 | }, 592 | "widgetType": "text" 593 | } 594 | }, 595 | "old_workspace_token": { 596 | "currentValue": "", 597 | "nuid": "f0561168-26a1-434e-af57-8b2405d96362", 598 | "widgetInfo": { 599 | "defaultValue": "", 600 | "label": null, 601 | "name": "old_workspace_token", 602 | "options": { 603 | "validationRegex": null, 604 | "widgetType": "text" 605 | }, 606 | "widgetType": "text" 607 | } 608 | }, 609 | "old_workspace_url": { 610 | "currentValue": "", 611 | "nuid": "1dfaf69b-5d6b-4782-bf25-d94605ef9848", 612 | "widgetInfo": { 613 | "defaultValue": "", 614 | "label": null, 615 | "name": "old_workspace_url", 616 | "options": { 617 | "validationRegex": null, 618 | "widgetType": "text" 619 | }, 620 | "widgetType": "text" 621 | } 622 | } 623 | } 624 | }, 625 | "language_info": { 626 | "name": "python" 627 | } 628 | }, 629 | "nbformat": 4, 630 | "nbformat_minor": 0 631 | } 632 | -------------------------------------------------------------------------------- /notebooks/pandas_delta/README.md: -------------------------------------------------------------------------------- 1 | # Delta Lake I/O with Pandas DataFrames 2 | 3 | ![](https://img.shields.io/badge/Databricks-FF3621.svg?style=for-the-badge&logo=Databricks&logoColor=white) 4 | ![](https://img.shields.io/badge/Delta-003366.svg?style=for-the-badge&logo=Delta&logoColor=white) 5 | ![](https://img.shields.io/badge/pandas-150458.svg?style=for-the-badge&logo=pandas&logoColor=white) 6 | 7 | ## Use-Cases 8 | 9 | * Read and write Delta Lake tables into Pandas DataFrames. 10 | * Access Schemas and Catalogs as Pandas DataFrames 11 | * Access Delta lake from external services for table reads and writes 12 | 13 | ## Structure 14 | 15 | A running cluster on a Databricks workspace is required to interface any outside data from pandas DataFrames to mediate access and I/O with Delta lake. 
16 | 17 | 18 | 19 | See more details in the notebook (`ipynb`) 20 | -------------------------------------------------------------------------------- /notebooks/pandas_delta/assets/databricks_sql_python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/pandas_delta/assets/databricks_sql_python.png -------------------------------------------------------------------------------- /notebooks/pandas_delta/assets/unity_catalog_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/pandas_delta/assets/unity_catalog_cluster.png -------------------------------------------------------------------------------- /notebooks/pandas_delta/pandas_delta.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": { 8 | "byteLimit": 2048000, 9 | "rowLimit": 10000 10 | }, 11 | "inputWidgets": {}, 12 | "nuid": "122e3c8d-e602-407a-abb7-a5b521ef7057", 13 | "showTitle": false, 14 | "title": "" 15 | } 16 | }, 17 | "source": [ 18 | "\n", 19 | "# Delta Lake I/O with Pandas Dataframes outside Databricks Environment\n", 20 | "\n", 21 | "![](https://img.shields.io/badge/Databricks-FF3621.svg?style=for-the-badge&logo=Databricks&logoColor=white)\n", 22 | "![](https://img.shields.io/badge/Delta-003366.svg?style=for-the-badge&logo=Delta&logoColor=white)\n", 23 | "![](https://img.shields.io/badge/pandas-150458.svg?style=for-the-badge&logo=pandas&logoColor=white)\n", 24 | "\n", 25 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dotlas/databricks_helpers/blob/main/notebooks/pandas_delta/pandas_delta.ipynb)\n", 26 | "\n", 27 | "\n", 28 | "In this notebook, we showcase some utility functions built on top of existing third-party open source libraries in Python to read or write Pandas Dataframes **from within or outside a Databricks environment into Delta lake on Databricks**. The Delta lake can exist on [Unity Catalog](https://www.databricks.com/product/unity-catalog), or simply be the `hive_metastore` default. \n", 29 | "\n", 30 | "## Requirements\n", 31 | "\n", 32 | "### Databricks\n", 33 | "* A Databricks Workspace & Workspace Access Token\n", 34 | "* At least one runnable cluster within the workspace\n", 35 | "* Workspace attached to a metastore for Delta Lake\n", 36 | "\n", 37 | "### Packages\n", 38 | "\n", 39 | "This process heavily relies on [databricks-sql-python](https://github.com/databricks/databricks-sql-python) library which provides us with a [SQLAlchemy](https://sqlalche.me/) interface to write data. `databricks-sql-python` is an open source Python package maintained by Databricks, and `SQLAlchemy` is used since it is the default ORM wrapper used by the Pandas library\n", 40 | "\n", 41 | "\n", 42 | "* `databricks-sql-connector`\n", 43 | "* `sqlalchemy == 1.4.41`\n", 44 | "* `pandas < 2.0`\n", 45 | "\n", 46 | "### Infra\n", 47 | "\n", 48 | "A cluster is required to be running on the Databricks workspace from where the Delta lake will be accessed. This cluster will behave as an intermediary to accept connections and data from outside Databricks and add the data into Delta lake. 
\n", 49 | "\n", 50 | "> In order to add data to Unity catalog, the cluster must be configured to access `Unity Catalog`\n", 51 | "\n", 52 | "![](./assets/unity_catalog_cluster.png)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "application/vnd.databricks.v1+cell": { 60 | "cellMetadata": {}, 61 | "inputWidgets": {}, 62 | "nuid": "86830a84-762c-43c8-98e2-b4b46daf4b80", 63 | "showTitle": false, 64 | "title": "" 65 | } 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "pip install pandas databricks-sql-connector sqlalchemy==1.4.41 -q" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "application/vnd.databricks.v1+cell": { 77 | "cellMetadata": { 78 | "byteLimit": 2048000, 79 | "rowLimit": 10000 80 | }, 81 | "inputWidgets": {}, 82 | "nuid": "88bed7fd-8efa-4b41-bfc0-49d4f1b5f62a", 83 | "showTitle": false, 84 | "title": "" 85 | } 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import os\n", 90 | "\n", 91 | "import pandas as pd\n", 92 | "from sqlalchemy import types as sql_types\n", 93 | "from sqlalchemy import create_engine\n", 94 | "from sqlalchemy.engine import Engine\n", 95 | "\n", 96 | "# databricks imports\n", 97 | "from databricks import sql as databricks_sql" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "application/vnd.databricks.v1+cell": { 104 | "cellMetadata": {}, 105 | "inputWidgets": {}, 106 | "nuid": "434bb481-e475-4241-9da6-667817a1480e", 107 | "showTitle": false, 108 | "title": "" 109 | } 110 | }, 111 | "source": [ 112 | "\n", 113 | "### Setup User Inputs\n", 114 | "\n", 115 | "When running this on Databricks, `CLUSTER HTTP PATH` and `WORKSPACE HOSTNAME` can be inferred. When running outside Databricks, you need to start a cluster, and then get these values, copy them over to this notebook when it's run externally and use those as parameters\n", 116 | "\n", 117 | "Use `HTTP_PATH` from within the Cluster configuration page for `CLUSTER HTTP PATH` variable like so:\n", 118 | "\n", 119 | "![](https://i.stack.imgur.com/qDotH.png)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": { 125 | "application/vnd.databricks.v1+cell": { 126 | "cellMetadata": {}, 127 | "inputWidgets": {}, 128 | "nuid": "1c35434f-ceb4-424d-b550-06d593681818", 129 | "showTitle": false, 130 | "title": "" 131 | } 132 | }, 133 | "source": [ 134 | "\n", 135 | "**Fill up the values for the 3 parameters within the cell below when running this notebook outside a Databricks environment**" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "application/vnd.databricks.v1+cell": { 143 | "cellMetadata": { 144 | "byteLimit": 2048000, 145 | "rowLimit": 10000 146 | }, 147 | "inputWidgets": {}, 148 | "nuid": "18996dc5-875a-4347-816b-770b8e385597", 149 | "showTitle": false, 150 | "title": "" 151 | } 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "# Check if notebook is running inside databricks environment\n", 156 | "DATABRICKS_ENV = any(\"SPARK\" in k for k in os.environ)\n", 157 | "\n", 158 | "if DATABRICKS_ENV:\n", 159 | " dbutils.widgets.removeAll()\n", 160 | " dbutils.widgets.text(\"WORKSPACE ACCESS TOKEN\", \"\")\n", 161 | " dbutils.widgets.text(\"WORKSPACE HOSTNAME\", \"\")\n", 162 | " dbutils.widgets.text(\"CLUSTER HTTP PATH\", \"\")\n", 163 | "\n", 164 | "# INPUT VALUES HERE\n", 165 | "\n", 166 | "# The workspace access token. 
Usually of the form *******\n", 167 | "databricks_workspace_access_token: str = (\n", 168 | " getArgument(\"WORKSPACE ACCESS TOKEN\")\n", 169 | " if DATABRICKS_ENV\n", 170 | " else \"\"\n", 171 | ")\n", 172 | "\n", 173 | "# server hostname like dbc-xxxx.cloud.databricks.com\n", 174 | "# do not prefix with https:// or add a / at the end\n", 175 | "databricks_server_hostname: str = (\n", 176 | " getArgument(\"WORKSPACE HOSTNAME\")\n", 177 | " if DATABRICKS_ENV\n", 178 | " else \"\"\n", 179 | ")\n", 180 | "\n", 181 | "# the http path from the cluster configuration -> JDBC/ODBC tab\n", 182 | "databricks_cluster_jdbc_http_path: str = (\n", 183 | " getArgument(\"CLUSTER HTTP PATH\")\n", 184 | " if DATABRICKS_ENV\n", 185 | " else \"\"\n", 186 | ")" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "application/vnd.databricks.v1+cell": { 193 | "cellMetadata": {}, 194 | "inputWidgets": {}, 195 | "nuid": "1bc3bd88-ceb0-4a2c-8276-79ca5f6ce471", 196 | "showTitle": false, 197 | "title": "" 198 | } 199 | }, 200 | "source": [ 201 | "\n", 202 | "### Infer & Assert Inputs" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "application/vnd.databricks.v1+cell": { 210 | "cellMetadata": { 211 | "byteLimit": 2048000, 212 | "rowLimit": 10000 213 | }, 214 | "inputWidgets": {}, 215 | "nuid": "66eae1b2-0c3b-4be2-a1d8-b5a29a601f93", 216 | "showTitle": false, 217 | "title": "" 218 | } 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "if DATABRICKS_ENV:\n", 223 | " # if notebook is running on databricks environment, then infer parameters\n", 224 | " if not databricks_cluster_jdbc_http_path:\n", 225 | " # spark works without imports within databricks environment\n", 226 | " cluster_id: str = spark.conf.get(\n", 227 | " \"spark.databricks.clusterUsageTags.clusterId\",\n", 228 | " ) # type: ignore\n", 229 | " workspace_id: str = spark.conf.get(\n", 230 | " \"spark.databricks.clusterUsageTags.clusterOwnerOrgId\",\n", 231 | " ) # type: ignore\n", 232 | " databricks_cluster_jdbc_http_path = (\n", 233 | " f\"sql/protocolv1/o/{workspace_id}/{cluster_id}\"\n", 234 | " )\n", 235 | "\n", 236 | " if not databricks_server_hostname:\n", 237 | " databricks_server_hostname = spark.conf.get(\"spark.databricks.workspaceUrl\")\n", 238 | "\n", 239 | "assert databricks_workspace_access_token, \"Databricks Workspace Access Token Missing\"\n", 240 | "assert databricks_server_hostname, \"Databricks Hostname Missing\"\n", 241 | "assert databricks_cluster_jdbc_http_path, \"Cluster JDBC path Missing\"" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": { 247 | "application/vnd.databricks.v1+cell": { 248 | "cellMetadata": {}, 249 | "inputWidgets": {}, 250 | "nuid": "bf44e023-6416-42ec-9b7e-cbe1e771518a", 251 | "showTitle": false, 252 | "title": "" 253 | } 254 | }, 255 | "source": [ 256 | "\n", 257 | "### Setup Connection\n", 258 | "\n", 259 | "We will create a SQLAlchemy engine using the credentials required to connect to the cluster and workspace" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "application/vnd.databricks.v1+cell": { 267 | "cellMetadata": { 268 | "byteLimit": 2048000, 269 | "rowLimit": 10000 270 | }, 271 | "inputWidgets": {}, 272 | "nuid": "65a854e9-a983-49c9-9189-86508b083fe1", 273 | "showTitle": false, 274 | "title": "" 275 | } 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "databricks_sqlalchemy_url: str = (\n", 280 | " 
\"databricks://token:\"\n", 281 | " + databricks_workspace_access_token\n", 282 | " + \"@\"\n", 283 | " + databricks_server_hostname\n", 284 | " + \"?http_path=\"\n", 285 | " + databricks_cluster_jdbc_http_path\n", 286 | ")\n", 287 | "\n", 288 | "databricks_alch_engine: Engine = create_engine(databricks_sqlalchemy_url)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "application/vnd.databricks.v1+cell": { 295 | "cellMetadata": {}, 296 | "inputWidgets": {}, 297 | "nuid": "42369035-2002-40f4-b95d-767c0879c29d", 298 | "showTitle": false, 299 | "title": "" 300 | } 301 | }, 302 | "source": [ 303 | "\n", 304 | "Verify that the connection works by listing catalogs on Databricks" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "application/vnd.databricks.v1+cell": { 312 | "cellMetadata": { 313 | "byteLimit": 2048000, 314 | "rowLimit": 10000 315 | }, 316 | "inputWidgets": {}, 317 | "nuid": "1decf93d-be28-4a59-bec7-e11a2ea19b38", 318 | "showTitle": false, 319 | "title": "" 320 | } 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "catalogs = pd.read_sql(\"show catalogs\", databricks_alch_engine)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "application/vnd.databricks.v1+cell": { 331 | "cellMetadata": {}, 332 | "inputWidgets": {}, 333 | "nuid": "59dc2781-b6ae-4725-ab2f-cd65f951cc47", 334 | "showTitle": false, 335 | "title": "" 336 | } 337 | }, 338 | "source": [ 339 | "\n", 340 | "### Run Queries" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "application/vnd.databricks.v1+cell": { 348 | "cellMetadata": { 349 | "byteLimit": 2048000, 350 | "rowLimit": 10000 351 | }, 352 | "inputWidgets": {}, 353 | "nuid": "b61812d3-395e-480f-8a62-20b06ab6e797", 354 | "showTitle": false, 355 | "title": "" 356 | } 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "catalog_name: str = \"samples\"\n", 361 | "schema_name: str = \"nyctaxi\"\n", 362 | "table_name: str = \"trips\"" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "application/vnd.databricks.v1+cell": { 370 | "cellMetadata": { 371 | "byteLimit": 2048000, 372 | "rowLimit": 10000 373 | }, 374 | "inputWidgets": {}, 375 | "nuid": "665b0c87-f1a5-4496-868a-22df48f29c7d", 376 | "showTitle": false, 377 | "title": "" 378 | } 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "df: pd.DataFrame = pd.read_sql(\n", 383 | " f\"SELECT * FROM {catalog_name}.{schema_name}.{table_name} limit 100\",\n", 384 | " databricks_alch_engine,\n", 385 | ")\n", 386 | "\n", 387 | "df.head()" 388 | ] 389 | } 390 | ], 391 | "metadata": { 392 | "application/vnd.databricks.v1+notebook": { 393 | "dashboards": [], 394 | "language": "python", 395 | "notebookMetadata": { 396 | "pythonIndentUnit": 4 397 | }, 398 | "notebookName": "pandas_delta", 399 | "widgets": { 400 | "CLUSTER HTTP PATH": { 401 | "currentValue": "", 402 | "nuid": "9b33f01f-e642-41f9-bd6c-f06013c3d6c2", 403 | "widgetInfo": { 404 | "defaultValue": "", 405 | "label": null, 406 | "name": "CLUSTER HTTP PATH", 407 | "options": { 408 | "validationRegex": null, 409 | "widgetType": "text" 410 | }, 411 | "widgetType": "text" 412 | } 413 | }, 414 | "WORKSPACE ACCESS TOKEN": { 415 | "currentValue": "", 416 | "nuid": "4ef011e5-b2e1-4ffc-8fb8-b09907d809a5", 417 | "widgetInfo": { 418 | "defaultValue": "", 419 | "label": null, 420 | "name": "WORKSPACE ACCESS TOKEN", 421 | "options": 
{ 422 | "validationRegex": null, 423 | "widgetType": "text" 424 | }, 425 | "widgetType": "text" 426 | } 427 | }, 428 | "WORKSPACE HOSTNAME": { 429 | "currentValue": "", 430 | "nuid": "c0a80a2b-f1b1-433c-8c44-633a3786b835", 431 | "widgetInfo": { 432 | "defaultValue": "", 433 | "label": null, 434 | "name": "WORKSPACE HOSTNAME", 435 | "options": { 436 | "validationRegex": null, 437 | "widgetType": "text" 438 | }, 439 | "widgetType": "text" 440 | } 441 | } 442 | } 443 | }, 444 | "language_info": { 445 | "name": "python" 446 | } 447 | }, 448 | "nbformat": 4, 449 | "nbformat_minor": 0 450 | } 451 | -------------------------------------------------------------------------------- /notebooks/update_job_cluster/README.md: -------------------------------------------------------------------------------- 1 |

Update Workflows and Clusters ♻️ 2 | 3 | Databricks 4 |
5 | 6 | ## Introduction 7 | 8 | This notebook is used to update clusters and workflows in the current workspace. It works by fetching the current cluster / workflow configs, performing some parsing and finally updating the same in current workspace. 9 | 10 | ## Use Cases 11 | 12 | Areas where such a notebook may be helpful: 13 | 14 | 1. **Cluster management**: The notebook could be used to automate the process of updating clusters, such as changing the cluster size, node type, or Spark version. This could be useful for organizations that need to scale their clusters up or down dynamically, or that need to keep their clusters up to date with the latest Spark releases. 15 | 2. **Workflow management**: The notebook could be used to automate the process of updating workflows, such as adding or removing tasks, changing the order of tasks, or updating the parameters of tasks. This could be useful for organizations that need to make changes to their workflows on a regular basis, or that need to deploy new workflows to production quickly and reliably. 16 | 17 | --- 18 | See more details in the notebook (ipynb) 19 | -------------------------------------------------------------------------------- /notebooks/update_job_cluster/update_job_cluster.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": { 8 | "byteLimit": 2048000, 9 | "rowLimit": 10000 10 | }, 11 | "inputWidgets": {}, 12 | "nuid": "d08046ea-5af6-4d2e-9bfb-483adbd72f55", 13 | "showTitle": false, 14 | "tableResultSettingsMap": {}, 15 | "title": "" 16 | } 17 | }, 18 | "source": [ 19 | "# Update Clusters & Jobs ♻️\n", 20 | "\n", 21 | "## Requirements\n", 22 | "### Databricks\n", 23 | "* A Databricks Workspace & Workspace Access Token\n", 24 | "* At least one runnable cluster within the workspace\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 0, 30 | "metadata": { 31 | "application/vnd.databricks.v1+cell": { 32 | "cellMetadata": { 33 | "byteLimit": 2048000, 34 | "rowLimit": 10000 35 | }, 36 | "inputWidgets": {}, 37 | "nuid": "5905cc9b-8d77-442e-b915-b0d9cf8825c4", 38 | "showTitle": true, 39 | "tableResultSettingsMap": {}, 40 | "title": "Update SDK to latest version" 41 | } 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "!pip install --upgrade databricks-sdk -q\n", 46 | "!pip install loguru -q" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 0, 52 | "metadata": { 53 | "application/vnd.databricks.v1+cell": { 54 | "cellMetadata": { 55 | "byteLimit": 2048000, 56 | "rowLimit": 10000 57 | }, 58 | "inputWidgets": {}, 59 | "nuid": "8ca906cb-450b-4806-8717-94603a670cef", 60 | "showTitle": true, 61 | "tableResultSettingsMap": {}, 62 | "title": "Restart Python" 63 | } 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "dbutils.library.restartPython()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 0, 73 | "metadata": { 74 | "application/vnd.databricks.v1+cell": { 75 | "cellMetadata": { 76 | "byteLimit": 2048000, 77 | "rowLimit": 10000 78 | }, 79 | "inputWidgets": {}, 80 | "nuid": "aae8da7c-d1f1-4f52-ac7e-28c1b1e686e2", 81 | "showTitle": true, 82 | "tableResultSettingsMap": {}, 83 | "title": "Imports" 84 | } 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "from pathlib import Path\n", 89 | "import re\n", 90 | "\n", 91 | "import pandas as pd\n", 92 | "from loguru import logger\n", 93 | "\n", 94 | "from 
databricks.sdk import WorkspaceClient\n", 95 | "from databricks.sdk.service.compute import (\n", 96 | " ClusterDetails,\n", 97 | " UpdateClusterResource,\n", 98 | " ListClustersFilterBy,\n", 99 | " ClusterSource,\n", 100 | " InitScriptInfo,\n", 101 | ")\n", 102 | "from databricks.sdk.service.jobs import Job, JobSettings, BaseJob, JobCluster" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "application/vnd.databricks.v1+cell": { 109 | "cellMetadata": { 110 | "byteLimit": 2048000, 111 | "rowLimit": 10000 112 | }, 113 | "inputWidgets": {}, 114 | "nuid": "5ff0f6ad-d92a-4ce2-94f5-a7d3d572d8e4", 115 | "showTitle": false, 116 | "tableResultSettingsMap": {}, 117 | "title": "" 118 | } 119 | }, 120 | "source": [ 121 | "## Setup\n", 122 | "\n", 123 | "| Parameter Name | Description | Allowed Values |\n", 124 | "| ------------------------------- | ------------------------------------------------------------------------------------------------------------------ | ----------------------------------- |\n", 125 | "| `workspace_host` | The **domain** of the Databricks workspace. | `str` |\n", 126 | "| `workspace_token` | The **token** for accessing the Databricks Workspace API | `str` |\n", 127 | "| `desired_runtime_version` | The desired **Databricks Runtime Version** for the updated clusters/job clusters. | `str` [Eg: `\"15.4\"`] |\n", 128 | "| `init_scripts_dir` | Path to the common **directory with init scripts** on a Unity Catalog **Volume** | `str` |\n", 129 | "| `cluster_init_script_files` | **Filenames** for the scripts to be used when initializing the **clusters**. Use `,` commas to separate files. | `str` [Eg: `\"S-154.sh, RE-154.sh\"`] |\n", 130 | "| `job_cluster_init_script_files` | **Filenames** for the scripts to be used when initializing the **job clusters**. Use `,` commas to separate files. 
| `str` [Eg: `\"S-154.sh, RE-154.sh\"`] |\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 0, 136 | "metadata": { 137 | "application/vnd.databricks.v1+cell": { 138 | "cellMetadata": { 139 | "byteLimit": 2048000, 140 | "rowLimit": 10000 141 | }, 142 | "inputWidgets": {}, 143 | "nuid": "8551cf68-7418-4596-853a-364ba45e9059", 144 | "showTitle": true, 145 | "tableResultSettingsMap": {}, 146 | "title": "Setup Widgets" 147 | } 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "dbutils.widgets.removeAll()\n", 152 | "\n", 153 | "dbutils.widgets.text(\"workspace_host\", \"\")\n", 154 | "workspace_host: str = getArgument(\"workspace_host\")\n", 155 | "\n", 156 | "dbutils.widgets.text(\"workspace_token\", \"\")\n", 157 | "workspace_token: str = getArgument(\"workspace_token\")\n", 158 | "\n", 159 | "dbutils.widgets.text(\"desired_runtime_version\", \"\")\n", 160 | "desired_runtime_version: str = getArgument(\"desired_runtime_version\")\n", 161 | "\n", 162 | "dbutils.widgets.text(\"init_scripts_dir\", \"\")\n", 163 | "# Validate if directory exists and normalize the path\n", 164 | "init_scripts_dir: str = str(Path(getArgument(\"init_scripts_dir\")).resolve(strict=True))\n", 165 | "\n", 166 | "dbutils.widgets.text(\"cluster_init_script_files\", \"\")\n", 167 | "cluster_init_script_files: list[str] = [\n", 168 | " filename.strip() for filename in getArgument(\"cluster_init_script_files\").split(\",\")\n", 169 | "]\n", 170 | "# Validate if files exist and are not empty\n", 171 | "assert all(\n", 172 | " (Path(init_scripts_dir) / file_name).exists()\n", 173 | " for file_name in cluster_init_script_files\n", 174 | "), \"One or more cluster init script files do not exist\"\n", 175 | "\n", 176 | "dbutils.widgets.text(\"job_cluster_init_script_files\", \"\")\n", 177 | "job_cluster_init_script_files: list[str] = [\n", 178 | " filename.strip()\n", 179 | " for filename in getArgument(\"job_cluster_init_script_files\").split(\",\")\n", 180 | "]\n", 181 | "# Validate if files exist and are not empty\n", 182 | "assert all(\n", 183 | " (Path(init_scripts_dir) / file_name).exists()\n", 184 | " for file_name in job_cluster_init_script_files\n", 185 | "), \"One or more job cluster init script files do not exist\"\n", 186 | "\n", 187 | "assert all(\n", 188 | " [\n", 189 | " workspace_host,\n", 190 | " workspace_token,\n", 191 | " desired_runtime_version,\n", 192 | " init_scripts_dir,\n", 193 | " cluster_init_script_files,\n", 194 | " job_cluster_init_script_files,\n", 195 | " ]\n", 196 | "), \"One or more required parameters for notebook functioning are missing\"\n", 197 | "\n", 198 | "logger.info(f\"{workspace_host=}\")\n", 199 | "logger.info(f\"{desired_runtime_version=}\")\n", 200 | "logger.info(f\"{init_scripts_dir=}\")\n", 201 | "logger.info(f\"{cluster_init_script_files=}\")\n", 202 | "logger.info(f\"{job_cluster_init_script_files=}\")" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 0, 208 | "metadata": { 209 | "application/vnd.databricks.v1+cell": { 210 | "cellMetadata": { 211 | "byteLimit": 2048000, 212 | "rowLimit": 10000 213 | }, 214 | "inputWidgets": {}, 215 | "nuid": "aa39e27a-0385-4b2b-a319-eeed62f96483", 216 | "showTitle": true, 217 | "tableResultSettingsMap": {}, 218 | "title": "Setup the workspace client" 219 | } 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "ws = WorkspaceClient(host=workspace_host, token=workspace_token)\n", 224 | "logger.info(f\"{ws.get_workspace_id()=}\")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 
| "execution_count": 0, 230 | "metadata": { 231 | "application/vnd.databricks.v1+cell": { 232 | "cellMetadata": { 233 | "byteLimit": 2048000, 234 | "rowLimit": 10000 235 | }, 236 | "inputWidgets": {}, 237 | "nuid": "c9a7849d-1c94-4faf-afa1-8444c9213365", 238 | "showTitle": true, 239 | "tableResultSettingsMap": {}, 240 | "title": "Validate Input Runtime" 241 | } 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "valid_workspace_versions: list[str] = sorted(\n", 246 | " list(\n", 247 | " set(\n", 248 | " [\n", 249 | " version_tuple.name.split(\" \")[0]\n", 250 | " for version_tuple in ws.clusters.spark_versions().versions\n", 251 | " ]\n", 252 | " )\n", 253 | " )\n", 254 | ")\n", 255 | "\n", 256 | "logger.info(f\"{len(valid_workspace_versions)=:,}\")\n", 257 | "\n", 258 | "assert (\n", 259 | " desired_runtime_version in valid_workspace_versions\n", 260 | "), f\"Invalid {desired_runtime_version=}\"" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "application/vnd.databricks.v1+cell": { 267 | "cellMetadata": { 268 | "byteLimit": 2048000, 269 | "rowLimit": 10000 270 | }, 271 | "inputWidgets": {}, 272 | "nuid": "995e3a0e-0aca-4cac-bd6e-baa24078dff1", 273 | "showTitle": false, 274 | "tableResultSettingsMap": {}, 275 | "title": "" 276 | } 277 | }, 278 | "source": [ 279 | "## Init Scripts" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 0, 285 | "metadata": { 286 | "application/vnd.databricks.v1+cell": { 287 | "cellMetadata": { 288 | "byteLimit": 2048000, 289 | "rowLimit": 10000 290 | }, 291 | "inputWidgets": {}, 292 | "nuid": "5aaedbf9-3102-406f-9c3b-870e24542702", 293 | "showTitle": true, 294 | "tableResultSettingsMap": {}, 295 | "title": "Define Init scripts" 296 | } 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "def make_init_scripts(init_script_files: list[str]):\n", 301 | " return [\n", 302 | " InitScriptInfo.from_dict(\n", 303 | " {\n", 304 | " \"volumes\": {\n", 305 | " \"destination\": str(Path(init_scripts_dir) / file_name),\n", 306 | " }\n", 307 | " }\n", 308 | " )\n", 309 | " for file_name in init_script_files\n", 310 | " ]\n", 311 | "\n", 312 | "\n", 313 | "cluster_init_scripts = make_init_scripts(cluster_init_script_files)\n", 314 | "job_cluster_init_scripts = make_init_scripts(job_cluster_init_script_files)\n", 315 | "\n", 316 | "logger.info(f\"{cluster_init_scripts=}\")\n", 317 | "logger.info(f\"{job_cluster_init_scripts=}\")" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": { 323 | "application/vnd.databricks.v1+cell": { 324 | "cellMetadata": { 325 | "byteLimit": 2048000, 326 | "rowLimit": 10000 327 | }, 328 | "inputWidgets": {}, 329 | "nuid": "38b92429-d1b1-48e5-a305-29c3007c3f03", 330 | "showTitle": false, 331 | "tableResultSettingsMap": {}, 332 | "title": "" 333 | } 334 | }, 335 | "source": [ 336 | "## Clusters\n", 337 | "\n", 338 | "According to the SDK and REST API documentation:\n", 339 | "\n", 340 | "- Clusters created as a result of a job cannot be updated via this endpoint. Only those created either via the `UI` or `API` can be changed.\n", 341 | "- Those clusters that are `RUNNING` will be `TERMINATED` at the time of update and restart with the new configuration." 
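To make these semantics concrete, here is a minimal sketch (not part of the original notebook) of a single partial update. It assumes the `ws` client created above; the cluster ID and runtime key are placeholders. The batched version of this call appears in the "Execute the updates" cell further down.

```python
# Sketch only (placeholder cluster ID and example runtime key): update one
# API/UI-created cluster in place. Only the fields named in `update_mask` are
# changed, and a RUNNING cluster is terminated and restarted with the new config.
from databricks.sdk.service.compute import UpdateClusterResource

ws.clusters.update(                                   # `ws` = WorkspaceClient created above
    cluster_id="0123-456789-abcdefgh",                # placeholder cluster ID
    update_mask="spark_version",                      # comma-separated field names to change
    cluster=UpdateClusterResource(spark_version="15.4.x-scala2.12"),  # example key
)
```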
342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 0, 347 | "metadata": { 348 | "application/vnd.databricks.v1+cell": { 349 | "cellMetadata": { 350 | "byteLimit": 2048000, 351 | "rowLimit": 10000 352 | }, 353 | "inputWidgets": {}, 354 | "nuid": "18545303-a26d-43e5-b500-39c9ea9c1975", 355 | "showTitle": true, 356 | "tableResultSettingsMap": {}, 357 | "title": "List all clusters" 358 | } 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "clusters = list(\n", 363 | " ws.clusters.list(\n", 364 | " filter_by=ListClustersFilterBy(\n", 365 | " cluster_sources=[ClusterSource.API, ClusterSource.UI]\n", 366 | " )\n", 367 | " )\n", 368 | ")\n", 369 | "\n", 370 | "logger.info(f\"Found {len(clusters)} clusters\")" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 0, 376 | "metadata": { 377 | "application/vnd.databricks.v1+cell": { 378 | "cellMetadata": { 379 | "byteLimit": 2048000, 380 | "rowLimit": 10000 381 | }, 382 | "inputWidgets": {}, 383 | "nuid": "69aece1f-8c0c-496a-aa1a-cbc613c69c70", 384 | "showTitle": true, 385 | "tableResultSettingsMap": {}, 386 | "title": "Display clusters as table" 387 | } 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "pd.DataFrame([cluster.as_dict() for cluster in clusters])" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 0, 397 | "metadata": { 398 | "application/vnd.databricks.v1+cell": { 399 | "cellMetadata": { 400 | "byteLimit": 2048000, 401 | "rowLimit": 10000 402 | }, 403 | "inputWidgets": {}, 404 | "nuid": "a6ebd085-4694-409a-96f1-a89eaf42119c", 405 | "showTitle": true, 406 | "tableResultSettingsMap": {}, 407 | "title": "Save cluster update parameters" 408 | } 409 | }, 410 | "outputs": [], 411 | "source": [ 412 | "# A dictionary which maps each cluster ID to parameters for the cluster update method\n", 413 | "cluster_updates = {}" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "application/vnd.databricks.v1+cell": { 420 | "cellMetadata": { 421 | "byteLimit": 2048000, 422 | "rowLimit": 10000 423 | }, 424 | "inputWidgets": {}, 425 | "nuid": "8c860c4d-9a70-42d7-92fb-cb664ac80a7a", 426 | "showTitle": false, 427 | "tableResultSettingsMap": {}, 428 | "title": "" 429 | } 430 | }, 431 | "source": [ 432 | "### Updating the Databricks Runtime Version\n", 433 | "\n", 434 | "The runtime version is the `cluster.spark_version` field." 
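As a quick illustration (the keys below are examples; real keys come from `ws.clusters.spark_versions()`), only the leading `major.minor` prefix of a runtime key changes, while the Photon/Scala suffix stays the same. The helper in the next cell does this with a regex and validates the result against the workspace's available versions.

```python
# Illustration only: swap just the "<major>.<minor>" prefix of an example key.
old_key = "11.3.x-photon-scala2.12"          # example spark_version key
new_key = old_key.replace("11.3", "15.4", 1)
print(new_key)                               # -> "15.4.x-photon-scala2.12"
```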
435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 0, 440 | "metadata": { 441 | "application/vnd.databricks.v1+cell": { 442 | "cellMetadata": { 443 | "byteLimit": 2048000, 444 | "rowLimit": 10000 445 | }, 446 | "inputWidgets": {}, 447 | "nuid": "fd0f7b31-e0a8-471c-b395-ccf5297e87a6", 448 | "showTitle": true, 449 | "tableResultSettingsMap": {}, 450 | "title": "Function for retrieving the new runtime version" 451 | } 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "valid_versions = set(\n", 456 | " pd.DataFrame(\n", 457 | " [version.as_dict() for version in ws.clusters.spark_versions().versions]\n", 458 | " )[\"key\"].tolist()\n", 459 | ")\n", 460 | "\n", 461 | "\n", 462 | "def get_updated_spark_version_key(\n", 463 | " spark_version_key: str, desired_runtime_version: str\n", 464 | ") -> str:\n", 465 | " new_spark_version = re.sub(\n", 466 | " r\"^\\d{2}\\.\\d\", desired_runtime_version, spark_version_key\n", 467 | " )\n", 468 | "\n", 469 | " if new_spark_version not in valid_versions:\n", 470 | " raise ValueError(f\"Could not validate version '{new_spark_version}'\")\n", 471 | "\n", 472 | " return new_spark_version\n", 473 | "\n", 474 | "\n", 475 | "assert (\n", 476 | " get_updated_spark_version_key(\"11.3.x-photon-scala2.12\", \"15.4\")\n", 477 | " == \"15.4.x-photon-scala2.12\"\n", 478 | ")" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 0, 484 | "metadata": { 485 | "application/vnd.databricks.v1+cell": { 486 | "cellMetadata": { 487 | "byteLimit": 2048000, 488 | "rowLimit": 10000 489 | }, 490 | "inputWidgets": {}, 491 | "nuid": "efb3415b-6f28-4500-a480-29b43a2373c6", 492 | "showTitle": true, 493 | "tableResultSettingsMap": {}, 494 | "title": "Given a cluster, define the params for updating it's runtime version" 495 | } 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "def update_cluster_spark_version(cluster: ClusterDetails):\n", 500 | " cluster_updates[cluster.cluster_id] = {\n", 501 | " **(cluster_updates.get(cluster.cluster_id) or {}),\n", 502 | " \"spark_version\": get_updated_spark_version_key(\n", 503 | " cluster.spark_version, desired_runtime_version\n", 504 | " ),\n", 505 | " }\n", 506 | "\n", 507 | "\n", 508 | "for cluster in clusters:\n", 509 | " update_cluster_spark_version(cluster)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": { 515 | "application/vnd.databricks.v1+cell": { 516 | "cellMetadata": { 517 | "byteLimit": 2048000, 518 | "rowLimit": 10000 519 | }, 520 | "inputWidgets": {}, 521 | "nuid": "9d802ce3-3004-4d6a-980c-a46c1ed81cbb", 522 | "showTitle": false, 523 | "tableResultSettingsMap": {}, 524 | "title": "" 525 | } 526 | }, 527 | "source": [ 528 | "### Update the Init Scripts" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 0, 534 | "metadata": { 535 | "application/vnd.databricks.v1+cell": { 536 | "cellMetadata": { 537 | "byteLimit": 2048000, 538 | "rowLimit": 10000 539 | }, 540 | "inputWidgets": {}, 541 | "nuid": "2d188fd2-b3ae-4409-8870-3ad1dbf6dca9", 542 | "showTitle": true, 543 | "tableResultSettingsMap": {}, 544 | "title": "Given a cluster, update it so that it uses specific init scripts" 545 | } 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "def update_cluster_init_scripts(\n", 550 | " cluster: ClusterDetails, init_scripts: list[InitScriptInfo]\n", 551 | "):\n", 552 | " cluster_updates[cluster.cluster_id] = {\n", 553 | " **(cluster_updates.get(cluster.cluster_id) or {}),\n", 554 | " \"init_scripts\": init_scripts,\n", 555 | " }\n", 
556 | "\n", 557 | "\n", 558 | "for cluster in clusters:\n", 559 | " update_cluster_init_scripts(cluster, cluster_init_scripts)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": { 565 | "application/vnd.databricks.v1+cell": { 566 | "cellMetadata": { 567 | "byteLimit": 2048000, 568 | "rowLimit": 10000 569 | }, 570 | "inputWidgets": {}, 571 | "nuid": "d1980665-fc7c-4341-b78f-c4c8b425ce42", 572 | "showTitle": false, 573 | "tableResultSettingsMap": {}, 574 | "title": "" 575 | } 576 | }, 577 | "source": [ 578 | "### Execute the updates" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 0, 584 | "metadata": { 585 | "application/vnd.databricks.v1+cell": { 586 | "cellMetadata": { 587 | "byteLimit": 2048000, 588 | "rowLimit": 10000 589 | }, 590 | "inputWidgets": {}, 591 | "nuid": "4c32de4a-0967-495a-bce7-cd45ba442eb2", 592 | "showTitle": true, 593 | "tableResultSettingsMap": {}, 594 | "title": "Use the SDK to make the cluster updates" 595 | } 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "clusters_to_update = clusters\n", 600 | "names_for_clusters_that_failed_update = []\n", 601 | "\n", 602 | "for cluster in clusters_to_update:\n", 603 | " cluster_id = cluster.cluster_id\n", 604 | "\n", 605 | " # Do not update the cluster which is running this notebook\n", 606 | " # because it will force a restart\n", 607 | " if cluster_id == spark.conf.get(\"spark.databricks.clusterUsageTags.clusterId\"):\n", 608 | " logger.info(\n", 609 | " f\"Skipping cluster: '{cluster.cluster_name}', because it is running this notebook\"\n", 610 | " )\n", 611 | " continue\n", 612 | "\n", 613 | " updates = cluster_updates.get(cluster_id)\n", 614 | "\n", 615 | " if updates is None:\n", 616 | " continue\n", 617 | "\n", 618 | " update_mask = \",\".join(updates.keys())\n", 619 | "\n", 620 | " try:\n", 621 | " ws.clusters.update(\n", 622 | " cluster_id=cluster_id,\n", 623 | " update_mask=update_mask,\n", 624 | " cluster=UpdateClusterResource(**updates),\n", 625 | " )\n", 626 | " logger.info(f\"Updated cluster: '{cluster.cluster_name}'\")\n", 627 | " except Exception as e:\n", 628 | " logger.error(f\"Failed to update cluster: '{cluster.cluster_name}'\")\n", 629 | " logger.error(e)\n", 630 | " names_for_clusters_that_failed_update.append(cluster.cluster_name)\n", 631 | "\n", 632 | "\n", 633 | "cluster_update_failures = len(names_for_clusters_that_failed_update)\n", 634 | "cluster_count = len(clusters_to_update)\n", 635 | "\n", 636 | "if cluster_update_failures > 0:\n", 637 | " cluster_update_failure_message = (\n", 638 | " f\"Failed to update {cluster_update_failures} of {cluster_count} cluster(s)\"\n", 639 | " )\n", 640 | " if cluster_update_failures / len(clusters) >= 0.25:\n", 641 | " raise Exception(cluster_update_failure_message)\n", 642 | "\n", 643 | " logger.warning(cluster_update_failure_message)\n", 644 | "else:\n", 645 | " logger.info(f\"Updated all {cluster_count} cluster(s)\")" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": { 651 | "application/vnd.databricks.v1+cell": { 652 | "cellMetadata": { 653 | "byteLimit": 2048000, 654 | "rowLimit": 10000 655 | }, 656 | "inputWidgets": {}, 657 | "nuid": "9ed4b505-873b-4211-8d62-5caff2de6e21", 658 | "showTitle": false, 659 | "tableResultSettingsMap": {}, 660 | "title": "" 661 | } 662 | }, 663 | "source": [ 664 | "## Jobs" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 0, 670 | "metadata": { 671 | "application/vnd.databricks.v1+cell": { 672 | "cellMetadata": { 673 | 
"byteLimit": 2048000, 674 | "rowLimit": 10000 675 | }, 676 | "inputWidgets": {}, 677 | "nuid": "d5eb9996-eb42-408e-8851-13985e039ca4", 678 | "showTitle": true, 679 | "tableResultSettingsMap": {}, 680 | "title": "List all jobs" 681 | } 682 | }, 683 | "outputs": [], 684 | "source": [ 685 | "jobs = list(ws.jobs.list(expand_tasks=True))\n", 686 | "logger.info(f\"Found {len(jobs)} jobs\")" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": 0, 692 | "metadata": { 693 | "application/vnd.databricks.v1+cell": { 694 | "cellMetadata": { 695 | "byteLimit": 2048000, 696 | "rowLimit": 10000 697 | }, 698 | "inputWidgets": {}, 699 | "nuid": "ada4fa8c-43ea-45fb-bb0a-6df15bb2a7a3", 700 | "showTitle": true, 701 | "tableResultSettingsMap": {}, 702 | "title": "Display jobs as a table" 703 | } 704 | }, 705 | "outputs": [], 706 | "source": [ 707 | "pd.DataFrame([job.as_dict() for job in jobs])" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 0, 713 | "metadata": { 714 | "application/vnd.databricks.v1+cell": { 715 | "cellMetadata": { 716 | "byteLimit": 2048000, 717 | "rowLimit": 10000 718 | }, 719 | "inputWidgets": {}, 720 | "nuid": "939f4dee-2999-4a05-b247-dbfb6f097b20", 721 | "showTitle": true, 722 | "tableResultSettingsMap": {}, 723 | "title": "Creates a new job with its job clusters on the updated runtime version" 724 | } 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "def update_job_clusters_spark_version(job: Job | BaseJob) -> Job | BaseJob:\n", 729 | " job_clusters = []\n", 730 | " for jc in job.settings.job_clusters:\n", 731 | " njc = jc.__class__.from_dict(jc.as_dict())\n", 732 | " njc.new_cluster.spark_version = get_updated_spark_version_key(\n", 733 | " njc.new_cluster.spark_version, desired_runtime_version\n", 734 | " )\n", 735 | " job_clusters.append(njc)\n", 736 | "\n", 737 | " new_job = job.__class__.from_dict(job.as_dict())\n", 738 | " new_job.settings.job_clusters = job_clusters\n", 739 | " return new_job" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 0, 745 | "metadata": { 746 | "application/vnd.databricks.v1+cell": { 747 | "cellMetadata": { 748 | "byteLimit": 2048000, 749 | "rowLimit": 10000 750 | }, 751 | "inputWidgets": {}, 752 | "nuid": "2c5b17e3-d89e-4a8e-a7b8-5ed714c4a7ef", 753 | "showTitle": true, 754 | "tableResultSettingsMap": {}, 755 | "title": "Creates a new job with its job clusters using the specified init scripts" 756 | } 757 | }, 758 | "outputs": [], 759 | "source": [ 760 | "def update_job_clusters_init_scripts(\n", 761 | " job: Job | BaseJob, init_scripts: list[InitScriptInfo]\n", 762 | ") -> Job | BaseJob:\n", 763 | " job_clusters = []\n", 764 | " for jc in job.settings.job_clusters:\n", 765 | " njc = jc.__class__.from_dict(jc.as_dict())\n", 766 | " njc.new_cluster.init_scripts = init_scripts\n", 767 | " job_clusters.append(njc)\n", 768 | "\n", 769 | " new_job = job.__class__.from_dict(job.as_dict())\n", 770 | " new_job.settings.job_clusters = job_clusters\n", 771 | " return new_job" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 0, 777 | "metadata": { 778 | "application/vnd.databricks.v1+cell": { 779 | "cellMetadata": { 780 | "byteLimit": 2048000, 781 | "rowLimit": 10000 782 | }, 783 | "inputWidgets": {}, 784 | "nuid": "80a63f0b-abce-4a5b-8ad5-a151f9ff35a6", 785 | "showTitle": true, 786 | "tableResultSettingsMap": {}, 787 | "title": "Update all jobs" 788 | } 789 | }, 790 | "outputs": [], 791 | "source": [ 792 | "names_for_jobs_that_failed_update = []\n", 
793 | "\n", 794 | "jobs_to_update = jobs\n", 795 | "\n", 796 | "for job in jobs_to_update:\n", 797 | " njob = update_job_clusters_spark_version(job)\n", 798 | " njob = update_job_clusters_init_scripts(njob, job_cluster_init_scripts)\n", 799 | "\n", 800 | " new_settings = njob.settings.as_dict()\n", 801 | " new_settings = {\n", 802 | " k: v for k, v in new_settings.items() if k in (\"job_clusters\", \"init_scripts\")\n", 803 | " }\n", 804 | " new_settings = JobSettings.from_dict(new_settings)\n", 805 | "\n", 806 | " try:\n", 807 | " ws.jobs.update(job_id=job.job_id, new_settings=njob.settings)\n", 808 | " logger.info(f\"Updated job: '{job.settings.name}'\")\n", 809 | " except Exception as e:\n", 810 | " logger.error(f\"Failed to update job: '{job.settings.name}'\")\n", 811 | " logger.error(e)\n", 812 | " names_for_jobs_that_failed_update.append(job.settings.name)\n", 813 | "\n", 814 | "job_update_failures = len(names_for_jobs_that_failed_update)\n", 815 | "job_count = len(jobs_to_update)\n", 816 | "\n", 817 | "if job_update_failures > 0:\n", 818 | " job_update_failure_message = (\n", 819 | " f\"Failed to update {job_update_failures} of {job_count} job(s)\"\n", 820 | " )\n", 821 | " if job_update_failures / len(jobs) >= 0.25:\n", 822 | " raise Exception(job_update_failure_message)\n", 823 | "\n", 824 | " logger.warning(job_update_failure_message)\n", 825 | "else:\n", 826 | " logger.info(f\"Updated all {job_count} job(s)\")" 827 | ] 828 | } 829 | ], 830 | "metadata": { 831 | "application/vnd.databricks.v1+notebook": { 832 | "computePreferences": null, 833 | "dashboards": [], 834 | "environmentMetadata": null, 835 | "language": "python", 836 | "notebookMetadata": { 837 | "pythonIndentUnit": 4 838 | }, 839 | "notebookName": "update_job_cluster", 840 | "widgets": { 841 | "cluster_init_script_files": { 842 | "currentValue": "", 843 | "nuid": "57203fa8-3437-4dad-a7b2-ae12d82f7612", 844 | "typedWidgetInfo": { 845 | "autoCreated": false, 846 | "defaultValue": "", 847 | "label": null, 848 | "name": "cluster_init_script_files", 849 | "options": { 850 | "widgetDisplayType": "Text", 851 | "validationRegex": null 852 | }, 853 | "parameterDataType": "String" 854 | }, 855 | "widgetInfo": { 856 | "widgetType": "text", 857 | "defaultValue": "", 858 | "label": null, 859 | "name": "cluster_init_script_files", 860 | "options": { 861 | "widgetType": "text", 862 | "autoCreated": null, 863 | "validationRegex": null 864 | } 865 | } 866 | }, 867 | "desired_runtime_version": { 868 | "currentValue": "", 869 | "nuid": "37292731-03a3-4422-9f9b-80d4b9fa8e0d", 870 | "typedWidgetInfo": { 871 | "autoCreated": false, 872 | "defaultValue": "", 873 | "label": null, 874 | "name": "desired_runtime_version", 875 | "options": { 876 | "widgetDisplayType": "Text", 877 | "validationRegex": null 878 | }, 879 | "parameterDataType": "String" 880 | }, 881 | "widgetInfo": { 882 | "widgetType": "text", 883 | "defaultValue": "", 884 | "label": null, 885 | "name": "desired_runtime_version", 886 | "options": { 887 | "widgetType": "text", 888 | "autoCreated": null, 889 | "validationRegex": null 890 | } 891 | } 892 | }, 893 | "init_scripts_dir": { 894 | "currentValue": "", 895 | "nuid": "77a17daf-1ac2-4821-8842-a395e102a92c", 896 | "typedWidgetInfo": { 897 | "autoCreated": false, 898 | "defaultValue": "", 899 | "label": null, 900 | "name": "init_scripts_dir", 901 | "options": { 902 | "widgetDisplayType": "Text", 903 | "validationRegex": null 904 | }, 905 | "parameterDataType": "String" 906 | }, 907 | "widgetInfo": { 908 | "widgetType": "text", 
909 | "defaultValue": "", 910 | "label": null, 911 | "name": "init_scripts_dir", 912 | "options": { 913 | "widgetType": "text", 914 | "autoCreated": null, 915 | "validationRegex": null 916 | } 917 | } 918 | }, 919 | "job_cluster_init_script_files": { 920 | "currentValue": "", 921 | "nuid": "5017ecfc-acc5-46d4-83aa-44e7276c4ddf", 922 | "typedWidgetInfo": { 923 | "autoCreated": false, 924 | "defaultValue": "", 925 | "label": null, 926 | "name": "job_cluster_init_script_files", 927 | "options": { 928 | "widgetDisplayType": "Text", 929 | "validationRegex": null 930 | }, 931 | "parameterDataType": "String" 932 | }, 933 | "widgetInfo": { 934 | "widgetType": "text", 935 | "defaultValue": "", 936 | "label": null, 937 | "name": "job_cluster_init_script_files", 938 | "options": { 939 | "widgetType": "text", 940 | "autoCreated": null, 941 | "validationRegex": null 942 | } 943 | } 944 | }, 945 | "workspace_host": { 946 | "currentValue": "", 947 | "nuid": "300637f4-1b1a-41b0-8507-13c8df6f5e65", 948 | "typedWidgetInfo": { 949 | "autoCreated": false, 950 | "defaultValue": "", 951 | "label": null, 952 | "name": "workspace_host", 953 | "options": { 954 | "widgetDisplayType": "Text", 955 | "validationRegex": null 956 | }, 957 | "parameterDataType": "String" 958 | }, 959 | "widgetInfo": { 960 | "widgetType": "text", 961 | "defaultValue": "", 962 | "label": null, 963 | "name": "workspace_host", 964 | "options": { 965 | "widgetType": "text", 966 | "autoCreated": null, 967 | "validationRegex": null 968 | } 969 | } 970 | }, 971 | "workspace_token": { 972 | "currentValue": "", 973 | "nuid": "96fd30d3-9fe9-4666-a928-ac1afcdde420", 974 | "typedWidgetInfo": { 975 | "autoCreated": false, 976 | "defaultValue": "", 977 | "label": null, 978 | "name": "workspace_token", 979 | "options": { 980 | "widgetDisplayType": "Text", 981 | "validationRegex": null 982 | }, 983 | "parameterDataType": "String" 984 | }, 985 | "widgetInfo": { 986 | "widgetType": "text", 987 | "defaultValue": "", 988 | "label": null, 989 | "name": "workspace_token", 990 | "options": { 991 | "widgetType": "text", 992 | "autoCreated": null, 993 | "validationRegex": null 994 | } 995 | } 996 | } 997 | } 998 | }, 999 | "language_info": { 1000 | "name": "python" 1001 | } 1002 | }, 1003 | "nbformat": 4, 1004 | "nbformat_minor": 0 1005 | } 1006 | -------------------------------------------------------------------------------- /notebooks/workflow_calendar/README.md: -------------------------------------------------------------------------------- 1 |

Workflow Calendar 📆 2 | 3 | Databricks 4 | Plotly 5 |
6 | 7 | ## Introduction 8 | 9 | This notebook is designed to visualize workflow schedules and their respective runs. It showcases aspects of scheduling, execution, and duration of your tasks. 10 | 11 | ## Use Cases 12 | 13 | This notebook is useful for the below cases: 14 | 15 | 1. **Performance Monitoring**: Keep an eye on how long your runs are taking and identify potential bottlenecks. 16 | 2. **Scheduling Insights**: Understand when your workflows are scheduled and when the first run occurred. Resolve timing conflicts and ensure that your workflows are running as expected. 17 | 3. **Historical Analysis**: Analyze the historical data of your runs, making it easier to identify trends and patterns. 18 | 4. **Resource Allocation**: Optimize your resource allocation based on past performance. 19 | 5. **Troubleshooting**: Quickly identify runs that failed or took longer than expected. 20 | 21 | --- 22 | 23 | 24 | 25 | --- 26 | 27 | See more details in the notebook (ipynb) 28 | -------------------------------------------------------------------------------- /notebooks/workflow_calendar/assets/example_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/workflow_calendar/assets/example_viz.png -------------------------------------------------------------------------------- /notebooks/workflow_calendar/workflow_calender.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "27fd11cc-a11d-4c9a-9a74-e44f8763a4ce", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Workflow Calendar 📆\n", 16 | "## Requirements\n", 17 | "### Databricks\n", 18 | "* A Databricks Workspace & Workspace Access Token\n", 19 | "* At least one runnable cluster within the workspace\n", 20 | "* At least one scheduled job in Databricks workflows\n", 21 | "\n", 22 | "### Packages\n", 23 | "This process relies on a package called `cron-schedule-triggers` which is used to infer the cron-schedule expression. 
`pandas` for data manipulation and `plotly` for visualization.\n", 24 | "* cron-schedule-triggers\n", 25 | "* pandas\n", 26 | "* plotly\n", 27 | "\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "application/vnd.databricks.v1+cell": { 35 | "cellMetadata": {}, 36 | "inputWidgets": {}, 37 | "nuid": "6d4292f1-19fa-44ce-aeb4-3b108c69e659", 38 | "showTitle": false, 39 | "title": "" 40 | } 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "pip install cron-schedule-triggers -q" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "application/vnd.databricks.v1+cell": { 51 | "cellMetadata": {}, 52 | "inputWidgets": {}, 53 | "nuid": "760300fa-6924-47b1-b471-40cc1c990670", 54 | "showTitle": false, 55 | "title": "" 56 | } 57 | }, 58 | "source": [ 59 | "## Imports" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "application/vnd.databricks.v1+cell": { 67 | "cellMetadata": { 68 | "byteLimit": 2048000, 69 | "rowLimit": 10000 70 | }, 71 | "inputWidgets": {}, 72 | "nuid": "89018793-50c2-432c-8979-8e287a048ccc", 73 | "showTitle": false, 74 | "title": "" 75 | } 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "import requests\n", 80 | "from typing import Optional, Callable\n", 81 | "import pandas as pd\n", 82 | "import datetime\n", 83 | "import re\n", 84 | "\n", 85 | "from cstriggers.core.trigger import QuartzCron\n", 86 | "from datetime import timedelta\n", 87 | "import plotly.express as px\n", 88 | "\n", 89 | "\n", 90 | "import plotly.graph_objects as go\n", 91 | "import plotly.figure_factory as ff" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "application/vnd.databricks.v1+cell": { 98 | "cellMetadata": {}, 99 | "inputWidgets": {}, 100 | "nuid": "764722d6-53d7-4d0e-93f6-4aa2ad4154d8", 101 | "showTitle": false, 102 | "title": "" 103 | } 104 | }, 105 | "source": [ 106 | "## Input Data\n", 107 | "\n", 108 | "> Provide the date values in `YYYY-MM-DD` format" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "application/vnd.databricks.v1+cell": { 116 | "cellMetadata": { 117 | "byteLimit": 2048000, 118 | "rowLimit": 10000 119 | }, 120 | "inputWidgets": {}, 121 | "nuid": "249ef86e-0d94-47af-a9a1-6eea8f273919", 122 | "showTitle": false, 123 | "title": "" 124 | } 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "dbutils.widgets.removeAll()\n", 129 | "\n", 130 | "dbutils.widgets.text(\"start_date\", \"2023-10-01\")\n", 131 | "\n", 132 | "start_date: datetime.datetime = datetime.datetime.strptime(\n", 133 | " getArgument(\"start_date\"), \"%Y-%m-%d\"\n", 134 | ")\n", 135 | "\n", 136 | "dbutils.widgets.text(\"end_date\", \"2023-11-05\")\n", 137 | "\n", 138 | "end_date: datetime.datetime = datetime.datetime.strptime(\n", 139 | " getArgument(\"end_date\"), \"%Y-%m-%d\"\n", 140 | ")\n", 141 | "\n", 142 | "dbutils.widgets.text(\"databricks_url\", \"\")\n", 143 | "databricks_url: str = getArgument(\"databricks_url\")\n", 144 | "\n", 145 | "dbutils.widgets.text(\"databricks_workspace_token\", \"\")\n", 146 | "databricks_workspace_token: str = getArgument(\"databricks_workspace_token\")\n", 147 | "\n", 148 | "headers: dict = {\"Authorization\": f\"Bearer {databricks_workspace_token}\"}" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "application/vnd.databricks.v1+cell": { 156 | "cellMetadata": { 157 | "byteLimit": 2048000, 158 | 
"rowLimit": 10000 159 | }, 160 | "inputWidgets": {}, 161 | "nuid": "9c5b390d-b903-4cec-a8b9-f8855f297f12", 162 | "showTitle": false, 163 | "title": "" 164 | } 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "query_params: dict = {\n", 169 | " \"LIST_JOBS_LIMIT\": 100, # max limit\n", 170 | " \"LIST_RUNS_LIMIT\": 25, # max limit\n", 171 | " \"EXPAND_RUNS\": \"true\",\n", 172 | " \"EXPAND_TASKS\": \"true\",\n", 173 | "}" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "application/vnd.databricks.v1+cell": { 181 | "cellMetadata": { 182 | "byteLimit": 2048000, 183 | "rowLimit": 10000 184 | }, 185 | "inputWidgets": {}, 186 | "nuid": "131b84b3-735e-46bf-9c0e-5494c55606f9", 187 | "showTitle": false, 188 | "title": "" 189 | } 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "def paginate(\n", 194 | " can_paginate: bool,\n", 195 | " next_page_token: Optional[str],\n", 196 | " url: str,\n", 197 | " workspace_token: str,\n", 198 | " function_to_call: Callable,\n", 199 | ") -> None:\n", 200 | " \"\"\"\n", 201 | " Paginates to the next page if possible\n", 202 | " input:\n", 203 | " can_paginate [bool]: Boolean info about wheather there is additional info.\n", 204 | " next_page_token [str]: Token needed in url query param to paginate to next page.\n", 205 | " url [str]: Url used to list the needed info.\n", 206 | " workspace_token[str]: Databricks workspace token from the widget, needed for authorization.\n", 207 | " function_to_call [Callable]: Function that gets called with the paginated url to paginate further.\n", 208 | " output:\n", 209 | " None\n", 210 | " \"\"\"\n", 211 | "\n", 212 | " if next_page_token and can_paginate:\n", 213 | " if \"&page_token\" in url:\n", 214 | " url = f\"{url[:url.find('&page_token')]}&page_token={next_page_token}\"\n", 215 | " else:\n", 216 | " url = f\"{url}&page_token={next_page_token}\"\n", 217 | "\n", 218 | " function_to_call(url, workspace_token)\n", 219 | " else:\n", 220 | " return" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "application/vnd.databricks.v1+cell": { 227 | "cellMetadata": {}, 228 | "inputWidgets": {}, 229 | "nuid": "6d032e6a-8afa-4130-8f0c-d37f0c69f0ff", 230 | "showTitle": false, 231 | "title": "" 232 | } 233 | }, 234 | "source": [ 235 | "## Steps 📊\n", 236 | "\n", 237 | "### 1. Fetch Workflows and Runs 🏃‍♂️\n", 238 | "\n", 239 | "This notebook begins by fetching all the [workflows](https://docs.databricks.com/api/workspace/jobs/list) in your Databricks workspace. It also retrieves information about the [runs](https://docs.databricks.com/api/workspace/runs/list) that have occurred within a specified date range, which is provided by the user.\n", 240 | "\n", 241 | "### 2. Parse the fetched info 🧩\n", 242 | "Workflows have a schedule which is defined using a `quartz_cron-expression` using which we generate the dates of next runs.\n", 243 | "\n", 244 | "### 3. 
Visualizations 📈\n", 245 | "\n", 246 | "The notebook provides three insightful visualizations:\n", 247 | "\n", 248 | "- **First Scheduled Run of All Workflows**: Visualizes the first scheduled run of each workflow since the start date.\n", 249 | "\n", 250 | "- **Scheduled Runs Between Start and End Date**: Shows all scheduled runs that occurred within the specified date range.\n", 251 | "\n", 252 | "- **All Runs Since Start Date with Time Taken**: Displays all runs that have occurred since the start date, plotting them along with their execution time for performance analysis.\n", 253 | "\n" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "application/vnd.databricks.v1+cell": { 260 | "cellMetadata": {}, 261 | "inputWidgets": {}, 262 | "nuid": "6f8a1915-b729-448f-a87f-536f0e8e01ef", 263 | "showTitle": false, 264 | "title": "" 265 | } 266 | }, 267 | "source": [ 268 | "## List workflows \n", 269 | "#### Fetches all workflows in current workspace and its respective configs\n", 270 | "API Docs\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "application/vnd.databricks.v1+cell": { 278 | "cellMetadata": { 279 | "byteLimit": 2048000, 280 | "rowLimit": 10000 281 | }, 282 | "inputWidgets": {}, 283 | "nuid": "07dcd0ca-de2a-474b-be0b-906ebb6d9bd0", 284 | "showTitle": false, 285 | "title": "" 286 | } 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "def getAllJobs(list_jobs_url: str, workspace_token: str) -> None:\n", 291 | " \"\"\"\n", 292 | " Fetches all the jobs and metadata about them.\n", 293 | " input:\n", 294 | " lists_jobs_url [str]: Databricks API used to fetch all the jobs.\n", 295 | " workspace_token[str]: Databricks workspace token from the widget, needed for authorization.\n", 296 | " output:\n", 297 | " None\n", 298 | " \"\"\"\n", 299 | "\n", 300 | " response = requests.get(\n", 301 | " list_jobs_url,\n", 302 | " headers=headers,\n", 303 | " )\n", 304 | " assert response.status_code == 200\n", 305 | "\n", 306 | " response_data = response.json()\n", 307 | "\n", 308 | " for job in response_data.get(\"jobs\", []):\n", 309 | " if job.get(\"settings\", {}).get(\"schedule\"):\n", 310 | " jobs[job.get(\"job_id\")] = {\n", 311 | " \"name\": job.get(\"settings\", {}).get(\"name\"),\n", 312 | " \"quartz_cron_expression\": job.get(\"settings\", {})\n", 313 | " .get(\"schedule\", {})\n", 314 | " .get(\"quartz_cron_expression\")\n", 315 | " .lower(),\n", 316 | " }\n", 317 | "\n", 318 | " paginate(\n", 319 | " response_data.get(\"has_more\", False),\n", 320 | " response_data.get(\"next_page_token\"),\n", 321 | " list_jobs_url,\n", 322 | " workspace_token,\n", 323 | " getAllJobs,\n", 324 | " )\n", 325 | "\n", 326 | "\n", 327 | "jobs = {} # holds all jobs' info\n", 328 | "\n", 329 | "list_jobs_url: str = (\n", 330 | " databricks_url\n", 331 | " + \"/api/2.1/jobs/list\"\n", 332 | " + f\"?limit={query_params.get('LIST_JOBS_LIMIT')}\"\n", 333 | " + f\"&expand_tasks={query_params['EXPAND_TASKS']}\"\n", 334 | ")\n", 335 | "\n", 336 | "getAllJobs(list_jobs_url, databricks_workspace_token)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": { 342 | "application/vnd.databricks.v1+cell": { 343 | "cellMetadata": {}, 344 | "inputWidgets": {}, 345 | "nuid": "e7996e9d-3e42-4be9-9d8e-5d814d1887f3", 346 | "showTitle": false, 347 | "title": "" 348 | } 349 | }, 350 | "source": [ 351 | "## Parse the fetched data\n", 352 | "#### Infer the cron expression and calculate the next run. 
\n", 353 | "#### Additionally you can also categorize workflows based on the title, as this category is what determines the colour of the plotted workflow." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "application/vnd.databricks.v1+cell": { 361 | "cellMetadata": { 362 | "byteLimit": 2048000, 363 | "rowLimit": 10000 364 | }, 365 | "inputWidgets": {}, 366 | "nuid": "fcfbf534-33c4-4f04-9157-002ba0858386", 367 | "showTitle": false, 368 | "title": "" 369 | } 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "def categorizeWorkflow(workflow_title: str) -> str:\n", 374 | " \"\"\"You can add custom grouping logic. as this will be used to\n", 375 | " group the workflows, as they will be coloured based on their categories\n", 376 | " in the plot.\n", 377 | " input:\n", 378 | " workflow_title : str\n", 379 | " output:\n", 380 | " category : str\n", 381 | " \"\"\"\n", 382 | "\n", 383 | " category = workflow_title # add custom logic to categorize the workflow\n", 384 | " return category\n", 385 | "\n", 386 | "\n", 387 | "for job_id, job_info in jobs.items():\n", 388 | " cron_expression = job_info[\"quartz_cron_expression\"]\n", 389 | "\n", 390 | " cron_obj = QuartzCron(\n", 391 | " schedule_string=cron_expression,\n", 392 | " start_date=start_date, # This is the start date based on which the next scheduled run is generated. You can change it as per your needs.\n", 393 | " )\n", 394 | "\n", 395 | " next_scheduled_run = cron_obj.next_trigger(isoformat=False)\n", 396 | " # print(next_scheduled_run)\n", 397 | " jobs[job_id][\"next_scheduled_run\"] = next_scheduled_run\n", 398 | " jobs[job_id][\"workflow_category\"] = categorizeWorkflow(jobs[job_id][\"name\"])" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": { 404 | "application/vnd.databricks.v1+cell": { 405 | "cellMetadata": {}, 406 | "inputWidgets": {}, 407 | "nuid": "083dcc0f-f5fd-4a3b-b1ea-71c8cda66492", 408 | "showTitle": false, 409 | "title": "" 410 | } 411 | }, 412 | "source": [ 413 | "## Jitter workflows\n", 414 | "#### Sometimes workflows maybe scheduled too close to each other, this causes them to be too close to each other in the visualization, thus we jitter the workflows slighlty so as to obtain a neat visualization." 
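As a toy illustration (made-up workflow names and timestamps, intended to be run after the `jitterPoints` cell below has been defined): two schedules only two minutes apart are pushed ten minutes apart so their markers do not overlap in the plot.

```python
# Toy example: run after jitterPoints (next cell) has been defined.
import pandas as pd

toy = pd.DataFrame(
    {
        "name": ["workflow_a", "workflow_b"],  # made-up workflow names
        "start_datetime": pd.to_datetime(["2023-10-01 02:00", "2023-10-01 02:02"]),
    }
).sort_values("start_datetime")

toy = jitterPoints(toy)
print(toy["start_datetime"].tolist())
# expected: the second run is shifted to 2023-10-01 02:10 (10 minutes after the first)
```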
415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "application/vnd.databricks.v1+cell": { 422 | "cellMetadata": { 423 | "byteLimit": 2048000, 424 | "rowLimit": 10000 425 | }, 426 | "inputWidgets": {}, 427 | "nuid": "92f15423-f544-46f1-b040-9d1e6b96b895", 428 | "showTitle": false, 429 | "title": "" 430 | } 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "def jitterPoints(df: pd.DataFrame) -> pd.DataFrame:\n", 435 | " \"\"\"If two workflow's have schedules too close to each other\n", 436 | " then this function moves them a bit away from each other\n", 437 | " so that the visualization is neat\"\"\"\n", 438 | " # Initialize a flag to keep track of whether any adjustments were made\n", 439 | " adjusted = True\n", 440 | " max_iterations = 2 # Set a maximum number of iterations, increase if you have a lot of conflicting workflow schedules.\n", 441 | " jitter_minutes = 10 # adjust based on need\n", 442 | "\n", 443 | " iteration = 0\n", 444 | " while adjusted and iteration < max_iterations:\n", 445 | " adjusted = False\n", 446 | "\n", 447 | " for i in range(1, len(df)):\n", 448 | " diff = df[\"start_datetime\"].iloc[i] - df[\"start_datetime\"].iloc[i - 1]\n", 449 | "\n", 450 | " if diff <= timedelta(minutes=10):\n", 451 | " # Adjust the start time of the current event\n", 452 | " df[\"start_datetime\"].iloc[i] = df[\"start_datetime\"].iloc[\n", 453 | " i - 1\n", 454 | " ] + timedelta(minutes=jitter_minutes)\n", 455 | " adjusted = True\n", 456 | "\n", 457 | " iteration += 1\n", 458 | " return df" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": { 464 | "application/vnd.databricks.v1+cell": { 465 | "cellMetadata": {}, 466 | "inputWidgets": {}, 467 | "nuid": "3b24f2a5-adb0-49f2-aeb2-6f5dde794e09", 468 | "showTitle": false, 469 | "title": "" 470 | } 471 | }, 472 | "source": [ 473 | "## Helper Function\n", 474 | "#### Used to generate X axis tick values" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": { 481 | "application/vnd.databricks.v1+cell": { 482 | "cellMetadata": {}, 483 | "inputWidgets": {}, 484 | "nuid": "e9042ec1-ea36-4265-a286-b12fd44f49e6", 485 | "showTitle": false, 486 | "title": "" 487 | } 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "def generateXAxisTickTexts() -> list:\n", 492 | " \"\"\"Helper function used to generate x axis tick values\"\"\"\n", 493 | " temp = list(range(1, 13)) + list(range(1, 13)) # 12 hour clock entries\n", 494 | " temp = temp[-1:] + temp[:-1] # right shifting\n", 495 | " for idx in range(len(temp)): # filling the AM/PM value as its a 12 hour format\n", 496 | " if idx < len(temp) // 2:\n", 497 | " temp[idx] = f\"{temp[idx]} AM\"\n", 498 | " else:\n", 499 | " temp[idx] = f\"{temp[idx]} PM\"\n", 500 | " return temp" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "application/vnd.databricks.v1+cell": { 507 | "cellMetadata": {}, 508 | "inputWidgets": {}, 509 | "nuid": "ead81d1a-8a7e-45ff-a444-22fc0ae1cc1d", 510 | "showTitle": false, 511 | "title": "" 512 | } 513 | }, 514 | "source": [ 515 | "## Plot the all the result\n" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "application/vnd.databricks.v1+cell": { 523 | "cellMetadata": { 524 | "byteLimit": 2048000, 525 | "rowLimit": 10000 526 | }, 527 | "inputWidgets": {}, 528 | "nuid": "4d3b675a-9a3f-41ec-8551-2900c7616383", 529 | "showTitle": false, 530 | "title": "" 531 | } 532 
| }, 533 | "outputs": [], 534 | "source": [ 535 | "# Adjust the plot dimensions here\n", 536 | "PLOT_HEIGHT = 700\n", 537 | "PLOT_WIDTH = 2000\n", 538 | "POINT_SIZE = 15\n", 539 | "\n", 540 | "events = [\n", 541 | " {\n", 542 | " \"name\": job_info[\"name\"],\n", 543 | " \"start_datetime\": job_info[\"next_scheduled_run\"],\n", 544 | " \"workflow_category\": job_info[\"workflow_category\"],\n", 545 | " }\n", 546 | " for job_info in jobs.values()\n", 547 | "]\n", 548 | "\n", 549 | "df = pd.DataFrame(events)\n", 550 | "\n", 551 | "df[\"start_datetime\"] = pd.to_datetime(df[\"start_datetime\"])\n", 552 | "\n", 553 | "# Sort DataFrame by 'start_datetime'\n", 554 | "df.sort_values(by=\"start_datetime\", inplace=True)\n", 555 | "\n", 556 | "# jitter closeby points\n", 557 | "df = jitterPoints(df)\n", 558 | "\n", 559 | "\n", 560 | "# Increase the size of all points by adjusting the marker size\n", 561 | "point_size = POINT_SIZE # Adjust the size as needed\n", 562 | "\n", 563 | "# Create an interactive scatter plot using Plotly Express\n", 564 | "fig = px.scatter(\n", 565 | " df,\n", 566 | " x=df[\"start_datetime\"].dt.hour\n", 567 | " + df[\"start_datetime\"].dt.minute / 60\n", 568 | " + df[\"start_datetime\"].dt.second / 3600,\n", 569 | " y=df[\"start_datetime\"].dt.strftime(\"%Y/%m/%d\"),\n", 570 | " # y= df['start_datetime'].dt.strftime('%d-%m-%y'),\n", 571 | " color=\"workflow_category\", # Color points by 'workflow_cateogry' column\n", 572 | " hover_name=\"name\", # Display event name on hover\n", 573 | " labels={\"x\": \"Time of Day (12-hour format)\", \"y\": \"Date\"},\n", 574 | " title=f\"Workflow's first run since {start_date.strftime('%Y-%m-%d')}\",\n", 575 | " template=\"plotly_white\",\n", 576 | ")\n", 577 | "\n", 578 | "\n", 579 | "# Customize the appearance of the plot\n", 580 | "fig.update_layout(\n", 581 | " xaxis=dict(\n", 582 | " tickmode=\"array\",\n", 583 | " tickvals=list(range(1, 25)),\n", 584 | " ticktext=generateXAxisTickTexts(),\n", 585 | " ),\n", 586 | " yaxis=dict(\n", 587 | " tickmode=\"array\",\n", 588 | " tickvals=list(\n", 589 | " range(\n", 590 | " 0,\n", 591 | " int((df[\"start_datetime\"].iloc[-1] - df[\"start_datetime\"].iloc[0]).days)\n", 592 | " + 10,\n", 593 | " )\n", 594 | " ),\n", 595 | " ),\n", 596 | " showlegend=True,\n", 597 | " legend_title_text=\"Workflow Category\",\n", 598 | " height=PLOT_HEIGHT, # Height of the plot\n", 599 | " width=PLOT_WIDTH, # Width of the plot\n", 600 | ")\n", 601 | "\n", 602 | "# Increase the marker size for all points\n", 603 | "fig.update_traces(marker=dict(size=point_size))\n", 604 | "\n", 605 | "# Show the interactive plot\n", 606 | "fig.show()" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": { 612 | "application/vnd.databricks.v1+cell": { 613 | "cellMetadata": {}, 614 | "inputWidgets": {}, 615 | "nuid": "a22a7529-72a6-4d56-b183-535e834cb2b6", 616 | "showTitle": false, 617 | "title": "" 618 | } 619 | }, 620 | "source": [ 621 | "## Calculate all the scheduled runs \n", 622 | "#### using `start_date` and `end_data` we calculate all the scheduled runs within the data range\n", 623 | "#### Using `cron-schedule-triggers` we calculate all the next scheduled runs since the mentioned `start_date` " 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": { 630 | "application/vnd.databricks.v1+cell": { 631 | "cellMetadata": { 632 | "byteLimit": 2048000, 633 | "rowLimit": 10000 634 | }, 635 | "inputWidgets": {}, 636 | "nuid": 
"801365b4-09b4-4654-bd1d-018f3b38ae4b", 637 | "showTitle": false, 638 | "title": "" 639 | } 640 | }, 641 | "outputs": [], 642 | "source": [ 643 | "all_scheduled_runs = []\n", 644 | "for job_id, job_info in jobs.items():\n", 645 | " cron_expression = job_info[\"quartz_cron_expression\"]\n", 646 | "\n", 647 | " cron_obj = QuartzCron(\n", 648 | " schedule_string=cron_expression,\n", 649 | " start_date=start_date,\n", 650 | " )\n", 651 | "\n", 652 | " next_scheduled_run = cron_obj.next_trigger(isoformat=False)\n", 653 | " runs = []\n", 654 | " while next_scheduled_run <= end_date:\n", 655 | " runs.append(next_scheduled_run)\n", 656 | " next_scheduled_run = cron_obj.next_trigger(isoformat=False)\n", 657 | "\n", 658 | " for run in runs:\n", 659 | " all_scheduled_runs.append(\n", 660 | " {\n", 661 | " \"name\": jobs[job_id][\"name\"],\n", 662 | " \"start_datetime\": run,\n", 663 | " \"workflow_category\": jobs[job_id][\"workflow_category\"],\n", 664 | " }\n", 665 | " )" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": { 671 | "application/vnd.databricks.v1+cell": { 672 | "cellMetadata": {}, 673 | "inputWidgets": {}, 674 | "nuid": "4bdc6843-0f7b-472c-a1db-7398497399ea", 675 | "showTitle": false, 676 | "title": "" 677 | } 678 | }, 679 | "source": [ 680 | "## Plot the result" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": { 687 | "application/vnd.databricks.v1+cell": { 688 | "cellMetadata": { 689 | "byteLimit": 2048000, 690 | "rowLimit": 10000 691 | }, 692 | "inputWidgets": {}, 693 | "nuid": "da04d5ee-cf9d-441c-a1d2-95bab6a46eed", 694 | "showTitle": false, 695 | "title": "" 696 | } 697 | }, 698 | "outputs": [], 699 | "source": [ 700 | "# Adjust the plot dimensions here\n", 701 | "PLOT_HEIGHT = 700\n", 702 | "PLOT_WIDTH = 2000\n", 703 | "POINT_SIZE = 15\n", 704 | "\n", 705 | "\n", 706 | "df = pd.DataFrame(all_scheduled_runs)\n", 707 | "\n", 708 | "df[\"start_datetime\"] = pd.to_datetime(df[\"start_datetime\"])\n", 709 | "\n", 710 | "# Sort DataFrame by 'start_datetime'\n", 711 | "df.sort_values(by=\"start_datetime\", inplace=True)\n", 712 | "\n", 713 | "# jitter closeby points\n", 714 | "df = jitterPoints(df)\n", 715 | "\n", 716 | "# Increase the size of all points by adjusting the marker size\n", 717 | "point_size = POINT_SIZE # Adjust the size as needed\n", 718 | "\n", 719 | "# Create an interactive scatter plot using Plotly Express\n", 720 | "fig = px.scatter(\n", 721 | " df,\n", 722 | " x=df[\"start_datetime\"].dt.hour\n", 723 | " + df[\"start_datetime\"].dt.minute / 60\n", 724 | " + df[\"start_datetime\"].dt.second / 3600,\n", 725 | " y=df[\"start_datetime\"].dt.strftime(\"%Y/%m/%d\"),\n", 726 | " color=\"workflow_category\", # Color points by 'workflow_category' column\n", 727 | " hover_name=\"name\", # Display event name on hover\n", 728 | " labels={\"x\": \"Time of Day (12-hour format)\", \"y\": \"Date\"},\n", 729 | " title=f\"All Workflow runs scheduled from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\",\n", 730 | " template=\"plotly_white\",\n", 731 | ")\n", 732 | "\n", 733 | "# Customize the appearance of the plot\n", 734 | "fig.update_layout(\n", 735 | " xaxis=dict(\n", 736 | " tickmode=\"array\",\n", 737 | " tickvals=list(range(1, 25)),\n", 738 | " ticktext=generateXAxisTickTexts(),\n", 739 | " ),\n", 740 | " yaxis=dict(\n", 741 | " tickmode=\"array\",\n", 742 | " tickvals=list(\n", 743 | " range(\n", 744 | " 0,\n", 745 | " int((df[\"start_datetime\"].iloc[-1] - 
df[\"start_datetime\"].iloc[0]).days)\n", 746 | " + 10,\n", 747 | " )\n", 748 | " ),\n", 749 | " ),\n", 750 | " showlegend=True,\n", 751 | " legend_title_text=\"Workflow category\",\n", 752 | " height=PLOT_HEIGHT, # Height of the plot\n", 753 | " width=PLOT_WIDTH, # Width of the plot\n", 754 | ")\n", 755 | "\n", 756 | "# Increase the marker size for all points\n", 757 | "fig.update_traces(marker=dict(size=point_size))\n", 758 | "\n", 759 | "# Show the interactive plot\n", 760 | "fig.show()" 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "metadata": { 766 | "application/vnd.databricks.v1+cell": { 767 | "cellMetadata": {}, 768 | "inputWidgets": {}, 769 | "nuid": "bb306f7b-fa82-4949-a4de-b1f77e35d015", 770 | "showTitle": false, 771 | "title": "" 772 | } 773 | }, 774 | "source": [ 775 | "## List workflow runs\n", 776 | "#### Fetch all workflow runs that have taken place since the mentioned start date. Making sure to parse the necessary info\n", 777 | "API Docs\n", 778 | "\n", 779 | "\n" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": { 786 | "application/vnd.databricks.v1+cell": { 787 | "cellMetadata": { 788 | "byteLimit": 2048000, 789 | "rowLimit": 10000 790 | }, 791 | "inputWidgets": {}, 792 | "nuid": "26076b8c-d7d9-4ac4-b398-2f1a587801e8", 793 | "showTitle": false, 794 | "title": "" 795 | } 796 | }, 797 | "outputs": [], 798 | "source": [ 799 | "all_runs_info = []\n", 800 | "\n", 801 | "\n", 802 | "def getAllRuns(list_runs_url: int, workspace_token: str) -> None:\n", 803 | " \"\"\"\n", 804 | " Fetches all the run and metadata about a given workflow.\n", 805 | " input:\n", 806 | " lists_jobs_url [str]: Databricks API used to fetch all the runs belonging to a given job.\n", 807 | " workspace_token[str]: Databricks workspace token from the widget, needed for authorization.\n", 808 | " output:\n", 809 | " None\n", 810 | " \"\"\"\n", 811 | "\n", 812 | " response = requests.get(\n", 813 | " list_runs_url,\n", 814 | " headers=headers,\n", 815 | " )\n", 816 | " assert response.status_code == 200\n", 817 | "\n", 818 | " response_data = response.json()\n", 819 | " pattern = r\"job_id=([\\w-]+)\"\n", 820 | " matched = re.search(pattern, list_runs_url)\n", 821 | " job_id = int(matched.group(1))\n", 822 | "\n", 823 | " if \"runs\" in response_data:\n", 824 | " for run_info in response_data[\"runs\"]:\n", 825 | " if (\n", 826 | " \"start_time\" in run_info\n", 827 | " and \"end_time\" in run_info\n", 828 | " and run_info[\"end_time\"]\n", 829 | " ):\n", 830 | " all_runs_info.append(\n", 831 | " {\n", 832 | " \"Task\": jobs[job_id][\"name\"],\n", 833 | " \"Start\": datetime.datetime.fromtimestamp(\n", 834 | " run_info[\"start_time\"] / 1000\n", 835 | " ),\n", 836 | " \"Finish\": datetime.datetime.fromtimestamp(\n", 837 | " run_info[\"end_time\"] / 1000\n", 838 | " ),\n", 839 | " \"Duration\": (\n", 840 | " datetime.datetime.fromtimestamp(run_info[\"end_time\"] / 1000)\n", 841 | " - datetime.datetime.fromtimestamp(\n", 842 | " run_info[\"start_time\"] / 1000\n", 843 | " )\n", 844 | " ).total_seconds()\n", 845 | " / 3600,\n", 846 | " \"workflow_category\": jobs[job_id][\"workflow_category\"],\n", 847 | " }\n", 848 | " )\n", 849 | "\n", 850 | " paginate(\n", 851 | " response_data.get(\"has_more\", False),\n", 852 | " response_data.get(\"next_page_token\"),\n", 853 | " list_runs_url,\n", 854 | " workspace_token,\n", 855 | " getAllRuns,\n", 856 | " )\n", 857 | "\n", 858 | "\n", 859 | "job_ids = list(jobs.keys())\n", 860 | "\n", 861 | 
"list_runs_urls = [\n", 862 | " databricks_url\n", 863 | " + \"/api/2.1/jobs/runs/list\"\n", 864 | " + f\"?job_id={job_id}\"\n", 865 | " + f\"&limit={query_params.get('LIST_RUNS_LIMIT')}\"\n", 866 | " + f\"&expand_tasks={query_params.get('EXPAND_RUNS')}\"\n", 867 | " + f\"&start_time_from={start_date.timestamp()*1000}\"\n", 868 | " for job_id in job_ids\n", 869 | "]\n", 870 | "\n", 871 | "for url in list_runs_urls:\n", 872 | " getAllRuns(url, databricks_workspace_token)" 873 | ] 874 | }, 875 | { 876 | "cell_type": "markdown", 877 | "metadata": { 878 | "application/vnd.databricks.v1+cell": { 879 | "cellMetadata": {}, 880 | "inputWidgets": {}, 881 | "nuid": "d88250c7-78d4-4555-97ff-fcccb17aabc0", 882 | "showTitle": false, 883 | "title": "" 884 | } 885 | }, 886 | "source": [ 887 | "## Plot the result" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": null, 893 | "metadata": { 894 | "application/vnd.databricks.v1+cell": { 895 | "cellMetadata": { 896 | "byteLimit": 2048000, 897 | "rowLimit": 10000 898 | }, 899 | "inputWidgets": {}, 900 | "nuid": "73b8e1b9-01d4-4a76-a746-2e5cdd67315b", 901 | "showTitle": false, 902 | "title": "" 903 | } 904 | }, 905 | "outputs": [], 906 | "source": [ 907 | "# Adjust accordingly\n", 908 | "PLOT_HEIGHT = 1500\n", 909 | "PLOT_WIDTH = 2000\n", 910 | "\n", 911 | "runs_df = pd.DataFrame(all_runs_info)\n", 912 | "\n", 913 | "runs_df[\"Start\"] = pd.to_datetime(runs_df[\"Start\"])\n", 914 | "runs_df[\"Finish\"] = pd.to_datetime(runs_df[\"Finish\"])\n", 915 | "\n", 916 | "runs_df[\"Duration\"] = (\n", 917 | " runs_df[\"Finish\"] - runs_df[\"Start\"]\n", 918 | ").dt.total_seconds() / 3600 # Duration in hours\n", 919 | "\n", 920 | "# Create a new column 'Day' representing the day for each task\n", 921 | "runs_df[\"Day\"] = runs_df[\"Start\"].dt.date\n", 922 | "runs_df.head()\n", 923 | "\n", 924 | "# Extract task, start, and end dates\n", 925 | "tasks = runs_df[\"Task\"].tolist()\n", 926 | "start_dates = runs_df[\"Start\"].tolist()\n", 927 | "end_dates = runs_df[\"Finish\"].tolist()\n", 928 | "\n", 929 | "# Create the Gantt chart\n", 930 | "fig = ff.create_gantt(\n", 931 | " runs_df,\n", 932 | " title=\"Task Duration Gantt Chart\",\n", 933 | ")\n", 934 | "\n", 935 | "fig.update_layout(\n", 936 | " height=PLOT_HEIGHT,\n", 937 | " width=PLOT_WIDTH,\n", 938 | " plot_bgcolor=\"white\",\n", 939 | " paper_bgcolor=\"white\",\n", 940 | " yaxis=dict(showgrid=True, gridcolor=\"lightgray\"),\n", 941 | " xaxis=dict(showgrid=True, gridcolor=\"lightgray\"),\n", 942 | ")\n", 943 | "\n", 944 | "fig.show()" 945 | ] 946 | } 947 | ], 948 | "metadata": { 949 | "application/vnd.databricks.v1+notebook": { 950 | "dashboards": [ 951 | { 952 | "elements": [ 953 | { 954 | "dashboardResultIndex": 0, 955 | "elementNUID": "da04d5ee-cf9d-441c-a1d2-95bab6a46eed", 956 | "elementType": "command", 957 | "guid": "1806ca8a-35e7-4bde-b268-8ae24f5a9614", 958 | "options": null, 959 | "position": { 960 | "height": 8, 961 | "width": 24, 962 | "x": 0, 963 | "y": 8, 964 | "z": null 965 | }, 966 | "resultIndex": null 967 | }, 968 | { 969 | "dashboardResultIndex": 0, 970 | "elementNUID": "73b8e1b9-01d4-4a76-a746-2e5cdd67315b", 971 | "elementType": "command", 972 | "guid": "1c1a7f68-0a81-454d-b94b-6e00aa1fdda2", 973 | "options": null, 974 | "position": { 975 | "height": 17, 976 | "width": 24, 977 | "x": 0, 978 | "y": 16, 979 | "z": null 980 | }, 981 | "resultIndex": null 982 | }, 983 | { 984 | "dashboardResultIndex": 0, 985 | "elementNUID": "4d3b675a-9a3f-41ec-8551-2900c7616383", 986 | 
"elementType": "command", 987 | "guid": "3badc786-a3b5-43a9-83bc-61236ea1cd0d", 988 | "options": { 989 | "autoScaleImg": false, 990 | "scale": 0, 991 | "showRunButton": false, 992 | "showTitle": false, 993 | "titleAlign": "center" 994 | }, 995 | "position": { 996 | "height": 8, 997 | "width": 24, 998 | "x": 0, 999 | "y": 0, 1000 | "z": null 1001 | }, 1002 | "resultIndex": null 1003 | } 1004 | ], 1005 | "globalVars": {}, 1006 | "guid": "", 1007 | "layoutOption": { 1008 | "grid": true, 1009 | "stack": true 1010 | }, 1011 | "nuid": "89804740-c7b6-44b4-9c72-2e1c14be2084", 1012 | "origId": 3789653585954506, 1013 | "title": "Schedule Viz", 1014 | "version": "DashboardViewV1", 1015 | "width": 1440 1016 | } 1017 | ], 1018 | "language": "python", 1019 | "notebookMetadata": { 1020 | "mostRecentlyExecutedCommandWithImplicitDF": { 1021 | "commandId": 1634724413475231, 1022 | "dataframes": [ 1023 | "_sqldf" 1024 | ] 1025 | }, 1026 | "pythonIndentUnit": 4, 1027 | "widgetLayout": [ 1028 | { 1029 | "breakBefore": false, 1030 | "name": "databricks_url", 1031 | "width": 229 1032 | }, 1033 | { 1034 | "breakBefore": false, 1035 | "name": "databricks_workspace_token", 1036 | "width": 229 1037 | }, 1038 | { 1039 | "breakBefore": false, 1040 | "name": "start_date", 1041 | "width": 229 1042 | }, 1043 | { 1044 | "breakBefore": false, 1045 | "name": "end_date", 1046 | "width": 229 1047 | } 1048 | ] 1049 | }, 1050 | "notebookName": "workflow_calender", 1051 | "widgets": { 1052 | "databricks_url": { 1053 | "currentValue": "", 1054 | "nuid": "1252ccd1-8501-4afb-96d1-fd2d12a60852", 1055 | "widgetInfo": { 1056 | "defaultValue": "", 1057 | "label": null, 1058 | "name": "databricks_url", 1059 | "options": { 1060 | "validationRegex": null, 1061 | "widgetType": "text" 1062 | }, 1063 | "widgetType": "text" 1064 | } 1065 | }, 1066 | "databricks_workspace_token": { 1067 | "currentValue": "", 1068 | "nuid": "7944ddb4-88e5-4041-8773-64bf5327fd25", 1069 | "widgetInfo": { 1070 | "defaultValue": "", 1071 | "label": null, 1072 | "name": "databricks_workspace_token", 1073 | "options": { 1074 | "validationRegex": null, 1075 | "widgetType": "text" 1076 | }, 1077 | "widgetType": "text" 1078 | } 1079 | }, 1080 | "end_date": { 1081 | "currentValue": "2023-10-14", 1082 | "nuid": "dc84215a-1528-4af8-83de-d407d7bcc6ad", 1083 | "widgetInfo": { 1084 | "defaultValue": "2023-11-05", 1085 | "label": null, 1086 | "name": "end_date", 1087 | "options": { 1088 | "validationRegex": null, 1089 | "widgetType": "text" 1090 | }, 1091 | "widgetType": "text" 1092 | } 1093 | }, 1094 | "start_date": { 1095 | "currentValue": "2023-10-08", 1096 | "nuid": "a254d69f-7ac4-4911-b323-5f60de54125b", 1097 | "widgetInfo": { 1098 | "defaultValue": "2023-10-01", 1099 | "label": null, 1100 | "name": "start_date", 1101 | "options": { 1102 | "validationRegex": null, 1103 | "widgetType": "text" 1104 | }, 1105 | "widgetType": "text" 1106 | } 1107 | } 1108 | } 1109 | }, 1110 | "kernelspec": { 1111 | "display_name": "Python 3", 1112 | "language": "python", 1113 | "name": "python3" 1114 | }, 1115 | "language_info": { 1116 | "codemirror_mode": { 1117 | "name": "ipython", 1118 | "version": 3 1119 | }, 1120 | "file_extension": ".py", 1121 | "mimetype": "text/x-python", 1122 | "name": "python", 1123 | "nbconvert_exporter": "python", 1124 | "pygments_lexer": "ipython3", 1125 | "version": "3.10.11" 1126 | } 1127 | }, 1128 | "nbformat": 4, 1129 | "nbformat_minor": 0 1130 | } 1131 | -------------------------------------------------------------------------------- 
/notebooks/workflow_config_exporter/README.md: -------------------------------------------------------------------------------- 1 |

# Backup your Databricks Workflows 🗃 2 | 3 | Databricks 4 |

5 | 6 | ## Introduction 7 | 8 | This notebook is used to fetch the jobs config from a workspace and then write it to disk thus helping one save a backup of their workflow config information. While such a backup can also be created using Terraform or other Infrastructure-as-code providers, this approach provides it using the vanilla JSON of the Databricks REST API. 9 | 10 | ## Use Cases 11 | 12 | Areas where such a notebook may be helpful: 13 | 14 | 1. Backup of workflow config information in case of restoration from Databricks REST API 15 | 2. Version controlling of workflow config information 16 | 17 | --- 18 | 19 | --- 20 | See more details in the notebook (ipynb) 21 | -------------------------------------------------------------------------------- /notebooks/workflow_config_exporter/assets/example_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dotlas/databricks_helpers/6b9a6f1eb5eb2c1d57ce61f7d3ae5443fba39b9b/notebooks/workflow_config_exporter/assets/example_config.png -------------------------------------------------------------------------------- /notebooks/workflow_config_exporter/workflow_config_exporter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": { 8 | "byteLimit": 2048000, 9 | "rowLimit": 10000 10 | }, 11 | "inputWidgets": {}, 12 | "nuid": "4686ad81-9fd0-4c93-9a48-b11576dc4edf", 13 | "showTitle": false, 14 | "tableResultSettingsMap": {}, 15 | "title": "" 16 | } 17 | }, 18 | "source": [ 19 | "# Backup your Databricks Workflows 🗃\n", 20 | "\n", 21 | "## Requirements\n", 22 | "\n", 23 | "### Databricks\n", 24 | "\n", 25 | "* At least one runnable cluster within the workspace\n", 26 | "\n", 27 | "\n", 28 | "### Parameters\n", 29 | "\n", 30 | "| Parameter Name | Parameter Description | Example Value |\n", 31 | "| --- | --- | --- |\n", 32 | "| `backup_file_path` | The file path (prefix) to the destination where the backup file will be stored. **Don't include filename in path**. | `s3://my-databricks-backups/jobs` |\n", 33 | "\n", 34 | "\n", 35 | "### Steps\n", 36 | "\n", 37 | "#### Fetch Job Configurations\n", 38 | "\n", 39 | "We fetch all the workflows present in your workspace, each fetched workflow config will also contain the individual task config present in the workflow and their respective job cluster configs. [Databricks API documentation](https://docs.databricks.com/api/workspace/jobs/list). \n", 40 | "\n", 41 | "#### Parse Information \n", 42 | "\n", 43 | "In this step we parse the obtained config info. The main thing to keep in mind is that the cluster config contains some fields which are populated after the cluster is initialized but will be fetched anyway from step 1, we need to remove this field or else when we use the same config to create the workflow later it will throw an error. You can also add any custom logic here. For example: You can include webhook notification ID to be associated with a workflow you like, You can also associate an existing all-purpose-compute to a workflow that you want, etc. \n", 44 | "\n", 45 | "#### Save Configuration to JSON 💾\n", 46 | "\n", 47 | "We later save the config to file, if you have a mounted s3 bucket or an azure data lake storage you can direcly specify the path as dbutils will take care of the rest. 
If you are running the notebook locally then you will need to change the code and use python's inbuilt `open` function to get the task done." 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "application/vnd.databricks.v1+cell": { 54 | "cellMetadata": { 55 | "byteLimit": 2048000, 56 | "rowLimit": 10000 57 | }, 58 | "inputWidgets": {}, 59 | "nuid": "f8b80921-ff93-4b60-8b9d-ad26c4b909c8", 60 | "showTitle": false, 61 | "tableResultSettingsMap": {}, 62 | "title": "" 63 | } 64 | }, 65 | "source": [ 66 | "### Imports" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 0, 72 | "metadata": { 73 | "application/vnd.databricks.v1+cell": { 74 | "cellMetadata": { 75 | "byteLimit": 2048000, 76 | "rowLimit": 10000 77 | }, 78 | "inputWidgets": {}, 79 | "nuid": "fb9a509f-a4c5-4d06-9d93-1a52c0be1322", 80 | "showTitle": false, 81 | "tableResultSettingsMap": {}, 82 | "title": "" 83 | } 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "from collections import defaultdict\n", 88 | "from datetime import datetime\n", 89 | "import json\n", 90 | "import re\n", 91 | "from typing import Optional, Callable\n", 92 | "\n", 93 | "from databricks.sdk import WorkspaceClient\n", 94 | "from databricks.sdk.service.jobs import JobSettings" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "application/vnd.databricks.v1+cell": { 101 | "cellMetadata": { 102 | "byteLimit": 2048000, 103 | "rowLimit": 10000 104 | }, 105 | "inputWidgets": {}, 106 | "nuid": "03f51bef-dc97-4b08-bf45-c49e11db1076", 107 | "showTitle": false, 108 | "tableResultSettingsMap": {}, 109 | "title": "" 110 | } 111 | }, 112 | "source": [ 113 | "## Inputs\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 0, 119 | "metadata": { 120 | "application/vnd.databricks.v1+cell": { 121 | "cellMetadata": { 122 | "byteLimit": 2048000, 123 | "rowLimit": 10000 124 | }, 125 | "inputWidgets": {}, 126 | "nuid": "0ba8b199-65cc-4dfe-8926-dbc8f28a38b9", 127 | "showTitle": false, 128 | "tableResultSettingsMap": {}, 129 | "title": "" 130 | } 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "dbutils.widgets.removeAll()\n", 135 | "dbutils.widgets.text(\"backup_file_path\", \"\")\n", 136 | "backup_file_path: str = getArgument(\"backup_file_path\")\n", 137 | "\n", 138 | "w = WorkspaceClient()\n", 139 | "\n", 140 | "query_params = {\n", 141 | " \"LIST_JOBS_LIMIT\": 100, # max limit\n", 142 | " \"EXPAND_TASKS\": \"true\", # provides the complete config info for each job\n", 143 | "}" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "application/vnd.databricks.v1+cell": { 150 | "cellMetadata": { 151 | "byteLimit": 2048000, 152 | "rowLimit": 10000 153 | }, 154 | "inputWidgets": {}, 155 | "nuid": "004273ff-e821-415c-b57e-74eccd0b2253", 156 | "showTitle": false, 157 | "tableResultSettingsMap": {}, 158 | "title": "" 159 | } 160 | }, 161 | "source": [ 162 | "## List workflows \n", 163 | "\n", 164 | "Fetches all workflows in current workspace and its respective configs" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 0, 170 | "metadata": { 171 | "application/vnd.databricks.v1+cell": { 172 | "cellMetadata": { 173 | "byteLimit": 2048000, 174 | "rowLimit": 10000 175 | }, 176 | "inputWidgets": {}, 177 | "nuid": "1b13f2e7-238b-4a11-9c78-acab6c09f479", 178 | "showTitle": false, 179 | "tableResultSettingsMap": {}, 180 | "title": "" 181 | } 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "jobs: dict[int, dict] = {}\n", 186 | 
"\n", 187 | "# Use the SDK's built-in paginator\n", 188 | "for job in w.jobs.list(expand_tasks=query_params[\"EXPAND_TASKS\"], limit=query_params[\"LIST_JOBS_LIMIT\"]):\n", 189 | " jobs[job.job_id] = job.settings.as_dict()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "application/vnd.databricks.v1+cell": { 196 | "cellMetadata": { 197 | "byteLimit": 2048000, 198 | "rowLimit": 10000 199 | }, 200 | "inputWidgets": {}, 201 | "nuid": "9ac4ea31-c68f-4e86-9208-403ae6023b08", 202 | "showTitle": false, 203 | "tableResultSettingsMap": {}, 204 | "title": "" 205 | } 206 | }, 207 | "source": [ 208 | "## Parse the fetched data\n", 209 | "\n", 210 | "This is needed because the cluster config info in each task contains some current workspace specific properties, which are populated after cluster initialization, thus it needs to be removed." 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 0, 216 | "metadata": { 217 | "application/vnd.databricks.v1+cell": { 218 | "cellMetadata": { 219 | "byteLimit": 2048000, 220 | "rowLimit": 10000 221 | }, 222 | "inputWidgets": {}, 223 | "nuid": "e48c33f2-3271-4f1b-a80e-f79ab33535c3", 224 | "showTitle": false, 225 | "tableResultSettingsMap": {}, 226 | "title": "" 227 | } 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "def parse_jobs(job_info: JobSettings) -> dict:\n", 232 | " \"\"\"\n", 233 | " input:\n", 234 | " job_info [JobSettings]: JobSettings object from the SDK.\n", 235 | " output:\n", 236 | " dict : Parsed dictionary.\n", 237 | " \"\"\"\n", 238 | " job_dict = job_info.as_dict()\n", 239 | "\n", 240 | " for cluster_info in job_dict.get(\"job_clusters\", []):\n", 241 | " new_cluster = cluster_info.get(\"new_cluster\", {})\n", 242 | " if \"aws_attributes\" in new_cluster:\n", 243 | " new_cluster.pop(\"aws_attributes\")\n", 244 | "\n", 245 | " return job_dict" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 0, 251 | "metadata": { 252 | "application/vnd.databricks.v1+cell": { 253 | "cellMetadata": { 254 | "byteLimit": 2048000, 255 | "rowLimit": 10000 256 | }, 257 | "inputWidgets": {}, 258 | "nuid": "93df3fd2-b654-419f-a0cf-acac81aedd87", 259 | "showTitle": false, 260 | "tableResultSettingsMap": {}, 261 | "title": "" 262 | } 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "for job_id, job_settings in jobs.items():\n", 267 | " parsed = parse_jobs(JobSettings.from_dict(job_settings))\n", 268 | " jobs[job_id] = parsed" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "application/vnd.databricks.v1+cell": { 275 | "cellMetadata": { 276 | "byteLimit": 2048000, 277 | "rowLimit": 10000 278 | }, 279 | "inputWidgets": {}, 280 | "nuid": "84940d82-3c43-4af8-a5a8-54e81712dd31", 281 | "showTitle": false, 282 | "tableResultSettingsMap": {}, 283 | "title": "" 284 | } 285 | }, 286 | "source": [ 287 | "\n", 288 | "## Backup Job Config\n", 289 | "\n", 290 | "Write the obtained config json to disk of your choice" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 0, 296 | "metadata": { 297 | "application/vnd.databricks.v1+cell": { 298 | "cellMetadata": { 299 | "byteLimit": 2048000, 300 | "rowLimit": 10000 301 | }, 302 | "inputWidgets": {}, 303 | "nuid": "80000619-68c6-4d1f-a234-6c459dc8463c", 304 | "showTitle": false, 305 | "tableResultSettingsMap": {}, 306 | "title": "" 307 | } 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "assert len(jobs.keys()) > 1, \"No Jobs Found\"" 312 | ] 313 | }, 314 | { 315 | "cell_type": 
"code", 316 | "execution_count": 0, 317 | "metadata": { 318 | "application/vnd.databricks.v1+cell": { 319 | "cellMetadata": { 320 | "byteLimit": 2048000, 321 | "rowLimit": 10000 322 | }, 323 | "inputWidgets": {}, 324 | "nuid": "fe85be21-6d6c-4857-bbf7-bfe52367f30c", 325 | "showTitle": false, 326 | "tableResultSettingsMap": {}, 327 | "title": "" 328 | } 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "backup_file_path_modded: str = backup_file_path + \"/\" + str(datetime.utcnow().date()).replace(\"-\",\"\") + \".json\"\n", 333 | "backup_file_path_modded" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 0, 339 | "metadata": { 340 | "application/vnd.databricks.v1+cell": { 341 | "cellMetadata": { 342 | "byteLimit": 2048000, 343 | "rowLimit": 10000 344 | }, 345 | "inputWidgets": {}, 346 | "nuid": "14159c89-9c1d-4117-bcd6-b36766d869bf", 347 | "showTitle": false, 348 | "tableResultSettingsMap": {}, 349 | "title": "" 350 | } 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "store_flag = None\n", 355 | "\n", 356 | "store_flag: bool = dbutils.fs.put(\n", 357 | " backup_file_path_modded, json.dumps(jobs), overwrite=False\n", 358 | ")\n", 359 | "\n", 360 | "if not store_flag or store_flag is None:\n", 361 | " raise ValueError(\"Unable to Write Jobs Backup\")" 362 | ] 363 | } 364 | ], 365 | "metadata": { 366 | "application/vnd.databricks.v1+notebook": { 367 | "computePreferences": null, 368 | "dashboards": [], 369 | "environmentMetadata": null, 370 | "inputWidgetPreferences": null, 371 | "language": "python", 372 | "notebookMetadata": { 373 | "pythonIndentUnit": 4 374 | }, 375 | "notebookName": "workflow_config_exporter", 376 | "widgets": { 377 | "backup_file_path": { 378 | "currentValue": "s3://dotlas-databricks/jobs", 379 | "nuid": "cbe01358-1720-400b-b9a7-6a1642e1515a", 380 | "typedWidgetInfo": { 381 | "autoCreated": false, 382 | "defaultValue": "", 383 | "label": null, 384 | "name": "backup_file_path", 385 | "options": { 386 | "widgetDisplayType": "Text", 387 | "validationRegex": null 388 | }, 389 | "parameterDataType": "String" 390 | }, 391 | "widgetInfo": { 392 | "widgetType": "text", 393 | "defaultValue": "", 394 | "label": null, 395 | "name": "backup_file_path", 396 | "options": { 397 | "widgetType": "text", 398 | "autoCreated": null, 399 | "validationRegex": null 400 | } 401 | } 402 | } 403 | } 404 | }, 405 | "kernelspec": { 406 | "display_name": "env", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "name": "python" 412 | } 413 | }, 414 | "nbformat": 4, 415 | "nbformat_minor": 0 416 | } 417 | --------------------------------------------------------------------------------