├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .gitignore ├── .pylintrc ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── dev.env ├── docs ├── README.md └── images │ ├── AppInsightConnectionString.jpg │ ├── AzureResources.JPG │ ├── DatabricksNotebookExecution.JPG │ ├── DatabricksORGIDandHOSTID.JPG │ ├── DatabricksTokenGeneration.jpg │ ├── DevContainer.jpg │ ├── DockerImageLoad.jpg │ ├── InstallExtensions.jpg │ ├── MLOps_for_databricks_Solution_Acclerator_logo.JPG │ ├── OutputOfTheConfigurationStep.jpg │ ├── Overview.JPG │ ├── PipelineSteps.JPG │ ├── PowershellScreen.jpg │ ├── SecretsFileImage.jpg │ ├── SuccessfulClusterCreation.JPG │ ├── Verify_Python_Interpreter.jpg │ ├── cluster-upload-wheel.jpg │ ├── databricks-connect-pass.jpg │ ├── final.jpg │ ├── map01.png │ ├── map02.png │ ├── map03.png │ ├── map04.png │ ├── map05.png │ ├── map06.png │ ├── map07.png │ ├── pythonversion.jpg │ └── workspaceselection.jpg ├── requirements.txt ├── src ├── README.md ├── modules │ ├── acai_ml │ │ ├── __init__.py │ │ └── core.py │ ├── dbkcore │ │ ├── __init__.py │ │ ├── core.py │ │ ├── helpers.py │ │ └── requirements.txt │ ├── dbkdev │ │ ├── __init__.py │ │ ├── core.py │ │ ├── data_steps.py │ │ └── requirements.txt │ ├── dbkenv │ │ ├── __init__.py │ │ ├── core.py │ │ ├── local.py │ │ └── requirements.txt │ ├── devmaint │ │ ├── __init__.py │ │ ├── command_line.py │ │ ├── docgenerator.py │ │ └── requirements.txt │ └── tests │ │ ├── dbkcore │ │ └── test_logger.py │ │ ├── dbkenv │ │ ├── content │ │ │ ├── py_file.py │ │ │ └── unittest_notebook.py │ │ ├── test_cluster.py │ │ └── unittest_cluster.json │ │ └── pytest.ini ├── pipelines │ └── dbkframework │ │ ├── documentation.md │ │ ├── requirements.txt │ │ └── setup.py ├── setup │ ├── arm-templates │ │ ├── parameters.json │ │ └── template.json │ ├── config │ │ └── setup_config.json │ ├── configureResources.ps1 │ ├── deployResources.ps1 │ └── util │ │ ├── DBCluster-Configuration.json │ │ ├── Deploy-DBCluster.ps1 │ │ └── Deploy-DBCluster_using_CLI.ps1 └── tutorial │ ├── README.md │ ├── cluster_config.json │ ├── create_databricks_secrets.py │ ├── deploy.py │ └── scripts │ ├── __init__.py │ ├── create_cluster.py │ ├── framework_testing │ └── remote_analysis.py │ ├── install_dbkframework.py │ ├── local_config.py │ └── set_secrets.py └── workspace.code-workspace /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.166.0/containers/python-3/.devcontainer/base.Dockerfile 2 | 3 | # [Choice] Python version: 3, 3.9, 3.8, 3.7, 3.6 4 | ARG VARIANT="3" 5 | FROM mcr.microsoft.com/vscode/devcontainers/python:0-${VARIANT} 6 | 7 | # [Option] Install Node.js 8 | ARG INSTALL_NODE="true" 9 | ARG NODE_VERSION="lts/*" 10 | RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi 11 | 12 | # [Optional] Uncomment this section to install additional OS packages. 
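# The RUN block below adds the Debian "stretch" security repository because OpenJDK 8 is no longer
# available in the default package sources of newer Debian base images; openjdk-8-jdk is required by the
# databricks-connect client installed further down, and pandoc supports the pypandoc dependency used for
# documentation generation.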
13 | RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 14 | && apt-get -y install --no-install-recommends software-properties-common \ 15 | && apt-add-repository 'deb http://security.debian.org/debian-security stretch/updates main' \ 16 | && apt-get update \ 17 | && apt-get -y install --no-install-recommends openjdk-8-jdk pandoc 18 | 19 | RUN pip3 --disable-pip-version-check --no-cache-dir install databricks-connect==7.3.* 20 | 21 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. 22 | COPY requirements.txt /tmp/pip-tmp/ 23 | RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ 24 | && rm -rf /tmp/pip-tmp 25 | 26 | # [Optional] Uncomment this line to install global node packages. 27 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.166.0/containers/python-3 3 | { 4 | "name": "Python 3", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "context": "..", 8 | "args": { 9 | // Update 'VARIANT' to pick a Python version: 3, 3.6, 3.7, 3.8, 3.9 10 | "VARIANT": "3.7", 11 | // Options 12 | "INSTALL_NODE": "false", 13 | "NODE_VERSION": "lts/*" 14 | } 15 | }, 16 | 17 | // Set *default* container specific settings.json values on container create. 18 | "settings": { 19 | "terminal.integrated.shell.linux": "/bin/bash", 20 | "python.pythonPath": "/usr/local/bin/python", 21 | "python.venvPath": "/usr/local/lib/python3.7/site-packages/pyspark/jars", 22 | "python.envFile": ".env", 23 | // "python.linting.enabled": true, 24 | "python.linting.pylintEnabled": false, 25 | "python.linting.pydocstyleEnabled": true, 26 | "python.linting.flake8Enabled": true, 27 | "python.linting.flake8Args": [ 28 | "--ignore=E501, E402" 29 | ], //["--ignore=E501,E123"] 30 | "python.linting.enabled": true, 31 | "python.linting.pylamaEnabled": false, 32 | "python.linting.pylamaArgs": [ 33 | "--ignore=E501" 34 | ], 35 | "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", 36 | "python.formatting.blackPath": "/usr/local/py-utils/bin/black", 37 | "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", 38 | "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", 39 | "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", 40 | "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", 41 | "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", 42 | "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", 43 | "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint", 44 | "python.analysis.extraPaths": [ 45 | "src/modules" 46 | ], 47 | "python.testing.pytestArgs": [ 48 | "src" 49 | ], 50 | "python.testing.unittestEnabled": false, 51 | "python.testing.nosetestsEnabled": false, 52 | "python.testing.pytestEnabled": true 53 | }, 54 | 55 | // Add the IDs of extensions you want installed when the container is created. 
56 | "extensions": [ 57 | "ms-python.python", 58 | "visualstudioexptteam.vscodeintellicode", 59 | "ms-python.vscode-pylance", 60 | "ms-azuretools.vscode-docker", 61 | "ms-vscode-remote.remote-containers", 62 | "irongeek.vscode-env", 63 | "njpwerner.autodocstring" 64 | ], 65 | "runArgs": [ 66 | "--env-file", 67 | ".env" 68 | ], 69 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 70 | // "forwardPorts": [], 71 | 72 | // Use 'postCreateCommand' to run commands after the container is created. 73 | // "postCreateCommand": "pip3 install --user -r requirements.txt", 74 | 75 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 76 | "remoteUser": "vscode" 77 | } 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/** 2 | notes.md 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | venv/ 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | *.vscode 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | .DS_Store 112 | 113 | #custom 114 | config.json 115 | 116 | #appsecret.txt 117 | *appsecret.txt 118 | 119 | #DBKToken.txt 120 | *DBKtoken.txt 121 | *setup_config.json -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable=C0114, # Missing module docstring 3 | C0115, # Missing class docstring 4 | C0116, # Missing function docstring 5 | R0801, # Similar lines 6 | W0511 # TODO's 7 | 8 | [BASIC] 9 | good-names=i, j, 10 | k, v, 11 | f, 12 | ex, 13 | Run, 14 | _, 15 | df, 16 | ws, sp, 17 | X, y, X_train, X_test 18 | 19 | [FORMAT] 20 | max-line-length=120 21 | max-module-lines=1000 22 | 23 | [SIMILARITIES] 24 | ignore-comments=yes 25 | ignore-docstrings=yes 26 | ignore-imports=yes 27 | 
min-similarity-lines=4 28 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Banner](docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG) 2 | 3 | # About this repository 4 | 5 | This repository contains the Databricks development framework for delivering any Data Engineering projects, and machine learning projects based on the Azure Technologies. 6 | 7 | # Details of the accelerator 8 | 9 | The accelerator contains few of the core features of Databricks development which can be extended or reused in any implementation projects with Databricks. 10 | 11 | ![overview](docs/images/Overview.JPG) 12 | 13 | - Logging Framework using the [Opensensus Azure Monitor Exporters](https://github.com/census-instrumentation/opencensus-python/tree/master/contrib/opencensus-ext-azure) 14 | - Support for Databricks development from VS Code IDE using the [Databricks Connect](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect#visual-studio-code) feature. 15 | - continuous development with [Python Local Packaging](https://packaging.python.org/tutorials/packaging-projects/) 16 | - Implementation of the Databricks utilities in VS Code such as dbutils, notebook execution, secret handling. 17 | - Example Model file which uses the framework end to end. 
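As a quick orientation, the sketch below shows how the logging pieces in `src/modules/dbkcore` are intended to be used (with `src/modules` on the `PYTHONPATH`). The pipeline name and function are illustrative; passing an Application Insight connection string to `Log` additionally exports the telemetry to Azure, as covered later in this guide.

``` python
from dbkcore.core import Log, trace

# Instantiate the singleton logger once per run. Without a connection string the
# messages are only printed locally; in this accelerator the connection string
# normally comes from the APPI_IK variable or a Databricks secret.
Log("tutorial_pipeline")

@trace  # logs module, qualified name and elapsed time of each call
def prepare_data(rows: int = 10):
    Log.get_instance().log_info(f"Preparing {rows} rows")
    return list(range(rows))

prepare_data(rows=5)
```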
18 | 19 | 20 | # Prerequisites 21 | 22 | To successfully complete your solution, you will need to have access to and or provisioned the following: 23 | 24 | - Access to an Azure subscription 25 | - Service Principal (valid Client ID and secret ) which has the contributor permission the subscription. We are going to create the resource group using the service principal. 26 | - VS Code installed. 27 | - Docker Desktop Installed. 28 | 29 | # Create the Service Principal 30 | 31 | - [Instruction to create the service principal](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#register-an-application-with-azure-ad-and-create-a-service-principal) 32 | - [Instruction to assign role to the service principal access over the Subscription](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#assign-a-role-to-the-application). Please provide **contributor** access over the subscription. 33 | - [Instruction to Get application ID and tenant ID for the application you registered](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#get-tenant-and-app-id-values-for-signing-in) 34 | - [Instruction to create application secret](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#option-2-create-a-new-application-secret). The application Secret is needed at the later part of this setup. Please copy the **value** and store it in a notepad for now. 35 | 36 | # Getting Started 37 | 38 | The below sections provide the step by step approach to set up the solution. As part of this solution, we need the following resources to be provisioned in a resource group. 39 | 40 | 1. Azure Databricks 41 | 2. Application Insight Instance. 42 | 3. A log analytics workspace for the App Insight. 43 | 4. Azure Key Vault to store the secrets. 44 | 5. A Storage Account. 45 | 46 | ## Section 1: Docker image load in VS Code 47 | 48 | ![map01](docs/images/map01.png) 49 | 1. Clone the Repository : https://github.com/microsoft/dstoolkit-ml-ops-for-databricks/pulls 50 | 2. Install Docker Desktop. In this solution, the Visual Code uses the docker image as a remote container to run the solution. 51 | 3. Create .env file in the root folder, and keep the file blank for now. (root folder is the parent folder of the project) 52 | 4. In the repo, open the workspace. File: workspace.ode-workspace. 53 | 54 | > Once you click the file, you will get the "Open Workspace" button at right bottom corner in the code editor. Click it to open the solution into the vscode workspace. 55 | 56 | ![workspaceselection](docs/images/workspaceselection.jpg) 57 | 58 | 5. We need to connect to the [docker image as remote container in vs code](https://code.visualstudio.com/docs/remote/attach-container#_attach-to-a-docker-container). In the code repository, we have ./.devcontainer folder that has required docker image file and docker configuration file. Once we load the repo in the vscode, we generally get the prompt. Select "Reopen in Container". Otherwise we can go to the VS code command palette ( ctrl+shift+P in windows), and select the option "Remote-Containers: Rebuild and Reopen in Containers" 59 | 60 | ![DockerImageLoad](docs/images/DockerImageLoad.jpg) 61 | 62 | 6. In the background, it is going to build a docker image. We need to wait for sometime to complete build. the docker image will basically contain the a linux environment which has python 3.7 installed. 
Please have a look at the configuration file(.devcontainer\devcontainer.json) for more details. 63 | 7. Once it is loaded. we will be able to see the python interpreter is loaded successfully. Incase it does not show, we need to load the interpreter manually. To do that, click on the select python interpreter => Entire workspace => /usr/local/bin/python 64 | 65 | ![pythonversion](docs/images/pythonversion.jpg) 66 | 67 | 8. You will be prompted with installing the required extension on the right bottom corner. Install the extensions by clicking on the prompts. 68 | 69 | ![InstallExtensions](docs/images/InstallExtensions.jpg) 70 | 71 | 9. Once the steps are completed, you should be able to see the python extensions as below: 72 | 73 | ![pythonversion](docs/images/pythonversion.jpg) 74 | 75 | ## Section 2: Databricks environment creation 76 | 77 | ![map02](docs/images/map02.png) 78 | 79 | The objectives of this section are: 80 | 81 | - Create the required resources. 82 | 1. Azure Databricks 83 | 2. Application Insight Instance. 84 | 3. A log analytics workspace for the App Insight. 85 | 4. Azure Key Vault to store the secrets. 86 | 5. A Storage Account. 87 | 88 | - Create the .env file for the local development. 89 | 90 | > You don't need to create the environment again if you already had a databricks environment. You can directly create the .env file ( Section 4 ) with the details of your environment. 91 | 92 | 1. Go to **src/setup/config/setup_config.json**, and complete the json files with the values; according to your environment. The service principal should be having the contributor access over the subscription you are using. Or if you choose to create the resource group manually, or reuse an existing resource group, then it should have the contributor access on the resource group itself. 93 | 94 | > These details would be used to connect to the Azure Subscription for the resource creation. 95 | 96 | ``` json 97 | { 98 | 99 | "applicationID":"deeadfb5-27xxxaad3-9fd39049b450", 100 | "tenantID":"72f988bf-8xxxxx2d7cd011db47", 101 | "subscriptionID":"89c37dd8xxxx-1cfb98c0262e", 102 | "resourceGroupName":"AccleratorDBKMLOps2", 103 | "resourceGroupLocation":"NorthEurope" 104 | } 105 | ``` 106 | 107 | 2. create the file and provide the client ID secret in this file : **src/setup/vault/appsecret.txt** 108 | 109 | > Incase you are not able to create the file from the solution, you can directly go to the file explorer to create the file. 110 | > 111 | > NOTE: DBToken.txt will be created in the later section, please ignore it for now. 112 | 113 | At the end of the secret files creation, the folder structure will like below: 114 | 115 | ![SecretsFileImage](docs/images/SecretsFileImage.jpg) 116 | 117 | 3. Open the Powershell ISE in your local machine. We are going to run the Powershell script to create the required resources. The name of the resources are basically having a prefix to the resourcegroup name. 118 | 4. set the root path of the Powershell terminal till setup, and execute the deployResource.ps1 119 | 120 | ``` powershell 121 | cd "C:\Users\projects\New folder\MLOpsBasic-Databricks\src\setup" 122 | .\deployResources.ps1 123 | ``` 124 | 125 | > If you receive the below error, execute the command [Set-ExecutionPolicy RemoteSigned] 126 | 127 | ``` cmd 128 | >.\deployResources.ps1 : File C:\Users\projects\New 129 | folder\MLOpsBasic-Databricks\src\setup\deployResources.ps1 cannot be loaded because running scripts is disabled on this. 
130 | ``` 131 | > if you get the error module is not found, and if Powershell ISE is not able to recognize any specific Powershell command, then Install the Powershell Az Module. [Instructions](https://docs.microsoft.com/en-us/powershell/azure/install-az-ps?view=azps-6.4.0) 132 | ``` cmd 133 | Install-Module Az 134 | ``` 135 | 136 | ![PowershellScreen](docs/images/PowershellScreen.jpg) 137 | 138 | Post successful execution of the script, we can see the resources created successfully in the Azure Subscription. 139 | 140 | ![AzureResources](docs/images/AzureResources.JPG) 141 | 142 | 143 | ## Section 3: Databricks cluster creation 144 | 145 | ![map03](docs/images/map03.png) 146 | 147 | 1. To create the databricks cluster we need to have personal Access token created. Go to the Databricks workspace, and get the personal access token from the user setting, and save it in the file src/setup/vault/DBKtoken.txt 148 | 149 | ![DatabricksTokenGeneration](docs/images/DatabricksTokenGeneration.jpg) 150 | 151 | 2. Run the following command 152 | 153 | ``` cmd 154 | cd "C:\Users\projects\New folder\MLOpsBasic-Databricks\src\setup" 155 | 156 | .\configureResources.ps1 157 | ``` 158 | 159 | 3. At the end of the script execution, we will be able to see the databricks cluster has been created successfully.the config file: src\setup\util\DBCluster-Configuration.json is being used to create the cluster. 160 | 161 | ![SuccessfulClusterCreation](docs/images/SuccessfulClusterCreation.JPG) 162 | 163 | 4. Copy the output of the script and paste it to the .env file which we had created previously. Please note that the values of the variables will be different as per your environment configuration. the later section (Section 4) describes the creation of .env file in detail. 164 | 165 | ![OutputOfTheConfigurationStep](docs/images/OutputOfTheConfigurationStep.jpg) 166 | 167 | ## Section 4: Create the .env file 168 | 169 | ![map04](docs/images/map04.png) 170 | 171 | We need to manually change the databricks host and appI_IK values. Other values should be "as is" from the output of the previous script. 172 | 173 | - PYTHONPATH: /workspaces/dstoolkit-ml-ops-for-databricks/src/modules [This is full path to the module folder in the repository.] 174 | - APPI_IK: connection string of the application insight 175 | - DATABRICKS_HOST: The URL of the databricks workspace. 176 | - DATABRICKS_TOKEN: Databricks Personal Access Token which was generated in the previous step. 177 | - DATABRICKS_ORDGID: OrgID of the databricks that can be fetched from the databricks URL. 178 | 179 | ![DatabricksORGIDandHOSTID](docs/images/DatabricksORGIDandHOSTID.JPG) 180 | 181 | Application Insight Connection String 182 | 183 | ![AppInsightConnectionString](docs/images/AppInsightConnectionString.jpg) 184 | 185 | At the end, our .env file is going to look as below. You can copy the content and change the values according to your environment. 186 | 187 | ``` conf 188 | PYTHONPATH=/workspaces/dstoolkit-ml-ops-for-databricks/src/modules 189 | APPI_IK=InstrumentationKey=e6221ea6xxxxxxf-8a0985a1502f;IngestionEndpoint=https://northeurope-2.in.applicationinsights.azure.com/ 190 | DATABRICKS_HOST=https://adb-7936878321001673.13.azuredatabricks.net 191 | DATABRICKS_TOKEN= 192 | DATABRICKS_ORDGID=7936878321001673 193 | ``` 194 | 195 | ## Section 5: Configure the Databricks connect 196 | 197 | ![map05](docs/images/map05.png) 198 | 199 | 1. In this step we are going to configure the databricks connect for VS code to connect to databricks. 
Run the below command for that from the docker (VS Code) terminal. 200 | 201 | ``` bash 202 | $ python "src/tutorial/scripts/local_config.py" -c "src/tutorial/cluster_config.json" 203 | ``` 204 | 205 | >Note: If you get any error saying that "ModelNotFound : No module names dbkcore". Try to reload the VS code window and see if you are getting prompt right bottom corner saying that configuration file changes, rebuild the docker image. Rebuild it and then reload the window. Post that you would not be getting any error. Also, check if the python interpreter is being selected properly. They python interpreter path should be **/usr/local/bin/python ** 206 | 207 | ![Verify_Python_Interpreter](docs/images/Verify_Python_Interpreter.jpg) 208 | 209 | ### Verify 210 | 211 | 1. You will be able to see the message All tests passed. 212 | 213 | ![databricks-connect-pass](docs/images/databricks-connect-pass.jpg) 214 | 215 | ## Section 6: Wheel creation and workspace upload 216 | 217 | ![map06](docs/images/map06.png) 218 | 219 | In this section, we will create the private python package and upload it to the databricks environment. 220 | 221 | 1. Run the below command: 222 | 223 | ``` bash 224 | python src/tutorial/scripts/install_dbkframework.py -c "src/tutorial/cluster_config.json" 225 | ``` 226 | 227 | Post Execution of the script, we will be able to see the module to be installed. 228 | 229 | ![cluster-upload-wheel](docs/images/cluster-upload-wheel.jpg) 230 | 231 | ## Section 7: Using the framework 232 | 233 | ![map07](docs/images/map07.png) 234 | 235 | We have a pipeline that performs the data preparation, unit testing, logging, training of the model. 236 | 237 | 238 | ![PipelineSteps](docs/images/PipelineSteps.JPG) 239 | 240 | 241 | ### Execution from Local VS Code 242 | 243 | To check if the framework is working fine or not, let's execute this file : **src/tutorial/scripts/framework_testing/remote_analysis.py** . It is better to execute is using the interactive window. As the Interactive window can show the pandas dataframe which is the output of the script. Otherwise the script can be executed from the Terminal as well. 244 | To run the script from the interactive window, select the whole script => right click => run the selection in the interactive window. 245 | 246 | Post running the script, we will be able to see the data in the terminal. 247 | 248 | ![final](docs/images/final.jpg) 249 | 250 | ### Execution from Databricks 251 | 252 | In order to run the same notebook in the databricks, we just need to create a databricks secrets for the application insight connection string. 
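In the framework, `Engine.initialize_logger` (src/modules/acai_ml/core.py) reads this secret from the scope `config` under the name `APPI_IK` by default. Purely as an illustration of what the script referenced next automates, an equivalent manual setup with the Databricks CLI would look roughly like this:

``` bash
# Illustration only: scope and key names follow the defaults in acai_ml/core.py
databricks secrets create-scope --scope config
databricks secrets put --scope config --key APPI_IK --string-value "<Application Insight connection string>"
```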
253 | 254 | For this, we can execute the below query: 255 | 256 | ``` bash 257 | python src/tutorial/create_databricks_secrets.py 258 | 259 | ``` 260 | 261 | After copying the content of the remote_analysis.py in the databricks notebook, we get the output as below: 262 | 263 | ![DatabricksNotebookExecution](docs/images/DatabricksNotebookExecution.JPG) 264 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /dev.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=/workspaces/MLOpsBasic-Databricks/src/modules 2 | APPI_IK="" 3 | DATABRICKS_HOST="" 4 | DATABRICKS_TOKEN="" 5 | DATABRICKS_ORDGID="" 6 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /docs/images/AppInsightConnectionString.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/AppInsightConnectionString.jpg -------------------------------------------------------------------------------- /docs/images/AzureResources.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/AzureResources.JPG -------------------------------------------------------------------------------- /docs/images/DatabricksNotebookExecution.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DatabricksNotebookExecution.JPG -------------------------------------------------------------------------------- /docs/images/DatabricksORGIDandHOSTID.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DatabricksORGIDandHOSTID.JPG 
-------------------------------------------------------------------------------- /docs/images/DatabricksTokenGeneration.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DatabricksTokenGeneration.jpg -------------------------------------------------------------------------------- /docs/images/DevContainer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DevContainer.jpg -------------------------------------------------------------------------------- /docs/images/DockerImageLoad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DockerImageLoad.jpg -------------------------------------------------------------------------------- /docs/images/InstallExtensions.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/InstallExtensions.jpg -------------------------------------------------------------------------------- /docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG -------------------------------------------------------------------------------- /docs/images/OutputOfTheConfigurationStep.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/OutputOfTheConfigurationStep.jpg -------------------------------------------------------------------------------- /docs/images/Overview.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/Overview.JPG -------------------------------------------------------------------------------- /docs/images/PipelineSteps.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/PipelineSteps.JPG -------------------------------------------------------------------------------- /docs/images/PowershellScreen.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/PowershellScreen.jpg -------------------------------------------------------------------------------- /docs/images/SecretsFileImage.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/SecretsFileImage.jpg 
-------------------------------------------------------------------------------- /docs/images/SuccessfulClusterCreation.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/SuccessfulClusterCreation.JPG -------------------------------------------------------------------------------- /docs/images/Verify_Python_Interpreter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/Verify_Python_Interpreter.jpg -------------------------------------------------------------------------------- /docs/images/cluster-upload-wheel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/cluster-upload-wheel.jpg -------------------------------------------------------------------------------- /docs/images/databricks-connect-pass.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/databricks-connect-pass.jpg -------------------------------------------------------------------------------- /docs/images/final.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/final.jpg -------------------------------------------------------------------------------- /docs/images/map01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map01.png -------------------------------------------------------------------------------- /docs/images/map02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map02.png -------------------------------------------------------------------------------- /docs/images/map03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map03.png -------------------------------------------------------------------------------- /docs/images/map04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map04.png -------------------------------------------------------------------------------- /docs/images/map05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map05.png -------------------------------------------------------------------------------- /docs/images/map06.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map06.png -------------------------------------------------------------------------------- /docs/images/map07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map07.png -------------------------------------------------------------------------------- /docs/images/pythonversion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/pythonversion.jpg -------------------------------------------------------------------------------- /docs/images/workspaceselection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/workspaceselection.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | opencensus-ext-azure>=1.0.7 3 | # databricks-connect==7.3.* 4 | databricks-cli>=0.14.3 5 | typeguard>=2.12.0 6 | pytest>=6.2.3 7 | jupyter>=1.0.0 8 | python-dotenv>=0.17.0 9 | pypandoc>=1.4 10 | pdoc3>=0.7.4 11 | pandas>=1.2.4 12 | setuptools>=56.0.0 13 | pydataset>=0.2.0 14 | scikit-learn>=0.24.1 15 | PyArrow>=0.15.1 -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /src/modules/acai_ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/acai_ml/__init__.py -------------------------------------------------------------------------------- /src/modules/acai_ml/core.py: -------------------------------------------------------------------------------- 1 | from dbkdev.core import DevelopmentClient 2 | from dbkdev.core import DevelopmentEngine 3 | from dbkcore.core import Log 4 | from dbkcore.core import trace 5 | from dbkcore.core import Singleton 6 | from dbkdev.core import IdeEnvironment 7 | from pyspark.sql import SparkSession 8 | from typing import Dict 9 | 10 | 11 | class Engine(metaclass=Singleton): 12 | 13 | """ 14 | This is the core of the framework. 15 | It configures the environment to interact with the remote Databricks. 16 | """ 17 | 18 | def __init__(self): 19 | """ 20 | Instantiate the current object 21 | 22 | """ 23 | self.__ide_environment = DevelopmentEngine().get_instance().ide_environment 24 | self.appi_ik = None 25 | 26 | def initialize_env(self): 27 | """ 28 | Initializes the DevelopmentClient. 29 | That is, sets the dbutils and spark context accordingly if the code is runt on cluster or locally. 
30 | """ 31 | DevelopmentClient( 32 | dbutils=DevelopmentEngine().get_instance().dbutils, 33 | spark=DevelopmentEngine().get_instance().spark, 34 | ide_environment=self.__ide_environment 35 | ) 36 | 37 | # def initialize_import(self): 38 | # # Setting pipeline module path 39 | # if self.__ide_environment == IdeEnvironment.DATABRICKS: 40 | # import sys 41 | # sys.path.append(str(self.pipelines_lib_path)) 42 | 43 | def initialize_logger( 44 | self, 45 | pipeline_name: str, 46 | appi_ik_scope: str = 'config', 47 | appi_ik_secret: str = 'APPI_IK' 48 | ): 49 | """ 50 | Initializes the logger 51 | 52 | Parameters 53 | ---------- 54 | pipeline_name : str 55 | Name to use with the logger. It will be the base name used for all the upcoming logs and tracing 56 | appi_ik_scope : str, optional 57 | Databricks secret scope where the Application Insight key is stored, by default "dds" 58 | appi_ik_secret : str, optional 59 | Databricks secret name where the Application Insight key is stored, by default "appiik" 60 | 61 | Raises 62 | ------ 63 | ValueError 64 | Unknown Ide Environment used 65 | """ 66 | # Configuring application insight key 67 | if self.__ide_environment == IdeEnvironment.LOCAL: 68 | from dbkenv.core import Configuration 69 | configurations = Configuration() 70 | self.appi_ik = configurations.APPINSIGHT_CONNECTIONSTRING 71 | elif self.__ide_environment == IdeEnvironment.DATABRICKS: 72 | self.appi_ik = DevelopmentEngine().get_instance().dbutils.secrets.get(appi_ik_scope, appi_ik_secret) 73 | else: 74 | raise ValueError(f'ide_environment unknown: {self.__ide_environment}') 75 | # Instantiating logger 76 | Log(pipeline_name, self.appi_ik) 77 | 78 | def spark(self) -> SparkSession: 79 | """ 80 | Current spark context 81 | 82 | Returns 83 | ------- 84 | SparkSession 85 | Spark context 86 | """ 87 | return DevelopmentClient().get_instance().spark 88 | 89 | def dbutils(self): 90 | """ 91 | Current dbutils 92 | 93 | Returns 94 | ------- 95 | DBUtils 96 | The DBUtils 97 | """ 98 | return DevelopmentClient().get_instance().dbutils 99 | 100 | @classmethod 101 | def get_instance(cls): 102 | """ 103 | Current singleton Engine 104 | 105 | Returns 106 | ------- 107 | Engine 108 | The Engine 109 | """ 110 | return Engine() 111 | 112 | @staticmethod 113 | def ide_environment() -> IdeEnvironment: 114 | """ 115 | Current Ide Environment 116 | 117 | Returns 118 | ------- 119 | IdeEnvironment 120 | The Ide Environment 121 | """ 122 | return DevelopmentClient().get_instance().ide_environment 123 | 124 | @staticmethod 125 | def is_ide_dataricks() -> bool: 126 | """ 127 | Checks if the current environment is Databricks 128 | 129 | Returns 130 | ------- 131 | bool 132 | Check result 133 | """ 134 | return DevelopmentClient().get_instance().ide_environment == IdeEnvironment.DATABRICKS 135 | 136 | @staticmethod 137 | def is_ide_local() -> bool: 138 | """ 139 | Checks if the current environment is Local 140 | 141 | Returns 142 | ------- 143 | bool 144 | Check result 145 | """ 146 | return DevelopmentClient().get_instance().ide_environment == IdeEnvironment.LOCAL 147 | 148 | def run_notebook_with_retry(self, notebook: str, args: Dict, timeout=86400, max_retries=3): 149 | """ 150 | Runs the specified notebook through dbutils 151 | 152 | Parameters 153 | ---------- 154 | notebook : str 155 | Name or path of the notebook 156 | args : Dict 157 | [description] 158 | timeout : int, optional 159 | [description], by default 86400 160 | max_retries : int, optional 161 | [description], by default 3 162 | 163 | Returns 164 | 
------- 165 | [type] 166 | [description] 167 | 168 | Raises 169 | ------ 170 | e 171 | [description] 172 | """ 173 | num_retries = 0 174 | while True: 175 | try: 176 | return DevelopmentClient().get_instance().dbutils.notebook.run(notebook, timeout, args) 177 | except Exception as e: 178 | if num_retries > max_retries: 179 | raise e 180 | else: 181 | print("Retrying error"), e 182 | num_retries += 1 183 | 184 | @trace 185 | # TODO: rename check 186 | def run_notebook(self, notebook: str, args: Dict, timeout=86400, error_raise=True): 187 | try: 188 | res = DevelopmentClient().get_instance().dbutils.notebook.run(notebook, timeout, args) 189 | except Exception as e: 190 | res = f"Notebook {notebook} failed" 191 | Log().get_instance().log_error(res) 192 | if error_raise: 193 | raise e 194 | return res 195 | -------------------------------------------------------------------------------- /src/modules/dbkcore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/dbkcore/__init__.py -------------------------------------------------------------------------------- /src/modules/dbkcore/core.py: -------------------------------------------------------------------------------- 1 | from opencensus.ext.azure.trace_exporter import AzureExporter 2 | from opencensus.trace.samplers import AlwaysOnSampler 3 | from opencensus.trace.tracer import Tracer 4 | from opencensus.trace.span import Span 5 | from opencensus.ext.azure.log_exporter import AzureLogHandler 6 | import logging 7 | from logging import Logger 8 | from abc import abstractmethod 9 | from typeguard import typechecked 10 | from .helpers import is_json_serializable 11 | from datetime import datetime 12 | import functools as _functools 13 | from typing import Any, List, Union 14 | from collections import OrderedDict 15 | import json 16 | 17 | 18 | 19 | 20 | 21 | class Singleton(type): 22 | """Create a singleton.""" 23 | 24 | _instances = OrderedDict() 25 | 26 | def __call__(cls, *args, **kwargs): 27 | """ 28 | Instantiate the singleton. 29 | 30 | Returns 31 | ------- 32 | any 33 | Parameters of the singleton 34 | """ 35 | if cls not in cls._instances: 36 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 37 | return cls._instances[cls] 38 | 39 | 40 | class Log(metaclass=Singleton): 41 | """Helper class for Application Insight Logger.""" 42 | 43 | def __init__(self, name: str, connection_string: str = None): 44 | """ 45 | Create a new Log object. 46 | 47 | Parameters 48 | ---------- 49 | name : str 50 | Name used by the logger for tracing 51 | connection_string : [type], optional 52 | Application Insight's connection string 53 | """ 54 | self.name = name 55 | self.__connection_string = connection_string 56 | 57 | # config_integration.trace_integrations(['logging']) 58 | # [Documentation](https://docs.microsoft.com/it-it/azure/azure-monitor/app/opencensus-python#logs) 59 | # [Documentation](https://docs.microsoft.com/it-it/azure/azure-monitor/app/opencensus-python#trace) 60 | self.__logger = self._get_logger() 61 | self.__tracer = self._get_tracer() 62 | 63 | def _get_logger(self) -> Logger: 64 | """ 65 | Create the logger with an Azure Handler for Application Insight. 
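The AzureLogHandler is attached only when a connection string was provided; otherwise the logger stays local (the log_* helpers still print every message to stdout).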
66 | 67 | Returns 68 | ------- 69 | Logger 70 | Current logger 71 | """ 72 | logger = logging.getLogger(name=self.name) 73 | logger.setLevel(logging.DEBUG) 74 | if self.__connection_string: 75 | handler = AzureLogHandler(connection_string=self.__connection_string) 76 | # handler.export_interval = 1 77 | # handler.max_batch_size = 1 78 | # handler.setFormatter(logging.Formatter('%(traceId)s:%(spanId)s:%(message)s')) 79 | logger.addHandler(handler) 80 | return logger 81 | 82 | def _get_tracer(self) -> Tracer: 83 | """ 84 | Create the Opencencus Tracer with Azure Exporter. 85 | 86 | Returns 87 | ------- 88 | Tracer 89 | Opencencus Tracer 90 | """ 91 | if self.__connection_string: 92 | tracer = Tracer( 93 | exporter=AzureExporter(connection_string=self.__connection_string), 94 | sampler=AlwaysOnSampler() 95 | ) 96 | else: 97 | tracer = None 98 | return tracer 99 | 100 | @classmethod 101 | def get_instance(cls): 102 | """Current instance""" 103 | return Log() 104 | 105 | @typechecked 106 | def trace_function(self, name: str, kwargs: dict) -> Union[Span, None]: 107 | """ 108 | Traces a function 109 | 110 | Parameters 111 | ---------- 112 | name : str 113 | Name of the function used for tracing 114 | 115 | name : kwargs 116 | The parameters of the function 117 | 118 | Returns 119 | ------- 120 | Span 121 | A Span that can be used for customizing logging 122 | """ 123 | tracer = self.__tracer 124 | if tracer: 125 | span = self.__tracer.span(name=name) 126 | if kwargs: 127 | for key, value in kwargs.items(): 128 | # if hasattr(value, 'to_json_logger'): 129 | # value = value.to_json_logger() 130 | if not is_json_serializable(value): 131 | value = str(value) 132 | span.add_attribute(key, value) 133 | # self.log_info(f"TRACING:{key}:{value}") 134 | else: 135 | span = None 136 | return span 137 | 138 | @property 139 | def tracer(self) -> Tracer: 140 | """ 141 | Tracer that will be used. 142 | 143 | Returns 144 | ------- 145 | Tracer 146 | The tracer 147 | """ 148 | return self.__tracer 149 | 150 | @property 151 | def logger(self) -> Logger: 152 | """ 153 | Logger that will be used. 154 | 155 | Returns 156 | ------- 157 | Logger 158 | This logger 159 | """ 160 | return self.__logger 161 | 162 | def __log_message(self, message: str, prefix: str) -> str: 163 | if prefix: 164 | msg = f'{prefix}:{message}' 165 | else: 166 | msg = f'{message}' 167 | # res = f"{self.name}:{msg}" 168 | return msg 169 | 170 | def log_info(self, message: str, prefix="", custom_dimension: dict = None): 171 | """ 172 | Log a message as info. 173 | 174 | Parameters 175 | ---------- 176 | message : str 177 | The message 178 | """ 179 | msg = self.__log_message(message=message, prefix=prefix) 180 | local_msg = f'{msg}\nDetails: {json.dumps(custom_dimension, indent=4)}' if custom_dimension else msg 181 | print(f'INFO:{local_msg}') 182 | properties = {'custom_dimensions': custom_dimension} 183 | self.__logger.info(msg, extra=properties) 184 | 185 | def log_debug(self, message: str, prefix="", custom_dimension: dict = None): 186 | """ 187 | Log a message as debug. 
188 | 189 | Parameters 190 | ---------- 191 | message : str 192 | The message 193 | """ 194 | msg = self.__log_message(message=message, prefix=prefix) 195 | local_msg = f'{msg}|Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg 196 | print(f'DEBUG:{local_msg}') 197 | # logging.debug(msg=f'{msg}| Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg) 198 | properties = {'custom_dimensions': custom_dimension if custom_dimension else {}} 199 | self.__logger.debug(msg=msg, extra=properties) 200 | 201 | def log_warning(self, message: str, prefix="", custom_dimension: dict = None): 202 | """ 203 | Log a message as warning. 204 | 205 | Parameters 206 | ---------- 207 | message : str 208 | The message 209 | """ 210 | msg = self.__log_message(message=message, prefix=prefix) 211 | local_msg = f'{msg}|Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg 212 | print(f'WARNING:{local_msg}') 213 | # logging.warning(msg=f'{msg}| Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg) 214 | properties = {'custom_dimensions': custom_dimension if custom_dimension else {}} 215 | self.__logger.warning(msg=msg, extra=properties) 216 | 217 | def log_error(self, message: str, include_stack=True, prefix="", custom_dimension: dict = None): 218 | """ 219 | Log a message as error. 220 | 221 | Parameters 222 | ---------- 223 | message : str 224 | The message 225 | """ 226 | msg = self.__log_message(message=message, prefix=prefix) 227 | local_msg = f'{msg}|Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg 228 | print(f'ERROR:{local_msg}') 229 | # logging.error(msg=f'{msg}| Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg, exc_info=include_stack) 230 | properties = {'custom_dimensions': custom_dimension if custom_dimension else {}} 231 | self.__logger.error(msg=msg, exc_info=include_stack, extra=properties) 232 | 233 | def log_critical(self, message: str, prefix="", custom_dimension: dict = None): 234 | """ 235 | Log a message as critical. 236 | 237 | Parameters 238 | ---------- 239 | message : str 240 | The message 241 | """ 242 | msg = self.__log_message(message=message, prefix=prefix) 243 | local_msg = f'{msg}|Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg 244 | print(f'CRITICAL:{local_msg}') 245 | properties = {'custom_dimensions': custom_dimension if custom_dimension else {}} 246 | logging.critical(msg=f'{msg}| Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg) 247 | self.__logger.critical(msg=msg, extra=properties) 248 | 249 | 250 | @typechecked 251 | def trace(original_function: Any = None, *, attrs_refact: List[str] = None): 252 | """ 253 | Log the function call. 
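Each wrapped call is reported through Log with its module, qualified name and elapsed time as custom dimensions; keyword arguments listed in `attrs_refact` are masked with '***' before being logged.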
254 | 255 | Parameters 256 | ---------- 257 | original_function : Any, optional 258 | Function to trace, by default None 259 | attrs_refact : List[str], optional 260 | List of parameters to hide from logging, by default None 261 | """ 262 | 263 | def __log(func, fn_k, *args, **kwargs): 264 | start = datetime.utcnow() 265 | # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:START @ {start}", custom_dimension=fn_k) 266 | res = func(*args, **kwargs) 267 | end = datetime.utcnow() 268 | elapsed = end - start 269 | # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:COMPLETE @ {end}:ELAPSED {elapsed}", custom_dimension=fn_k) 270 | fn_k['elapsed'] = str(elapsed) 271 | fn_k['module'] = str(func.__module__) 272 | fn_k['qualname'] = str(func.__qualname__) 273 | Log.get_instance().log_info(f"Executed function {func.__module__}.{func.__qualname__}", custom_dimension=fn_k) 274 | return res 275 | 276 | """Decorator for tracing functions (link)[https://stackoverflow.com/a/24617244]""" 277 | def _decorate(func): 278 | @_functools.wraps(func) 279 | def wrapper(*args, **kwargs): 280 | fn_k = {} 281 | # if not attrs_refact: 282 | # fn_k = kwargs 283 | # else: 284 | for key, value in kwargs.items(): 285 | v = value if is_json_serializable(value) else 'not serializable' 286 | if attrs_refact: 287 | if key in attrs_refact: 288 | v = '***' 289 | fn_k[key] = v 290 | # if key not in attrs_refact: 291 | # fn_k[key] = value 292 | # else: 293 | # fn_k[key] = '***' 294 | if Log.get_instance().tracer: 295 | with Log.get_instance().trace_function( 296 | name=func.__name__, 297 | kwargs=fn_k 298 | ): 299 | return __log(func, fn_k, *args, **kwargs) 300 | # start = datetime.utcnow() 301 | # # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:START @ {start}", custom_dimension=fn_k) 302 | # res = func(*args, **kwargs) 303 | # end = datetime.utcnow() 304 | # elapsed = end - start 305 | # # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:COMPLETE @ {end}:ELAPSED {elapsed}", custom_dimension=fn_k) 306 | # fn_k['elapsed'] = str(elapsed) 307 | # fn_k['module'] = str(func.__module__) 308 | # fn_k['qualname'] = str(func.__qualname__) 309 | # Log.get_instance().log_info(f"Executed function {func.__module__}.{func.__qualname__}", custom_dimension=fn_k) 310 | # return res 311 | else: 312 | return __log(func, fn_k, *args, **kwargs) 313 | # start = datetime.utcnow() 314 | # # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:START @ {start}", custom_dimension=fn_k) 315 | # res = func(*args, **kwargs) 316 | # end = datetime.utcnow() 317 | # elapsed = end - start 318 | # # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:COMPLETE @ {end}:ELAPSED {elapsed}", custom_dimension=fn_k) 319 | # fn_k['elapsed'] = str(elapsed) 320 | # fn_k['module'] = str(func.__module__) 321 | # fn_k['qualname'] = str(func.__qualname__) 322 | # Log.get_instance().log_info(f"Executed function {func.__module__}.{func.__qualname__}", custom_dimension=fn_k) 323 | return wrapper 324 | 325 | if original_function: 326 | return _decorate(original_function) 327 | 328 | return _decorate 329 | 330 | 331 | @typechecked 332 | # class BaseObject(ABC): 333 | # TODO: if works, remove ABC class 334 | class BaseObject(): 335 | """ 336 | Base class to use with any object new object. 
337 | It implements the method log which will be used for logging 338 | 339 | """ 340 | 341 | @abstractmethod 342 | def log(self, prefix="", suffix=""): 343 | """ 344 | Specifices how to log the object 345 | """ 346 | pass 347 | 348 | @classmethod 349 | def class_name(cls) -> str: 350 | return cls.__name__.lower() 351 | -------------------------------------------------------------------------------- /src/modules/dbkcore/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various utilities for speed up development 3 | """ 4 | import os 5 | from pathlib import Path 6 | import json 7 | from typing import Any 8 | 9 | 10 | 11 | 12 | def current_directory() -> str: 13 | """ 14 | Get current directory. 15 | 16 | Returns 17 | ------- 18 | str 19 | The current directory path 20 | """ 21 | return os.path.dirname(os.path.realpath(__file__)) 22 | 23 | 24 | def add_folder_in_current_directory(folder_name: str) -> bool: 25 | """ 26 | Add a folder in the current directory. 27 | 28 | Parameters 29 | ---------- 30 | folder_name : str 31 | New folder name 32 | 33 | Returns 34 | ------- 35 | bool 36 | True if success 37 | """ 38 | output_folder = os.path.join(current_directory(), folder_name) 39 | os.makedirs(output_folder) 40 | return True 41 | 42 | 43 | def is_json_serializable(x: Any) -> bool: 44 | """ 45 | Check if the object is serializable. 46 | 47 | Parameters 48 | ---------- 49 | x : Any 50 | Object to validate 51 | 52 | Returns 53 | ------- 54 | bool 55 | True if success 56 | """ 57 | try: 58 | json.dumps(x) 59 | return True 60 | except Exception: 61 | return False 62 | 63 | 64 | 65 | # TODO: remove 66 | # def load_envs(current_file: Path): 67 | # """ 68 | # Helper function for local development 69 | 70 | # Parameters 71 | # ---------- 72 | # current_file : Path 73 | # Paht of the current file 74 | # """ 75 | # from dotenv import load_dotenv 76 | 77 | # root_folder = "analytics" 78 | # found_env = False 79 | # base_env = current_file.parent.absolute() 80 | 81 | # while not found_env: 82 | # matches = [f for f in base_env.glob("*.env")] 83 | # # print(matches) 84 | # if matches: 85 | # env_file = [f for f in base_env.glob("*.env")][0] 86 | # print(env_file) 87 | # load_dotenv(env_file, override=True, verbose=True) 88 | # found_env = True 89 | # print("Environment file found") 90 | # elif base_env.name == root_folder: 91 | # break 92 | # else: 93 | # base_env = base_env.parent 94 | 95 | 96 | -------------------------------------------------------------------------------- /src/modules/dbkcore/requirements.txt: -------------------------------------------------------------------------------- 1 | opencensus-ext-azure>=1.0.2 2 | typeguard==2.7.1 -------------------------------------------------------------------------------- /src/modules/dbkdev/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/dbkdev/__init__.py -------------------------------------------------------------------------------- /src/modules/dbkdev/core.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame, SparkSession 2 | from dbkcore.core import trace, Log 3 | import re 4 | from pathlib import Path 5 | from typing import Any, List 6 | from typeguard import typechecked 7 | from dbkcore.core import Singleton 8 | from enum import Enum 9 | import 
pkg_resources 10 | 11 | 12 | 13 | 14 | 15 | class IdeEnvironment(str, Enum): 16 | LOCAL = "local" 17 | DATABRICKS = "databricks" 18 | 19 | 20 | @typechecked 21 | class DevelopmentClient(metaclass=Singleton): 22 | """ 23 | Client to use for local Databricks' local development 24 | """ 25 | 26 | # @trace 27 | def __init__( 28 | self, 29 | dbutils, 30 | spark: SparkSession, 31 | ide_environment: IdeEnvironment 32 | # deployment_environment: DeploymentEnvironment 33 | ): 34 | """ 35 | Instantiates this object 36 | 37 | Parameters 38 | ---------- 39 | dbutils : Dbutils 40 | The Dbutils instance to use 41 | spark : SparkSession 42 | The SparkSession to use 43 | ide_environment : IdeEnvironment 44 | The environment used 45 | deployment_environment : DeploymentEnvironment 46 | The deployment environment 47 | """ 48 | self.__spark = spark 49 | self.__dbutils = dbutils 50 | self.__ide_environment = ide_environment 51 | 52 | @property 53 | def spark(self) -> SparkSession: 54 | return self.__spark 55 | 56 | @property 57 | def dbutils(self) -> Any: 58 | return self.__dbutils 59 | 60 | @property 61 | def __storage_account_name(self) -> str: 62 | res = self.dbutils.secrets.get( 63 | scope="storage", 64 | key="name" 65 | ) 66 | return res 67 | 68 | @property 69 | def __blob_container_name(self) -> str: 70 | res = self.dbutils.secrets.get( 71 | scope="storage", 72 | key="container" 73 | ) 74 | return res 75 | 76 | @property 77 | def __storage_account_access_key(self) -> str: 78 | res = self.dbutils.secrets.get( 79 | scope="storage", 80 | key="key" 81 | ) 82 | return res 83 | 84 | @property 85 | def ide_environment(self) -> IdeEnvironment: 86 | return self.__ide_environment 87 | 88 | @trace(attrs_refact=['dbkea_token']) 89 | def set_dbkea(self, dbkea_token: str): 90 | """ 91 | To use when the environment is LOCAL for using the dbutils secrets 92 | 93 | Parameters 94 | ---------- 95 | dbkea_token : str 96 | The token 97 | """ 98 | self.__dbkea_token = dbkea_token 99 | self.dbutils.secrets.setToken(dbkea_token) 100 | 101 | @property 102 | def mount_name(self) -> str: 103 | """ 104 | Standard name of the root mount for the configured storage account and container 105 | 106 | Returns 107 | ------- 108 | str 109 | [description] 110 | """ 111 | res = "{}_{}".format(self.__storage_account_name, self.__blob_container_name) 112 | return res 113 | 114 | @property 115 | def mount_path(self) -> str: 116 | """ 117 | Standard mount path 118 | 119 | Returns 120 | ------- 121 | str 122 | The path 123 | """ 124 | res = 'dbfs:/mnt/{}/'.format(self.mount_name) 125 | return res 126 | 127 | @trace 128 | def read_csv(self, file_path: str) -> DataFrame: 129 | # blob_path = self.__blob_path(file_path) 130 | blob_df = self.spark.read.format("csv").\ 131 | option("inferSchema", "true").\ 132 | option("header", "true").\ 133 | option("delimiter", ",").\ 134 | option("charset", "utf-8").load(file_path) 135 | return blob_df 136 | 137 | @trace 138 | def read_parquet(self, file_path: str) -> DataFrame: 139 | return self.spark.read.parquet(file_path) 140 | 141 | @trace 142 | def save_temp_table( 143 | self, 144 | dataframe: DataFrame, 145 | # schema: str, 146 | table_name: str, 147 | cache=True 148 | ): 149 | # TODO: Documentation 150 | 151 | # self.create_schema(schema) 152 | # dbk_table_name = f"{schema}_{table_name}" 153 | Log.get_instance().log_info(f"Creating temp table: {table_name}") 154 | if cache: 155 | dataframe.cache().createOrReplaceGlobalTempView(table_name) 156 | else: 157 | 
dataframe.createOrReplaceGlobalTempView(table_name) 158 | 159 | @trace 160 | def load_temp_table( 161 | self, 162 | # schema: str, 163 | table_name: str 164 | ) -> DataFrame: 165 | # TODO: Documentation 166 | 167 | Log.get_instance().log_info(f"Loading temp table: {table_name}") 168 | # self.create_schema(schema) 169 | # dbk_table_name = f"{schema}.{table_name}" 170 | global_temp_db = self.spark.conf.get("spark.sql.globalTempDatabase") 171 | # dt = self.spark.conf.get(f"spark.sql.{dbk_table_name}") 172 | dt = self.spark.read.table(f"{global_temp_db}.{table_name}") 173 | return dt 174 | 175 | @trace 176 | def save_delta_table( 177 | self, 178 | dataframe: DataFrame, 179 | schema: str, 180 | table_name: str, 181 | output_path: Path, 182 | partition_columns: List[str] = None, 183 | mode: str = 'overwrite', 184 | overwrite_schema: bool = False 185 | ): 186 | """ 187 | Saves the dataframe as a delta table in an external location 188 | Parameters 189 | ---------- 190 | dataframe : DataFrame 191 | The dataframe 192 | schema : str 193 | Destination schema 194 | table_name : str 195 | Destination schema 196 | output_path : Path 197 | Folder where to save the dataframe 198 | partition_columns: List[str] 199 | Columns to use for partitioning, default is None 200 | mode: str 201 | e.g. append, overwrite, passed to dataframe.write.saveAsTable 202 | """ 203 | self.create_schema(schema) 204 | dbk_table_name = f"{schema}.{table_name}" 205 | # mnt_path = str(Path('mnt', mount_name, root_path, schema, table_name)) 206 | # path = f"dbfs:/{mnt_path}" 207 | path = output_path.joinpath(dbk_table_name.replace('.', '_')) 208 | self._save_table( 209 | dataframe=dataframe, 210 | table_name=dbk_table_name, 211 | path=str(path), 212 | format='delta', 213 | mode=mode, 214 | partition_columns=partition_columns, 215 | overwrite_schema=overwrite_schema 216 | ) 217 | 218 | @trace 219 | def _save_table( 220 | self, 221 | dataframe: DataFrame, 222 | table_name: str, 223 | path: str, 224 | format: str, 225 | mode: str, 226 | partition_columns: List[str] = None, 227 | overwrite_schema=False 228 | ): 229 | """ 230 | Saves the given dataframe into a delta table 231 | 232 | Parameters 233 | ---------- 234 | dataframe : DataFrame 235 | Dataframe to save 236 | schema : str 237 | Schema into save 238 | table_name : str 239 | Name of the table 240 | """ 241 | # TODO: Update documentation 242 | if table_name is None or "": 243 | raise Exception("Table name missing") 244 | if not path: 245 | raise Exception("Path missing") 246 | # Create hive table from a dataframe (for the final ETL process) 247 | # self.spark.sql("DROP TABLE IF EXISTS {}".format(table_name)) 248 | dataframe.write.saveAsTable(table_name, mode=mode, format=format, path=path, partitionBy=partition_columns, overwriteSchema=overwrite_schema) 249 | 250 | @trace 251 | def table_exists( 252 | self, 253 | schema_name: str, 254 | table_name: str 255 | ): 256 | return table_name in [t.name for t in self.spark.catalog.listTables(schema_name)] 257 | 258 | @trace 259 | def list_tables(self, schema: str) -> List[str]: 260 | """ 261 | List the tables in the given schema 262 | 263 | Parameters 264 | ---------- 265 | schema : str 266 | The Databricks schema 267 | 268 | Returns 269 | ------- 270 | List[str] 271 | List of tables 272 | """ 273 | df = self.spark.sql("show tables in {}".format(schema)).toPandas() 274 | table_name = df["tableName"] # ! 
Could be wrong 275 | return list(table_name) 276 | 277 | @trace 278 | def list_databases(self) -> List[str]: 279 | """ 280 | Gets the list of Databricks databases (a.k.a. schemas) 281 | 282 | Returns 283 | ------- 284 | List[str] 285 | List of schemas 286 | """ 287 | df = self.spark.sql("show schemas").toPandas() 288 | databases = df["databaseName"].tolist() 289 | return databases 290 | 291 | @trace 292 | def create_schema(self, schema_databricks: str): 293 | """ 294 | Creates a schema in Databricks 295 | 296 | Parameters 297 | ---------- 298 | schema_databricks : str 299 | Name of the schema 300 | """ 301 | self.spark.sql("CREATE SCHEMA IF NOT EXISTS {}".format(schema_databricks)) 302 | 303 | @trace 304 | def mount_exists(self, mount_name: str) -> bool: 305 | mounts = self.list_mounts() 306 | names = [m.name.replace('/', '') for m in mounts] 307 | res = True if mount_name in names else False 308 | return res 309 | 310 | @trace 311 | def list_mounts(self) -> list: 312 | mounts = self.files('/mnt') 313 | return mounts 314 | 315 | @trace 316 | def files(self, path: str) -> list: 317 | files = self.dbutils.fs.ls(path) 318 | return files 319 | 320 | @classmethod 321 | def get_instance(cls): 322 | return DevelopmentClient() 323 | 324 | 325 | class DevelopmentEngine(metaclass=Singleton): 326 | 327 | def __init__(self): 328 | self.spark = self.__get_spark() 329 | dbutils, ide_environment = self.__get_dbutils(self.spark) 330 | self.dbutils = dbutils 331 | self.ide_environment = ide_environment 332 | 333 | def __get_spark(self) -> SparkSession: 334 | MAX_MEMORY = "5g" 335 | spark = SparkSession.builder.\ 336 | config("spark.executor.memory", MAX_MEMORY).\ 337 | config("spark.driver.memory", MAX_MEMORY).\ 338 | config("spark.driver.maxResultSize", MAX_MEMORY).\ 339 | getOrCreate() 340 | return spark 341 | # import IPython 342 | # user_ns = IPython.get_ipython().user_ns 343 | # if "spark" in user_ns: 344 | # return user_ns["spark"] 345 | # else: 346 | # from pyspark.sql import SparkSession 347 | # user_ns["spark"] = SparkSession.builder.getOrCreate() 348 | # return user_ns["spark"] 349 | 350 | def __get_dbutils(self, spark: SparkSession): 351 | try: 352 | from pyspark.dbutils import DBUtils 353 | dbutils = DBUtils(spark) 354 | env = IdeEnvironment.LOCAL if "databricks-connect" in [i.key for i in pkg_resources.working_set] else IdeEnvironment.DATABRICKS 355 | except ImportError: 356 | import IPython 357 | dbutils = IPython.get_ipython().user_ns["dbutils"] 358 | env = IdeEnvironment.DATABRICKS 359 | return (dbutils, env) 360 | 361 | @classmethod 362 | def get_instance(cls): 363 | return DevelopmentEngine() 364 | -------------------------------------------------------------------------------- /src/modules/dbkdev/data_steps.py: -------------------------------------------------------------------------------- 1 | from dbkcore.core import BaseObject, trace, Log 2 | from pyspark.sql.dataframe import DataFrame as PyDataFrame 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.utils import AnalysisException 6 | from enum import Enum 7 | # from abc import ABC, abstractmethod 8 | from abc import abstractmethod 9 | import pandas as pd 10 | from pandas import DataFrame as PdDataFrame 11 | from pandas.api.types import is_numeric_dtype 12 | from typing import Union, List 13 | from typeguard import typechecked 14 | from pathlib import Path 15 | import functools as _functools 16 | 17 | 18 | 19 | 20 | class DataDirection(str, Enum): 21 | IN = "in" 22 | OUT = "out" 23 | 24 
| 25 | class DataStepDataframe(BaseObject): 26 | 27 | # TODO: Change name to name_or_path 28 | 29 | def __init__(self, name: str, dataframe: Union[PyDataFrame, PdDataFrame], cache=False): 30 | self.name = name 31 | self.dataframe = dataframe 32 | # Note: it has been removed for spark since the dataframe has to be read 33 | self.__rows = None 34 | self.columns_count = len(dataframe.columns) if isinstance(dataframe, PyDataFrame) else dataframe.shape[1] 35 | self.columns_names = dataframe.columns 36 | self.cache = cache 37 | self.__columns_negative = None 38 | self.__columns_null = None 39 | 40 | def to_pandas(self) -> PdDataFrame: 41 | if self.is_pandas: 42 | return self.dataframe 43 | elif self.is_pyspark: 44 | return self.dataframe.toPandas() 45 | 46 | @property 47 | def rows(self): 48 | if not self.__rows: 49 | # self.__rows = self.dataframe.cache().count() if self.cache else dataframe.count() 50 | # TODO: improve me 51 | if self.cache: 52 | self.__rows = self.dataframe.cache().count() if isinstance(self.dataframe, PyDataFrame) else self.dataframe.shape[0] 53 | else: 54 | self.__rows = self.dataframe.count() if isinstance(self.dataframe, PyDataFrame) else self.dataframe.shape[0] 55 | return self.__rows 56 | 57 | @trace 58 | def columns_negative(self) -> List[str]: 59 | """ 60 | Identifies the columns with negative values 61 | 62 | Returns 63 | ------- 64 | List[str] 65 | Column names 66 | """ 67 | columns = [] 68 | 69 | if not self.__columns_negative: 70 | if isinstance(self.dataframe, PyDataFrame): 71 | for column in self.columns_names: 72 | count = 0 73 | try: 74 | count = self.dataframe.filter((F.col(column) < 0)).count() 75 | except AnalysisException: 76 | pass 77 | if count > 0: 78 | columns.append(column) 79 | elif isinstance(self.dataframe, pd.DataFrame): 80 | for column in self.columns_names: 81 | if is_numeric_dtype(self.dataframe[column]): 82 | dt_filtered = self.dataframe[self.dataframe[column] < 0] 83 | count = dt_filtered.shape[0] 84 | if count > 0: 85 | columns.append(column) 86 | self.__columns_negative = columns 87 | return self.__columns_negative 88 | 89 | @trace 90 | def columns_null(self) -> List[str]: 91 | """ 92 | Identifies the columns with null values 93 | 94 | Returns 95 | ------- 96 | List[str] 97 | Column names 98 | """ 99 | if not self.__columns_null: 100 | columns = [] 101 | if isinstance(self.dataframe, PyDataFrame): 102 | for column in self.columns_names: 103 | count = self.dataframe.filter(F.col(column).isNull()).count() 104 | if count > 0: 105 | columns.append(column) 106 | elif isinstance(self.dataframe, pd.DataFrame): 107 | nan_cols = self.dataframe.columns[self.dataframe.isna().any()].tolist() 108 | columns.extend(nan_cols) 109 | self.__columns_null = columns 110 | return self.__columns_null 111 | 112 | @property 113 | def is_pandas(self) -> bool: 114 | return isinstance(self.dataframe, PdDataFrame) 115 | 116 | @property 117 | def is_pyspark(self) -> bool: 118 | return isinstance(self.dataframe, PyDataFrame) 119 | 120 | def log_in(self): 121 | self.log(direction=DataDirection.IN) 122 | 123 | def log_out(self): 124 | self.log(direction=DataDirection.OUT) 125 | 126 | def log(self, direction: DataDirection): 127 | dt_name = self.name 128 | 129 | # dt_tag_prefix = "DT:{}".format(direction.upper(), dt_name) 130 | # dt_name_tag = "{}:NAME".format(dt_tag_prefix) 131 | 132 | # dt_rows_tag = "{}:ROWS:COUNT".format(dt_tag_prefix) 133 | # if isinstance(self.dataframe, PyDataFrame): 134 | # dt_rows = self.dataframe.count() 135 | # elif isinstance(self.dataframe, 
PdDataFrame): 136 | # dt_rows = self.dataframe.shape[0] 137 | 138 | # dt_columns_tag = "{}:COLUMNS:COUNT".format(dt_tag_prefix) 139 | # if isinstance(self.dataframe, PyDataFrame): 140 | # dt_columns = len(self.dataframe.columns) 141 | # elif isinstance(self.dataframe, PdDataFrame): 142 | # dt_columns = self.dataframe.shape[1] 143 | 144 | # dt_columns_names_tag = "{}:COLUMNS:NAMES".format(dt_tag_prefix) 145 | # dt_columns_names = ', '.join(self.dataframe.columns) 146 | 147 | # Log.get_instance().log_info(f"{dt_name_tag}:{dt_name}", prefix=direction, custom_dimension=dimensions) 148 | # Log.get_instance().log_info(f"{dt_rows_tag}:{dt_rows}", prefix=direction) 149 | # Log.get_instance().log_info(f"{dt_columns_tag}:{dt_columns}", prefix=direction) 150 | # Log.get_instance().log_info(f"{dt_columns_names_tag}:{dt_columns_names}", prefix=direction) 151 | 152 | dimensions = { 153 | 'dataset_name': dt_name, 154 | 'rows_count': self.rows, 155 | 'columns_count': self.columns_count, 156 | 'columns_name': self.columns_names, 157 | 'direction': direction 158 | } 159 | Log.get_instance().log_info(f"Processed dataset {dt_name} with {direction.upper()} direction", custom_dimension=dimensions) 160 | 161 | 162 | def apply_test(func): 163 | """ 164 | Execute test function after the initialize. 165 | 166 | Notes 167 | ----- 168 | [Example](https://stackoverflow.com/a/15196410) 169 | """ 170 | @_functools.wraps(func) 171 | def wrapper(self, *args, **kwargs): 172 | res = func(self, *args, **kwargs) 173 | self.tests() 174 | return res 175 | return wrapper 176 | 177 | 178 | def pre_apply_test(func): 179 | """ 180 | Execute test function before the initialize. 181 | 182 | Notes 183 | ----- 184 | [Example](https://stackoverflow.com/a/15196410) 185 | """ 186 | @_functools.wraps(func) 187 | def wrapper(self, *args, **kwargs): 188 | self.tests() 189 | res = func(self, *args, **kwargs) 190 | return res 191 | return wrapper 192 | 193 | 194 | def log_output(func): 195 | """ 196 | Decorator for executing test in sequence 197 | 198 | Notes 199 | ----- 200 | [Example](https://stackoverflow.com/a/15196410) 201 | """ 202 | @_functools.wraps(func) 203 | def wrapper(self, *args, **kwargs): 204 | res = func(self, *args, **kwargs) 205 | self.tests() 206 | return res 207 | return wrapper 208 | 209 | 210 | # TODO: if works, remove ABC class 211 | # class DataStep(ABC): 212 | @typechecked 213 | class DataStep(): 214 | """ 215 | Creates a datastep to be used in a pipeline 216 | 217 | Parameters 218 | ---------- 219 | metaclass : [type], optional 220 | [description], by default abc.ABCMeta 221 | 222 | Raises 223 | ------ 224 | Exception 225 | [description] 226 | """ 227 | 228 | @trace 229 | def __init__( 230 | self, 231 | spark: SparkSession, 232 | run_id: str 233 | ): 234 | self.spark = spark 235 | self.run_id = run_id 236 | 237 | @property 238 | def display_name(self) -> str: 239 | res = type(self).__name__ 240 | return res 241 | 242 | @trace 243 | def spark_read_table(self, name: str) -> DataStepDataframe: 244 | dt = self.spark.read.table(name) 245 | datastep_dataframe = DataStepDataframe(name=name, dataframe=dt) 246 | datastep_dataframe.log(DataDirection.IN) 247 | return datastep_dataframe 248 | 249 | @trace 250 | def spark_read_temp_table(self, name: str) -> DataStepDataframe: 251 | global_temp_db = self.spark.conf.get(f"spark.sql.globalTempDatabase") 252 | dt = self.spark.read.table(f'{global_temp_db}.{name}') 253 | datastep_dataframe = DataStepDataframe(name=name, dataframe=dt) 254 | datastep_dataframe.log_in() 255 | return 
datastep_dataframe 256 | 257 | @trace 258 | def spark_read_parquet_path(self, path: Path, cache=False) -> DataStepDataframe: 259 | path_str = str(path) 260 | dt = self.spark.read.parquet(path_str) 261 | datastep_dataframe = DataStepDataframe(name=path_str, dataframe=dt, cache=cache) 262 | datastep_dataframe.log_in() 263 | return datastep_dataframe 264 | 265 | @trace 266 | def pandas_read_csv(self, path: Path) -> DataStepDataframe: 267 | datastep_dataframe = self.spark_read_csv(path) 268 | datastep_dataframe.dataframe = datastep_dataframe.dataframe.toPandas() 269 | datastep_dataframe.log_in() 270 | return datastep_dataframe 271 | 272 | @trace 273 | def spark_read_csv(self, path: Path) -> DataStepDataframe: 274 | path_str = str(path) 275 | dt = self.spark.read.format("csv").\ 276 | option("inferSchema", "true").\ 277 | option("header", "true").\ 278 | option("delimiter", ",").\ 279 | option("charset", "utf-8").load(path_str) 280 | datastep_dataframe = DataStepDataframe(name=path_str, dataframe=dt) 281 | datastep_dataframe.log_in() 282 | return datastep_dataframe 283 | 284 | @trace 285 | def test_rows_leq(self, dt_1: DataStepDataframe, dt_2: DataStepDataframe): 286 | assert dt_1.rows < dt_2.rows,\ 287 | "ROWS CHECK: {dt_1_name} ({dt_1_rows}) < {dt_2_name} ({dt_2_rows})".format( 288 | dt_1_name=dt_1.name, 289 | dt_1_rows=dt_1.rows, 290 | dt_2_name=dt_2.name, 291 | dt_2_rows=dt_2.rows) 292 | print("Asserted") 293 | 294 | @trace 295 | def test_rows_eq(self, dt_1: DataStepDataframe, dt_2: DataStepDataframe): 296 | assert dt_1.rows == dt_2.rows,\ 297 | "ROWS CHECK: {dt_1_name} ({dt_1_rows}) == {dt_2_name} ({dt_2_rows})".format( 298 | dt_1_name=dt_1.name, 299 | dt_1_rows=dt_1.rows, 300 | dt_2_name=dt_2.name, 301 | dt_2_rows=dt_2.rows) 302 | print("Asserted") 303 | 304 | @trace 305 | def test_rows_geq(self, dt_1: DataStepDataframe, dt_2: DataStepDataframe): 306 | assert dt_1.rows >= dt_2.rows,\ 307 | "ROWS CHECK: {dt_1_name} ({dt_1_rows}) >= {dt_2_name} ({dt_2_rows})".format( 308 | dt_1_name=dt_1.name, 309 | dt_1_rows=dt_1.rows, 310 | dt_2_name=dt_2.name, 311 | dt_2_rows=dt_2.rows) 312 | print("Asserted") 313 | 314 | @trace 315 | def test_rows_diff(self, dt_1: DataStepDataframe, dt_2: DataStepDataframe): 316 | assert dt_1.rows != dt_2.rows,\ 317 | "ROWS CHECK: {dt_1_name} ({dt_1_rows}) >= {dt_2_name} ({dt_2_rows})".format( 318 | dt_1_name=dt_1.name, 319 | dt_1_rows=dt_1.rows, 320 | dt_2_name=dt_2.name, 321 | dt_2_rows=dt_2.rows) 322 | print("Asserted") 323 | 324 | @trace 325 | def test_negative_values(self, cols: List[str], dt: DataStepDataframe): 326 | for col in cols: 327 | assert col not in dt.columns_negative(), f"Dataset {dt.name} has columns with negative values -> {col}" 328 | 329 | @trace 330 | def test_null_values(self, cols: List[str], dt: DataStepDataframe): 331 | for col in cols: 332 | assert col not in dt.columns_null(), f"Dataset {dt.name} has columns with null values -> {col}" 333 | 334 | @trace 335 | def test_is_dataframe_empty(self, df: PyDataFrame): 336 | count = df.count() 337 | assert count > 0, "the dataframe count is zero" 338 | 339 | @property 340 | def output_data(self) -> DataStepDataframe: 341 | return self.__output_data 342 | 343 | @trace 344 | def check_output(self, **kwargs): 345 | if kwargs: 346 | for key, value in kwargs.items(): 347 | if isinstance(value, (PyDataFrame, PdDataFrame)): 348 | msg = "Output Pandas or PySpark dataframe must be encapsulated into DataStepDataframe" 349 | Log.get_instance().log_error(msg) 350 | raise ValueError(msg) 351 | elif 
isinstance(value, DataStepDataframe): 352 | value.log(direction=DataDirection.OUT) 353 | else: 354 | Log.get_instance().log_info(f'{key}:{value}') 355 | # setattr(self, key, value) 356 | 357 | @trace 358 | def set_output_data(self, dataframe: Union[PyDataFrame, PdDataFrame], name='', cache: bool = False): 359 | name = self.display_name if not name else name 360 | self.__output_data = DataStepDataframe( 361 | name=name, 362 | dataframe=dataframe, 363 | cache=cache) 364 | self.__output_data.log_out() 365 | 366 | @trace 367 | @abstractmethod 368 | def initialize(self): 369 | """ 370 | Define the DataStep logic. 371 | """ 372 | pass 373 | 374 | @trace 375 | @abstractmethod 376 | def tests(self): 377 | """ 378 | Define all the the tests that this step must pass 379 | """ 380 | pass 381 | -------------------------------------------------------------------------------- /src/modules/dbkdev/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark 2 | pandas>=0.24.2 -------------------------------------------------------------------------------- /src/modules/dbkenv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/dbkenv/__init__.py -------------------------------------------------------------------------------- /src/modules/dbkenv/core.py: -------------------------------------------------------------------------------- 1 | import databricks_cli.sdk as _dbkcli 2 | import databricks_cli.sdk.service as _dss 3 | from dbkcore.core import trace 4 | from dbkcore.core import Log 5 | # from dbkcore.core import Log 6 | import os as _os 7 | import base64 as _base64 8 | import tempfile as _tempfile 9 | import time as _time 10 | import enum as _enum 11 | import typing as _typing 12 | from dotenv import load_dotenv 13 | 14 | 15 | 16 | 17 | 18 | class Configuration(): 19 | """Retrieve the keys used from the package from the local environment.""" 20 | 21 | def __init__(self, file_load: bool = False): 22 | """ 23 | Initialize the configuration class. 24 | 25 | Parameters 26 | ---------- 27 | file_load : bool, optional 28 | Search .env file and loads it, by default False 29 | """ 30 | 31 | if file_load: 32 | load_dotenv(override=True) 33 | # self._APPINSIGHT_CONNECTIONSTRING = _os.getenv("APPI_IK") 34 | # self.DATABRICKS_HOST = _os.getenv('DATABRICKS_HOST') 35 | # self.DATABRICKS_TOKEN = _os.getenv('DATABRICKS_TOKEN') 36 | # self.DATABRICKS_ORDGID = _os.getenv('DATABRICKS_ORDGID') 37 | # self.AZURE_SUBSCRIPTIONID = _os.getenv('AZURE_SUBSCRIPTIONID') 38 | # self.RESOURCEGROUP_NAME = _os.getenv('RESOURCEGROUP_NAME') 39 | # self.RESOURCEGROUP_REGION = _os.getenv('RESOURCEGROUP_REGION') 40 | 41 | @property 42 | def APPINSIGHT_CONNECTIONSTRING(self) -> str: 43 | """ 44 | Application insight connection string. 45 | 46 | Returns 47 | ------- 48 | str 49 | The connection string 50 | """ 51 | res = _os.environ["APPI_IK"] 52 | return res 53 | 54 | @property 55 | def DATABRICKS_HOST(self) -> str: 56 | """ 57 | Databricks host url. 58 | 59 | Returns 60 | ------- 61 | str 62 | The host url 63 | """ 64 | res = _os.environ["DATABRICKS_HOST"] 65 | return res 66 | 67 | @property 68 | def DATABRICKS_TOKEN(self) -> str: 69 | """ 70 | Databricks personal roken. 
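The value is read from the `DATABRICKS_TOKEN` environment variable, e.g. exported in the shell or kept in a `.env` file that `Configuration(file_load=True)` loads; an illustrative entry would be `DATABRICKS_TOKEN=<personal-access-token>`.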
71 | 72 | Returns 73 | ------- 74 | str 75 | The token 76 | """ 77 | res = _os.environ["DATABRICKS_TOKEN"] 78 | return res 79 | 80 | @property 81 | def DATABRICKS_ORDGID(self) -> str: 82 | """ 83 | Databricks organization id. 84 | 85 | Returns 86 | ------- 87 | str 88 | The id 89 | """ 90 | res = _os.environ["DATABRICKS_ORDGID"] 91 | return res 92 | 93 | 94 | class DatabricksEnvironment(str, _enum.Enum): 95 | """ 96 | Describes the type of environment used 97 | """ 98 | LOCAL = 'local' 99 | DATABRICKS = 'databricks' 100 | 101 | 102 | class Package(): 103 | """ 104 | Rapresents the python package installed in databricks 105 | 106 | Reference 107 | --------- 108 | 109 | Documentation [link](https://docs.databricks.com/dev-tools/api/latest/libraries.html#example-response) 110 | """ 111 | 112 | @trace 113 | def __init__(self, origin: str, package: str, repo: str): 114 | """ 115 | Creates this object 116 | 117 | Parameters 118 | ---------- 119 | origin : str 120 | Origin of the package 121 | package : str 122 | Name and version of the package 123 | repo : str 124 | The repository of the package 125 | """ 126 | super().__init__() 127 | self.origin = origin 128 | self.package = package 129 | self.repo = repo 130 | 131 | @trace 132 | def to_api_json(self): 133 | return { 134 | self.origin: { 135 | 'package': self.package, 136 | 'repo': self.repo 137 | } 138 | } 139 | 140 | 141 | class ResourceClient(): 142 | """ 143 | Client used to interact with a Databricks cluster 144 | """ 145 | 146 | @trace(attrs_refact=['personal_token']) 147 | def __init__(self, host: str, personal_token: str): 148 | """ 149 | Instantiates this object 150 | 151 | Parameters 152 | ---------- 153 | host : str 154 | Host of the cluster 155 | personal_token : str 156 | Databricks personal token 157 | """ 158 | super().__init__() 159 | self.host = host 160 | self.personal_token = personal_token 161 | self.__api_client = None 162 | 163 | @property 164 | def apiClient(self) -> _dbkcli.ApiClient: 165 | """ 166 | Creates the Databricks API client 167 | 168 | Returns 169 | ------- 170 | ApiClient 171 | The client 172 | """ 173 | if not self.__api_client: 174 | self.__api_client = _dbkcli.ApiClient( 175 | host=self.host, 176 | token=self.personal_token 177 | ) 178 | return self.__api_client 179 | 180 | 181 | class Cluster(): 182 | """ 183 | Manages a Databricks cluster 184 | """ 185 | 186 | @trace 187 | def __init__( 188 | self, 189 | client: ResourceClient, 190 | cluster_name: str, 191 | cluster_configuration: dict 192 | ): 193 | """ 194 | Instantiates this object 195 | 196 | Parameters 197 | ---------- 198 | client : ResourceClient 199 | A ResourceClient object 200 | cluster_name : str 201 | Name of the cluster 202 | cluster_configuration : dict 203 | Dictionary that contains the cluster configuration 204 | appinsight_instrumentation_key : str 205 | Application Insights' instrumentation key 206 | """ 207 | self.client = client 208 | self.__cluster_name = cluster_name 209 | self.cluster_configuration = cluster_configuration 210 | # self.appinsight_instrumentation_key = appinsight_instrumentation_key 211 | self.__cluster_service = None 212 | 213 | @property 214 | def cluster_service(self) -> _dss.ClusterService: 215 | if not self.__cluster_service: 216 | self.__cluster_service = _dss.ClusterService(self.client.apiClient) 217 | return self.__cluster_service 218 | 219 | @trace 220 | def create_cluster_and_wait(self, redeploy=False) -> bool: 221 | """ 222 | Creates the cluster and waits for its done 223 | 224 | Parameters 225 | 
---------- 226 | redeploy: bool 227 | Redeploy the cluster 228 | """ 229 | if self.cluster_configuration: 230 | deploy = False 231 | if self.cluster_exists(): 232 | if redeploy: 233 | print("Cluster {} exists. Dropping and recreating".format(self.cluster_name)) 234 | self.delete_cluster() 235 | deploy = True 236 | else: 237 | deploy = False 238 | else: 239 | deploy = True 240 | 241 | if deploy: 242 | spark_env_vars = self.cluster_configuration["spark_env_vars"] 243 | # spark_env_vars["APPINSIGHT_CONNECTIONSTRING"] = self.appinsight_instrumentation_key 244 | spark_env_vars["EXECUTER"] = f"DATABRICKS_{self.cluster_name}" 245 | self.cluster_service.create_cluster( 246 | num_workers=self.cluster_configuration.get("num_workers"), 247 | autoscale=self.cluster_configuration.get("autoscale"), 248 | cluster_name=self.cluster_configuration.get("cluster_name"), 249 | spark_version=self.cluster_configuration.get("spark_version"), 250 | spark_conf=self.cluster_configuration.get("spark_conf"), 251 | node_type_id=self.cluster_configuration.get("node_type_id"), 252 | driver_node_type_id=self.cluster_configuration.get("driver_node_type_id"), 253 | spark_env_vars=spark_env_vars, 254 | autotermination_minutes=self.cluster_configuration.get("autotermination_minutes"), 255 | enable_elastic_disk=self.cluster_configuration.get("enable_elastic_disk") 256 | ) 257 | 258 | searched_times = 0 259 | while not self.cluster_exists(): 260 | searched_times += 1 261 | _time.sleep(10) 262 | if searched_times > 10: 263 | raise Exception("Cluster failed to deploy") 264 | self.start_cluster_and_wait() 265 | else: 266 | print("Can't deploy since cluster configuration is missing") 267 | return True 268 | 269 | @trace 270 | def databricks_list_clusters(self) -> _typing.List[str]: 271 | """ 272 | List clusters in Databricks 273 | 274 | Returns 275 | ------- 276 | List[str] 277 | List of clusters 278 | """ 279 | return self.cluster_service.list_clusters() 280 | 281 | @property 282 | def cluster_name(self) -> str: 283 | if self.cluster_configuration: 284 | return self.cluster_configuration.get("cluster_name") 285 | else: 286 | return self.__cluster_name 287 | 288 | @trace 289 | def cluster_started(self) -> bool: 290 | """ 291 | Checks if the cluster is started 292 | 293 | Returns 294 | ------- 295 | bool 296 | True if started 297 | """ 298 | started = False 299 | if self.cluster_state() == "RUNNING": 300 | started = True 301 | return started 302 | 303 | @trace 304 | def cluster_exists(self) -> bool: 305 | """ 306 | Checks is the cluster exists 307 | 308 | Returns 309 | ------- 310 | bool 311 | True if exists 312 | """ 313 | exists = False 314 | if self.cluster_id: 315 | exists = True 316 | return exists 317 | 318 | @trace 319 | def install_package(self, packages: _typing.List[Package]) -> str: 320 | """ 321 | Installs the given packages 322 | 323 | Parameters 324 | ---------- 325 | packages : List[Package] 326 | The packages to install 327 | 328 | Returns 329 | ------- 330 | str 331 | Result from Databricks API call 332 | """ 333 | mls = _dss.ManagedLibraryService(self.client.apiClient) 334 | pkgs = [p.to_api_json() for p in packages] 335 | res = mls.install_libraries(self.cluster_id, pkgs) 336 | return res 337 | 338 | @property 339 | def cluster_id(self) -> str: 340 | """ 341 | Retrieves cluster's id 342 | 343 | Returns 344 | ------- 345 | str 346 | Id of the cluster 347 | """ 348 | cs = self.cluster_service 349 | cluster_list = cs.list_clusters() 350 | id = None 351 | if cluster_list: 352 | matches = [c['cluster_id'] for c in 
cluster_list["clusters"] if c['cluster_name'] == self.cluster_name] 353 | if matches: 354 | id = matches[0] 355 | self.__cluster_id = id 356 | return self.__cluster_id 357 | 358 | @trace 359 | def delete_cluster_and_wait(self) -> bool: 360 | """ 361 | Deletes the cluster and waits for completion 362 | 363 | """ 364 | cs = self.cluster_service 365 | id = self.cluster_id 366 | 367 | is_deleted = False if id else True 368 | 369 | if not is_deleted: 370 | cs.permanent_delete_cluster(id) 371 | 372 | requests = 0 373 | seconds_interval = 20 374 | timeout_requests = 20 375 | 376 | is_deleted = True 377 | while self.cluster_exists(): 378 | requests += 1 379 | _time.sleep(seconds_interval) 380 | message = "Waiting from {} seconds. Timeout at {}".format( 381 | seconds_interval * requests, 382 | timeout_requests * seconds_interval 383 | ) 384 | 385 | Log.get_instance().log_info(message=message) 386 | 387 | if requests > 20: 388 | is_deleted = False 389 | return is_deleted 390 | 391 | @trace 392 | def cluster_state(self) -> str: 393 | """ 394 | Checks cluster state 395 | 396 | Returns 397 | ------- 398 | str 399 | State of the cluster 400 | """ 401 | cs = self.cluster_service 402 | cluster_state = cs.get_cluster(self.cluster_id)["state"] 403 | return cluster_state 404 | 405 | @trace 406 | def start_cluster_and_wait(self) -> bool: 407 | """ 408 | Starts the cluster and wait for it completion 409 | 410 | Returns 411 | ------- 412 | bool 413 | True if started 414 | """ 415 | cluster_id = self.cluster_id 416 | cs = self.cluster_service 417 | 418 | if self.cluster_state() == "RUNNING": 419 | return "Already running" 420 | elif self.cluster_state() == "PENDING": 421 | _time.sleep(20) 422 | # Waiting cluster to start 423 | requests = 0 424 | seconds_interval = 20 425 | timeout_requests = 20 426 | while self.cluster_state() == "PENDING": 427 | requests += 1 428 | _time.sleep(seconds_interval) 429 | message = "Waiting from {} seconds. Timeout at {}".format( 430 | seconds_interval * requests, 431 | timeout_requests * seconds_interval 432 | ) 433 | Log.get_instance().log_info(message=message) 434 | if requests > 20: 435 | raise Exception("Cluster not started") 436 | return True 437 | elif self.cluster_state() in ["TERMINATED", "TERMINATING"]: 438 | cs.start_cluster(cluster_id) 439 | _time.sleep(20) 440 | # Waiting cluster to start 441 | requests = 0 442 | seconds_interval = 20 443 | timeout_requests = 20 444 | while self.cluster_state() == "PENDING": 445 | requests += 1 446 | _time.sleep(seconds_interval) 447 | message = "Waiting from {} seconds. 
Timeout at {}".format( 448 | seconds_interval * requests, 449 | timeout_requests * seconds_interval 450 | ) 451 | Log.get_instance().log_info(message=message) 452 | if requests > 20: 453 | raise Exception("Cluster not started") 454 | return True 455 | else: 456 | raise Exception("Unmanaged state: {}".format(self.cluster_state())) 457 | 458 | 459 | class Jobs(): 460 | """ 461 | Rapresents a Databricks Job 462 | """ 463 | @trace 464 | def __init__( 465 | self, 466 | client: ResourceClient 467 | ): 468 | """ 469 | Instantiates this object 470 | 471 | Parameters 472 | ---------- 473 | client : ResourceClient 474 | A client 475 | """ 476 | self.client = client 477 | self.__jobs_service = _dss.JobsService(self.client.apiClient) 478 | 479 | @trace 480 | def run_notebook_and_wait( 481 | self, 482 | destination_path: str, 483 | cluster_id: str, 484 | delete_run=False, 485 | ) -> str: 486 | """ 487 | Run a notebooks and waits for its completion 488 | 489 | Parameters 490 | ---------- 491 | destination_path : str 492 | Notebooks path 493 | cluster_id : str 494 | Cluster's id 495 | delete_run : bool, optional 496 | Deletes the run onces it's completed, by default False 497 | 498 | Returns 499 | ------- 500 | str 501 | Result from Databricks API call 502 | """ 503 | djs = self.__jobs_service 504 | destination_path_dbfs = destination_path 505 | base = _os.path.basename(destination_path) 506 | filename = _os.path.splitext(base)[0] 507 | 508 | job = djs.create_job( 509 | name=filename, 510 | existing_cluster_id=cluster_id, 511 | notebook_task={"notebook_path": destination_path_dbfs} 512 | ) 513 | 514 | job_id = job["job_id"] 515 | run = djs.run_now(job_id=job_id) 516 | run_id = run['run_id'] 517 | run_status = djs.get_run(run_id) 518 | # run_state = run_status['state'] 519 | 520 | while run_status['state']["life_cycle_state"] != "TERMINATED": 521 | _time.sleep(10) 522 | run_status = djs.get_run(run_id) 523 | if run_status['state']["life_cycle_state"] == 'INTERNAL_ERROR': 524 | raise Exception(run_status['state']["life_cycle_state"]) 525 | 526 | output = None 527 | 528 | if run_status['state']['result_state'] == 'SUCCESS': 529 | output = djs.get_run_output(run_id).get('notebook_output').get('result') 530 | elif run_status['state']['result_state'] == 'FAILED': 531 | output = "FAILED" 532 | 533 | if delete_run: 534 | djs.delete_job(job_id) 535 | djs.delete_run(run_id) 536 | return output 537 | 538 | 539 | class Secret(): 540 | """ 541 | Manages Databricks' secrets 542 | """ 543 | 544 | @trace 545 | def __init__( 546 | self, 547 | client: ResourceClient 548 | ): 549 | """ 550 | Instantiates this object. 
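A minimal usage sketch (host and token values are placeholders):

    client = ResourceClient(host='https://adb-123.azuredatabricks.net', personal_token='<token>')
    secrets = Secret(client)
    secrets.add_scope('storage')
    secrets.add_secret(scope='storage', secret_name='name', secret_value='<storage-account-name>')

The scope and key names above mirror the ones read back by `dbkdev.core.DevelopmentClient` ('storage' scope with 'name', 'container' and 'key' secrets).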
551 | 552 | Parameters 553 | ---------- 554 | client : ResourceClient 555 | A ResourceClient object 556 | """ 557 | self.client = client 558 | self.__secret_service = None 559 | 560 | @property 561 | def secret_service(self): 562 | if not self.__secret_service: 563 | self.__secret_service = _dss.SecretService(self.client.apiClient) 564 | return self.__secret_service 565 | 566 | @trace 567 | def delete_scope(self, scope: str) -> str: 568 | """ 569 | Deletes the given scope 570 | 571 | Parameters 572 | ---------- 573 | scope : str 574 | Scope's name 575 | 576 | Returns 577 | ------- 578 | str 579 | Result from Databricks API call 580 | """ 581 | dbs = self.secret_service 582 | res = dbs.delete_scope(scope) 583 | return res 584 | 585 | @trace 586 | def add_scope(self, scope: str) -> str: 587 | """ 588 | Creates the scope 589 | 590 | Parameters 591 | ---------- 592 | scope : str 593 | Scope's name 594 | 595 | Returns 596 | ------- 597 | str 598 | Result from Databricks API call 599 | """ 600 | res = self.secret_service.create_scope( 601 | scope, 602 | initial_manage_principal='users' 603 | ) 604 | return res 605 | 606 | @trace(attrs_refact=['secret_value']) 607 | def add_secret(self, scope: str, secret_name: str, secret_value: str) -> str: 608 | """ 609 | Adds a secret to the given scope. 610 | If a secret already exists with the same name, it will be overwritten. 611 | 612 | Note 613 | ---- 614 | The server encrypts the secret using the secret scope’s encryption settings before storing it. You must have WRITE or MANAGE permission on the secret scope. 615 | 616 | Parameters 617 | ---------- 618 | scope : str 619 | Name of the scope 620 | secret_name : str 621 | Name of the secret 622 | secret_value : str 623 | Value of the secret 624 | 625 | Returns 626 | ------- 627 | str 628 | Result from Databricks API call 629 | """ 630 | dbs = self.secret_service 631 | res = dbs.put_secret( 632 | scope=scope, 633 | key=secret_name, 634 | string_value=secret_value, 635 | bytes_value=None 636 | ) 637 | return res 638 | 639 | def scopes(self) -> _typing.List[str]: 640 | """ 641 | Retrieve list of scopes. 642 | 643 | Returns 644 | ------- 645 | List[str] 646 | List of scopes 647 | """ 648 | dbs = self.secret_service 649 | sc = dbs.list_scopes() 650 | if sc: 651 | scopes = [s["name"] for s in sc["scopes"]] 652 | else: 653 | scopes = [] 654 | return scopes 655 | 656 | @trace(attrs_refact=['secrets']) 657 | def create_scope_secrets(self, scope: str, secrets: dict): 658 | """ 659 | Insert a secret under the provided scope with the given name. 660 | If the scope already exists, it will be dropped and recreated. 661 | If a secret already exists with the same name, it will be overwritten. 662 | 663 | Notes 664 | ----- 665 | The server encrypts the secret using the secret scope’s encryption settings before storing it. You must have WRITE or MANAGE permission on the secret scope. 
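For illustration, the `secrets` argument maps secret names to values, e.g. `{'name': '<account>', 'container': '<container>', 'key': '<access-key>'}` (placeholders only), matching the 'storage' scope consumed by `dbkdev.core.DevelopmentClient`.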
666 | 667 | Parameters 668 | ---------- 669 | scope : str 670 | Name of the scope 671 | secret_names : [str] 672 | Secrets names that must be searched in the Key Vault 673 | """ 674 | 675 | scopes = self.scopes() 676 | dbs = self.secret_service 677 | # The only way to update a secret is to delete it and reload it 678 | if scope in scopes: 679 | dbs.delete_scope(scope) 680 | 681 | self.add_scope(scope, dbs) 682 | for name, value in secrets.items(): 683 | self.add_secret( 684 | scope=scope, 685 | name=name, 686 | value=value 687 | ) 688 | 689 | @trace 690 | def list_scopes_secrets(self) -> _typing.Dict: 691 | """ 692 | Returns all the scopes and their secrets 693 | 694 | Returns 695 | ------- 696 | dict 697 | The secrets {"secret": ...} 698 | """ 699 | dbs = self.secret_service 700 | scopes = dbs.list_scopes()["scopes"] 701 | sc_names = [sc["name"] for sc in scopes] 702 | res = {} 703 | for name in sc_names: 704 | sc_secrets = dbs.list_secrets(name) 705 | secrets = None 706 | if sc_secrets: 707 | secrets = [k["key"] for k in sc_secrets["secrets"]] 708 | res[name] = secrets 709 | return res 710 | 711 | 712 | class Workspace(): 713 | """ 714 | Manages a Databricks Workspace 715 | """ 716 | 717 | def __init__( 718 | self, 719 | client: ResourceClient 720 | ): 721 | """ 722 | Instantiates this object 723 | 724 | Parameters 725 | ---------- 726 | client : ResourceClient 727 | A ResourceClient object 728 | """ 729 | self.client = client 730 | self.__workspace_service = None 731 | 732 | @property 733 | def workspace_service(self) -> _dss.WorkspaceService: 734 | """ 735 | The Databricks Workspace Service 736 | 737 | Returns 738 | ------- 739 | WorkspaceService 740 | Workspace service from Databricks API 741 | """ 742 | if not self.__workspace_service: 743 | self.__workspace_service = _dss.WorkspaceService(self.client.apiClient) 744 | return self.__workspace_service 745 | 746 | @trace 747 | def upload_content( 748 | self, 749 | destination_path: str, 750 | content: str, 751 | format="SOURCE", # TODO: Rename to content_format 752 | language="PYTHON", 753 | overwrite=True 754 | ) -> str: 755 | """ 756 | Uploads content to the workspace 757 | 758 | Parameters 759 | ---------- 760 | destination_path : str 761 | Destination of the file 762 | content : str 763 | File's content as string 764 | format : str, optional 765 | Databricks file format, by default "SOURCE" 766 | overwrite : bool, optional 767 | Overwrite the file if exists, by default True 768 | 769 | Returns 770 | ------- 771 | str 772 | Result from Databricks API call 773 | """ 774 | base64_bytes = _base64.b64encode(content.encode("utf-8")) 775 | base64_string = base64_bytes.decode('utf-8') 776 | dws = self.workspace_service 777 | file_dir = _os.path.dirname(destination_path) 778 | dws.mkdirs(file_dir) 779 | res = dws.import_workspace( 780 | path=destination_path, 781 | content=base64_string, 782 | format=format, 783 | language=language, 784 | overwrite=overwrite 785 | ) 786 | return res 787 | 788 | @trace 789 | def list_content(self, destination_folder: str) -> _typing.List[str]: 790 | """ 791 | Lists the content in the given folder 792 | 793 | Parameters 794 | ---------- 795 | destination_folder : str 796 | Folder to check 797 | 798 | Returns 799 | ------- 800 | List[str] 801 | List of files 802 | """ 803 | dws = self.workspace_service 804 | content = dws.list(destination_folder) 805 | return content 806 | 807 | @trace 808 | def delete_content(self, destination_path: str) -> str: 809 | """ 810 | Deletes the content at the given path 811 | 812 | 
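This is the same call `DatabricksResourceManager.run_python_code_on_notebook` uses to clean up after itself: it uploads a temporary notebook with `upload_content`, runs it, then removes it again with `delete_content(temp_file_path)`.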
Parameters 813 | ---------- 814 | destination_path : str 815 | Path of the file 816 | 817 | Returns 818 | ------- 819 | str 820 | Result from Databricks API call 821 | """ 822 | dws = self.workspace_service 823 | res = dws.delete(path=destination_path) 824 | return res 825 | 826 | @trace 827 | def make_dir(self, path_dir: str) -> str: 828 | """ 829 | Makes a directory in the given folder 830 | 831 | Parameters 832 | ---------- 833 | path_dir : str 834 | Base directory to which add the folder 835 | 836 | Returns 837 | ------- 838 | str 839 | Result from Databricks API call 840 | """ 841 | dws = self.workspace_service 842 | res = dws.mkdirs(path_dir) 843 | return res 844 | 845 | 846 | class DatabricksResourceManager(): 847 | """ 848 | The orchestrator for managing the Databricks resources with ease 849 | """ 850 | 851 | @trace 852 | def __init__( 853 | self, 854 | client: ResourceClient, 855 | cluster_name: str, 856 | cluster_configuration: dict, 857 | log_to_appi: bool = False 858 | ): 859 | """ 860 | Instantiates this object 861 | 862 | Parameters 863 | ---------- 864 | client : ResourceClient 865 | A ResourceClient object 866 | cluster_name : str 867 | Name of the cluster 868 | cluster_configuration : Dict, optional 869 | The configuration of the cluster, by default None 870 | log_to_appi: bool 871 | Log to application insights 872 | """ 873 | self.client = client 874 | 875 | self.cluster = Cluster( 876 | client=client, 877 | cluster_name=cluster_name, 878 | cluster_configuration=cluster_configuration 879 | # appinsight_instrumentation_key="", 880 | ) 881 | 882 | self.jobs = Jobs(client=self.client) 883 | self.secret = Secret(client=self.client) 884 | self.workspace = Workspace(client=self.client) 885 | self.log_to_appi = log_to_appi 886 | 887 | @trace 888 | def __dkea_token(self) -> str: 889 | """ 890 | Returns the DKEA token needed for using dbutils secrets locally. 
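In a local databricks-connect session the returned value is typically handed to `dbkdev.core.DevelopmentClient.set_dbkea`, which calls `dbutils.secrets.setToken(...)` so that secret scopes can be read outside the workspace (a usage sketch inferred from the two modules, not an officially documented flow).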
891 | 892 | Returns 893 | ------- 894 | str 895 | The token 896 | """ 897 | code = ''' 898 | v = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get() 899 | k = v.replace("Some(", "").replace(")", "") 900 | key = k[:4] + '-' + k[4:] 901 | 902 | dbutils.notebook.exit(key) 903 | ''' 904 | output_notebook = self.run_python_code_on_notebook(code) 905 | dkea = output_notebook.replace('-', '') 906 | return dkea 907 | 908 | @trace 909 | def run_python_code_on_notebook(self, code: str) -> str: 910 | """ 911 | Runs python code as a notebook 912 | 913 | Parameters 914 | ---------- 915 | code : str 916 | The code to execute 917 | 918 | Returns 919 | ------- 920 | str 921 | Results from Databricks API call 922 | """ 923 | # TODO: Add _and_wait in the name 924 | temp_name = "{}.py".format(next(_tempfile._get_candidate_names())) 925 | defult_tmp_dir = _tempfile._get_default_tempdir() 926 | temp_file_path = _os.path.join(defult_tmp_dir, temp_name) 927 | self.workspace.upload_content(temp_file_path, code) 928 | output = self.jobs.run_notebook_and_wait( 929 | destination_path=temp_file_path, 930 | cluster_id=self.cluster.cluster_id, 931 | delete_run=True 932 | ) 933 | self.workspace.delete_content(temp_file_path) 934 | return output 935 | 936 | @trace 937 | def run_notebook_and_wait( 938 | self, 939 | destination_path: str, 940 | delete_run=False 941 | ): 942 | return self.jobs.run_notebook_and_wait( 943 | destination_path=destination_path, 944 | cluster_id=self.cluster.cluster_id, 945 | delete_run=delete_run 946 | ) 947 | -------------------------------------------------------------------------------- /src/modules/dbkenv/local.py: -------------------------------------------------------------------------------- 1 | import pyspark.databricks_connect as _dbc 2 | 3 | 4 | 5 | 6 | class DatabricksLocal: 7 | """ 8 | Sets up the local environment to use a remote instance of Databricks 9 | """ 10 | def __init__( 11 | self, 12 | host: str, 13 | databricks_token: str, 14 | cluster_id: str, 15 | org_id: str, 16 | port=15001 17 | ): 18 | """ 19 | Instantiates this object. 
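A minimal sketch of the expected call (all values are placeholders):

    DatabricksLocal(
        host='https://adb-123.azuredatabricks.net',
        databricks_token='<personal-access-token>',
        cluster_id='<cluster-id>',
        org_id='<organization-id>',
    ).initialize()

`initialize()` writes the databricks-connect configuration and then runs its connection test.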
20 | 21 | Parameters 22 | ---------- 23 | host : str 24 | Databricks host 25 | databricks_token : str 26 | Personal token 27 | cluster_id : str 28 | Cluster's id 29 | org_id : str 30 | Organization id 31 | port : int, optional 32 | Port for connection, by default 15001 33 | """ 34 | self.host = host 35 | self.databricks_token = databricks_token 36 | self.cluster_id = cluster_id 37 | self.org_id = org_id 38 | self.port = port 39 | 40 | def initialize(self): 41 | """Initialize the configuration.""" 42 | _dbc.save_config( 43 | host=self.host, 44 | token=self.databricks_token, 45 | cluster=self.cluster_id, 46 | org_id=self.org_id, 47 | port=self.port 48 | ) 49 | _dbc.test() 50 | return True 51 | -------------------------------------------------------------------------------- /src/modules/dbkenv/requirements.txt: -------------------------------------------------------------------------------- 1 | databricks-cli==0.9.1 2 | databricks-connect==7.* -------------------------------------------------------------------------------- /src/modules/devmaint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/devmaint/__init__.py -------------------------------------------------------------------------------- /src/modules/devmaint/command_line.py: -------------------------------------------------------------------------------- 1 | from .docgenerator import generate_documentation 2 | import argparse 3 | from argparse import Namespace 4 | 5 | 6 | 7 | 8 | 9 | def arguments() -> Namespace: 10 | parser = argparse.ArgumentParser( 11 | prog='docgen', 12 | description='Generates the documentation of the given package.' 13 | ) 14 | parser.add_argument('-p', type=str, help='package parent folder') 15 | parser.add_argument('-n', type=str, help='package name') 16 | parser.add_argument('-o', default='documentation.md', help="output filename") 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def main(): 22 | """ 23 | Main function that runs this script 24 | """ 25 | 26 | args = arguments() 27 | 28 | package_parent_path = args.p.strip() 29 | package_name = args.n.strip() 30 | output_name = args.o.strip() 31 | 32 | outpath = generate_documentation( 33 | package_parent_path=package_parent_path, 34 | package_name=package_name, 35 | output_name=output_name 36 | ) 37 | 38 | print(f'Documentation saved in "{outpath}"') 39 | -------------------------------------------------------------------------------- /src/modules/devmaint/docgenerator.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module is used to create the documentation of the library 3 | in a compatible way with Azure DevOps Wiki. 
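The typical entry points are the `docgen` argument parser in `command_line.py` and the `generate_documentation` function; a minimal programmatic sketch (paths are illustrative):

```python
from devmaint.docgenerator import generate_documentation

generate_documentation(
    package_parent_path="src/modules",  # folder that contains the package
    package_name="dbkcore",             # package to document
    output_name="documentation.md",     # written into package_parent_path
)
```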
4 | 5 | ## Prerequisites 6 | 7 | Make sure that you have installed pandoc in Ubuntu 8 | 9 | ```bash 10 | sudo apt-get install pandoc 11 | ``` 12 | """ 13 | 14 | import pdoc 15 | from pdoc import Module 16 | from pypandoc import convert_text 17 | import pypandoc as pp 18 | import re 19 | from typing import List 20 | import sys 21 | from pathlib import Path 22 | 23 | 24 | 25 | 26 | def recursive_mods(mod: Module) -> List[Module]: 27 | """ 28 | Gets all the submodules of the given module 29 | 30 | Parameters 31 | ---------- 32 | mod : pdoc.Module 33 | The father module 34 | 35 | Yields 36 | ------- 37 | [pdoc.Module] 38 | List of the module with the submodules 39 | """ 40 | yield mod 41 | for submod in mod.submodules(): 42 | yield from recursive_mods(submod) 43 | 44 | 45 | def normalize_md(text: str) -> str: 46 | """ 47 | Applies a series of normalization steps to a text 48 | 49 | Parameters 50 | ---------- 51 | text : str 52 | Text to manipulate 53 | 54 | Returns 55 | ------- 56 | str 57 | The new text 58 | """ 59 | # Replace \r\n with \n 60 | res = re.sub(r"(\r\n)", '\n', text) 61 | # Remove multiple lines 62 | res = re.sub(r"(\n{2,})", '\n', res) 63 | # Fix title = 64 | res = re.sub(r"(\n\n)=", '\n', res) 65 | # Fix title - 66 | res = re.sub(r"(\n\n)-", '\n', res) 67 | # Fix parameters 68 | res = re.sub(r"(^\*{2,})", '\n**', res) 69 | return res 70 | 71 | 72 | def normalize_pandoc(text: str) -> str: 73 | """ 74 | Series of normalization steps for pandoc generated text 75 | 76 | Parameters 77 | ---------- 78 | text : str 79 | Pandoc text 80 | 81 | Returns 82 | ------- 83 | str 84 | Normalized pandoc text 85 | """ 86 | text_norm = re.sub(r"(\r\n)", '\n', text) 87 | text_norm = re.sub(r"( {4})", '\t', text_norm) 88 | text_norm = re.sub(r'(-----\n\nGenerated by.*)', '----', text_norm) 89 | return text_norm 90 | 91 | 92 | def set_methods_ado_links(text: str) -> str: 93 | """ 94 | Add links to the markdown text compatible with Azure DevOps 95 | 96 | Parameters 97 | ---------- 98 | text : str 99 | [description] 100 | 101 | Returns 102 | ------- 103 | str 104 | [description] 105 | """ 106 | text_links = text 107 | links = set(re.findall(r"\(#(.*)\)", text)) 108 | for link in links: 109 | text_links = text_links.replace("(#{})".format(link), "(#module-`{}`)".format(link)) 110 | return text_links 111 | 112 | 113 | def convert_format(text: str, f='markdown+abbreviations', t='commonmark') -> str: 114 | """ 115 | Converts a text from a format to another 116 | 117 | Parameters 118 | ---------- 119 | text : str 120 | The text to convert 121 | f : str, optional 122 | The from format, by default 'markdown+abbreviations' 123 | t : str, optional 124 | [description], by default 'commonmark' 125 | 126 | Returns 127 | ------- 128 | str 129 | [description] 130 | """ 131 | return convert_text(text, format=f, to=t) 132 | 133 | 134 | def modules_documentation_pdf(modules: List[Module], f='markdown+abbreviations', t='commonmark') -> str: 135 | doc_string = pdoc._render_template(template_name='/pdf.mako', modules=modules) 136 | md_raw = convert_format(doc_string) 137 | md = normalize_pandoc(md_raw) 138 | md = set_methods_ado_links(md) 139 | return md 140 | 141 | 142 | # def modules_documentation_text(modules: [pdoc.Module], f='markdown+abbreviations', t='commonmark') -> str: 143 | # """ 144 | # An attempt to create the documentation starting from the text extracted from pdoc. 
145 | # !Not to use 146 | 147 | # Parameters 148 | # ---------- 149 | # modules : list(pdoc.Module) 150 | # List of modules from which extract the documentation 151 | # f : str, optional 152 | # From pandoc format, by default 'markdown+abbreviations' 153 | # t : str, optional 154 | # To pandoc format, by default 'commonmark' 155 | 156 | # Returns 157 | # ------- 158 | # str 159 | # [description] 160 | # """ 161 | # docs = extract_documentation_as_text(modules) 162 | # md_raw = convert_text(docs, format=f, to=t) 163 | # md = normalize_pandoc(md_raw) 164 | # md = set_methods_ado_links(md) 165 | # return md 166 | 167 | 168 | def extract_documentation_as_text(modules: List[Module]) -> str: 169 | """ 170 | Extracts the documentation as text from the given modules. 171 | Then these are merged with a separator. 172 | 173 | Parameters 174 | ---------- 175 | modules : list(pdoc.Modules) 176 | The modules for which to create the documentation 177 | 178 | Returns 179 | ------- 180 | str 181 | The documentation as plain text 182 | """ 183 | module_texts = "___".join([x.text() for x in modules]) 184 | return module_texts 185 | 186 | 187 | def extract_modules(modules_parent: str) -> List[Module]: 188 | """ 189 | Extracts recursively all the modules that are part of the given module 190 | 191 | Parameters 192 | ---------- 193 | modules_parent : str 194 | Name of the module from which to start the extraction 195 | 196 | Returns 197 | ------- 198 | [pdoc.Module] 199 | List of pdoc modules 200 | """ 201 | context = pdoc.Context() 202 | pdoc.link_inheritance(context) 203 | mod = pdoc.Module(modules_parent, context=context) 204 | modules_ls = [m for m in recursive_mods(mod)] 205 | return modules_ls 206 | 207 | 208 | def test_markdown_convertions(text: str, module_name: str): 209 | pandoc_from, pandoc_to = pp.get_pandoc_formats() 210 | res = [] 211 | for f in [x for x in pandoc_from if 'mark' in x]: 212 | for t in [x for x in pandoc_to if 'mark' in x]: 213 | text_new = normalize_pandoc(convert_format(text, f, t)) 214 | res.append((text_new, f, t)) 215 | with open('docs_test\\test_doc_{}_{}_{}.md'.format(module_name, f, t), 'w', encoding="utf-8") as out: 216 | out.write(text_new) 217 | 218 | return True 219 | 220 | 221 | def create_adow_documentation(package: str) -> str: 222 | """ 223 | Creates the documentation to use with Azure DevOps Wiki (adow) 224 | 225 | Parameters 226 | ---------- 227 | package : str 228 | The name of the package for which to create the documentation 229 | 230 | Returns 231 | ------- 232 | str 233 | The documentation 234 | """ 235 | modules = extract_modules(package) 236 | modules_documentation = [modules_documentation_pdf([module]) for module in modules] 237 | package_documentation = "".join(modules_documentation) 238 | return package_documentation 239 | 240 | 241 | def generate_documentation(package_parent_path: str, package_name: str, output_name: str) -> Path: 242 | """ 243 | Generates and saves the documentation 244 | 245 | Parameters 246 | ---------- 247 | package_parent_path : str 248 | Parent folder of the package 249 | package_name : str 250 | Name of the package 251 | output_name : str 252 | Output file name 253 | """ 254 | output_path = Path(package_parent_path).joinpath(output_name) 255 | sys.path.append(package_parent_path) 256 | doc = create_adow_documentation(package_name) 257 | with open(str(output_path), 'w+', encoding="utf-8") as out: 258 | out.write(doc) 259 | return output_path 260 | --------------------------------------------------------------------------------
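The `devmaint` package above is the documentation generator: `command_line.arguments` collects `-p` (package parent folder), `-n` (package name) and `-o` (output filename) and hands them to `docgenerator.generate_documentation`, which renders every submodule with pdoc, converts the result with pandoc, and writes an Azure DevOps Wiki-compatible Markdown file inside the parent folder. A minimal sketch of calling the generator directly from Python follows; the `src/modules` path and the `dbkcore` package name are taken from this repository's layout but are only illustrative, and pandoc must already be installed as the module docstring notes.

```python
# Minimal sketch (illustrative paths): generate the Markdown documentation
# for one package with devmaint. Assumes src/modules is importable (e.g. via
# PYTHONPATH, as the dev container setup suggests) and that pandoc is installed.
from devmaint.docgenerator import generate_documentation

output_file = generate_documentation(
    package_parent_path='src/modules',  # folder that contains the package
    package_name='dbkcore',             # package to document
    output_name='documentation.md'      # file written inside package_parent_path
)
print(f'Documentation saved in "{output_file}"')
```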
/src/modules/devmaint/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | pypandoc>=1.4 3 | pdoc3>=0.7.4 -------------------------------------------------------------------------------- /src/modules/tests/dbkcore/test_logger.py: -------------------------------------------------------------------------------- 1 | """Test script.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | sys.path.append(str(Path(__file__).parent.parent.parent)) 6 | 7 | import os 8 | from dotenv import load_dotenv 9 | import pytest 10 | load_dotenv(override=True) 11 | 12 | 13 | from dbkcore.core import trace 14 | from dbkcore.core import Log 15 | Log("Unit Tests", os.environ['APPI_IK']) 16 | 17 | 18 | @trace 19 | def func_to_test_div(a, b): 20 | res = a / b 21 | return res 22 | 23 | 24 | def test_log_info(): 25 | Log.get_instance().log_info("Test info") 26 | 27 | 28 | def test_log_debug(): 29 | Log.get_instance().log_debug("Test debug") 30 | 31 | 32 | def test_log_critical(): 33 | Log.get_instance().log_critical("Test critical") 34 | 35 | 36 | def test_log_warning(): 37 | Log.get_instance().log_warning("Test warning") 38 | 39 | 40 | def test_log_error(): 41 | Log.get_instance().log_error("Test error") 42 | 43 | 44 | def test_trace(): 45 | func_to_test_div(a=3, b=2) 46 | 47 | 48 | def test_trace_error(): 49 | with pytest.raises(ZeroDivisionError): 50 | try: 51 | func_to_test_div(a=1, b=0) 52 | except ZeroDivisionError as e: 53 | Log.get_instance().log_error(f"Function call failed: {e}") 54 | raise e 55 | -------------------------------------------------------------------------------- /src/modules/tests/dbkenv/content/py_file.py: -------------------------------------------------------------------------------- 1 | 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | from __future__ import print_function 19 | 20 | import sys 21 | from random import random 22 | from operator import add 23 | 24 | from pyspark.sql import SparkSession 25 | 26 | 27 | if __name__ == "__main__": 28 | """ 29 | Usage: pi [partitions] 30 | """ 31 | spark = SparkSession\ 32 | .builder\ 33 | .appName("PythonPi")\ 34 | .getOrCreate() 35 | 36 | partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2 37 | n = 100000 * partitions 38 | 39 | def f(_): 40 | x = random() * 2 - 1 41 | y = random() * 2 - 1 42 | return 1 if x ** 2 + y ** 2 <= 1 else 0 43 | 44 | count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add) 45 | print("Pi is roughly %f" % (4.0 * count / n)) -------------------------------------------------------------------------------- /src/modules/tests/dbkenv/content/unittest_notebook.py: -------------------------------------------------------------------------------- 1 | dbutils.notebook.exit("success") -------------------------------------------------------------------------------- /src/modules/tests/dbkenv/test_cluster.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | sys.path.append(str(Path(__file__).parent.parent.parent)) 5 | 6 | from dbkcore.core import Log 7 | from dbkenv.core import ResourceClient 8 | from dbkenv.core import Configuration 9 | from dbkenv.core import DatabricksResourceManager 10 | from dbkenv.local import DatabricksLocal 11 | import json 12 | import os 13 | import pytest 14 | 15 | # import time 16 | 17 | 18 | def clients(): 19 | configuration = Configuration(file_load=True) 20 | cluster_config_file = str(Path(__file__).parent.joinpath('unittest_cluster.json')) 21 | 22 | with open(cluster_config_file, 'r') as cl: 23 | cluster_configuration = json.load(cl) 24 | 25 | cluster_name = cluster_configuration['cluster_name'] 26 | # instantiate the logger 27 | Log( 28 | name='unittest', 29 | connection_string=configuration.APPINSIGHT_CONNECTIONSTRING 30 | ) 31 | client = ResourceClient( 32 | host=configuration.DATABRICKS_HOST, 33 | personal_token=configuration.DATABRICKS_TOKEN 34 | ) 35 | drm = DatabricksResourceManager( 36 | client=client, 37 | cluster_name=cluster_name, 38 | cluster_configuration=cluster_configuration 39 | ) 40 | 41 | return drm 42 | 43 | 44 | def test_cluster_create(): 45 | assert clients().cluster.create_cluster_and_wait(), "Cluster not created" 46 | 47 | 48 | # def test_cluster_start(): 49 | # assert clients().cluster.cluster_started(), "Failed to start cluster" 50 | 51 | 52 | def test_local_dev(): 53 | configuration = Configuration(file_load=True) 54 | dbc = DatabricksLocal( 55 | host=configuration.DATABRICKS_HOST, 56 | databricks_token=configuration.DATABRICKS_TOKEN, 57 | cluster_id=clients().cluster.cluster_id, 58 | org_id=configuration.DATABRICKS_ORDGID 59 | ) 60 | success = dbc.initialize() 61 | assert success, "Failed to configure locally" 62 | 63 | 64 | # Test content 65 | source_file_name = 'unittest_notebook.py' 66 | source_file_path = str(Path(__file__).parent.joinpath('content', source_file_name)) 67 | with open(source_file_path, 'r') as file: 68 | data = file.read() 69 | 70 | destination_dir = "/unittesting" 71 | destination_file_path = os.path.join(destination_dir, source_file_name) 72 | 73 | 74 | def test_file_upload(): 75 | clients().workspace.make_dir(destination_dir) 76 | clients().workspace.upload_content(destination_file_path, data) 77 | content = 
clients().workspace.list_content(destination_folder=destination_file_path) 78 | elements_in_folder = [os.path.basename(e["path"]) for e in content['objects']] 79 | assert source_file_name in elements_in_folder, "Failed to upload the file" 80 | 81 | 82 | def test_file_run(): 83 | output = clients().run_notebook_and_wait( 84 | destination_path=destination_file_path, 85 | delete_run=True 86 | ) 87 | assert output == "success", "Failed to upload and run notebook" 88 | 89 | 90 | def test_file_delete(): 91 | content = clients().workspace.list_content(destination_folder=destination_dir) 92 | if not content: 93 | pytest.skip("Folder is empty") 94 | elif source_file_name not in [os.path.basename(e["path"]) for e in content['objects']]: 95 | pytest.skip("File not in folder") 96 | 97 | clients().workspace.delete_content(destination_file_path) 98 | content = clients().workspace.list_content(destination_folder=destination_dir) 99 | elements_in_folder = [] 100 | if content: 101 | elements_in_folder = [os.path.basename(e["path"]) for e in content['objects']] 102 | assert source_file_name not in elements_in_folder, "Failed to delete the file" 103 | clients().workspace.delete_content(destination_dir) 104 | 105 | 106 | def test_run_code(): 107 | code = ''' 108 | a = 1 109 | b = 2 110 | c = a + b 111 | dbutils.notebook.exit(c) 112 | ''' 113 | output = clients().run_python_code_on_notebook(code) 114 | assert output == "3", "Failed to compute code" 115 | 116 | 117 | def test_cluster_delete(): 118 | assert clients().cluster.delete_cluster_and_wait(), "Failed to delete cluster" 119 | -------------------------------------------------------------------------------- /src/modules/tests/dbkenv/unittest_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "autoscale": { 3 | "min_workers": 1, 4 | "max_workers": 4 5 | }, 6 | "num_workers": 1, 7 | "cluster_name": "unittest_cluster", 8 | "spark_version": "7.3.x-cpu-ml-scala2.12", 9 | "spark_conf": { 10 | "spark.databricks.delta.preview.enabled": "true" 11 | }, 12 | "node_type_id": "Standard_DS3_v2", 13 | "driver_node_type_id": "Standard_DS3_v2", 14 | "ssh_public_keys": [], 15 | "custom_tags": {}, 16 | "spark_env_vars": { 17 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 18 | }, 19 | "autotermination_minutes": 60, 20 | "enable_elastic_disk": true 21 | } -------------------------------------------------------------------------------- /src/modules/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # content of pytest.ini 2 | [pytest] 3 | markers = 4 | cenmaint: test for cenmaint package 5 | cenlocal: test for cenlocal package 6 | cendatabricks: test for cendatabricks package -------------------------------------------------------------------------------- /src/pipelines/dbkframework/documentation.md: -------------------------------------------------------------------------------- 1 | # Module `dbkcore` 2 | 3 | ## Sub-modules 4 | 5 | - [dbkcore.core](#module-`dbkcore.core`) 6 | - [dbkcore.helpers](#module-`dbkcore.helpers`) 7 | 8 | ---- 9 | # Module `dbkcore.core` 10 | 11 | ## Functions 12 | 13 | ### Function `trace` 14 | 15 | > 16 | > 17 | > def trace( 18 | > original_function: Any = None, 19 | > *, 20 | > attrs_refact: List[str] = None 21 | > ) 22 | 23 | Log the function call.
24 | 25 | ###### Parameters 26 | 27 | - **`original_function`** : Any, optional 28 | Function to trace, by default None 29 | - **`attrs_refact`** : List\[str\], optional 30 | List of parameters to hide from logging, by default None 31 | 32 | ## Classes 33 | 34 | ### Class `BaseObject` 35 | 36 | > 37 | > 38 | > class BaseObject 39 | 40 | Base class to use with any object new object. It implements the method 41 | log which will be used for logging 42 | 43 | #### Static methods 44 | 45 | ##### `Method class_name` 46 | 47 | > 48 | > 49 | > def class_name( 50 | > cls 51 | > ) ‑> str 52 | 53 | #### Methods 54 | 55 | ##### Method `log` 56 | 57 | > 58 | > 59 | > def log( 60 | > self, 61 | > prefix='', 62 | > suffix='' 63 | > ) 64 | 65 | Specifices how to log the object 66 | 67 | ### Class `Log` 68 | 69 | > 70 | > 71 | > class Log( 72 | > name: str, 73 | > connection_string: str = None 74 | > ) 75 | 76 | Helper class for Application Insight Logger. 77 | 78 | Create a new Log object. 79 | 80 | #### Parameters 81 | 82 | - **`name`** : str 83 | Name used by the logger for tracing 84 | - **`connection_string`** : \[type\], optional 85 | Application Insight’s connection string 86 | 87 | #### Instance variables 88 | 89 | ##### Variable `logger` 90 | 91 | Type: `logging.Logger` 92 | 93 | Logger that will be used. 94 | 95 | ###### Returns 96 | 97 | - Logger 98 | This logger 99 | 100 | ##### Variable `tracer` 101 | 102 | Type: `opencensus.trace.tracer.Tracer` 103 | 104 | Tracer that will be used. 105 | 106 | ###### Returns 107 | 108 | - Tracer 109 | The tracer 110 | 111 | #### Static methods 112 | 113 | ##### `Method get_instance` 114 | 115 | > 116 | > 117 | > def get_instance() 118 | 119 | Current instance 120 | 121 | #### Methods 122 | 123 | ##### Method `log_critical` 124 | 125 | > 126 | > 127 | > def log_critical( 128 | > self, 129 | > message: str, 130 | > prefix='', 131 | > custom_dimension: dict = None 132 | > ) 133 | 134 | Log a message as critical. 135 | 136 | ###### Parameters 137 | 138 | - **`message`** : str 139 | The message 140 | 141 | ##### Method `log_debug` 142 | 143 | > 144 | > 145 | > def log_debug( 146 | > self, 147 | > message: str, 148 | > prefix='', 149 | > custom_dimension: dict = None 150 | > ) 151 | 152 | Log a message as debug. 153 | 154 | ###### Parameters 155 | 156 | - **`message`** : str 157 | The message 158 | 159 | ##### Method `log_error` 160 | 161 | > 162 | > 163 | > def log_error( 164 | > self, 165 | > message: str, 166 | > include_stack=True, 167 | > prefix='', 168 | > custom_dimension: dict = None 169 | > ) 170 | 171 | Log a message as error. 172 | 173 | ###### Parameters 174 | 175 | - **`message`** : str 176 | The message 177 | 178 | ##### Method `log_info` 179 | 180 | > 181 | > 182 | > def log_info( 183 | > self, 184 | > message: str, 185 | > prefix='', 186 | > custom_dimension: dict = None 187 | > ) 188 | 189 | Log a message as info. 190 | 191 | ###### Parameters 192 | 193 | - **`message`** : str 194 | The message 195 | 196 | ##### Method `log_warning` 197 | 198 | > 199 | > 200 | > def log_warning( 201 | > self, 202 | > message: str, 203 | > prefix='', 204 | > custom_dimension: dict = None 205 | > ) 206 | 207 | Log a message as warning. 
208 | 209 | ###### Parameters 210 | 211 | - **`message`** : str 212 | The message 213 | 214 | ##### Method `trace_function` 215 | 216 | > 217 | > 218 | > def trace_function( 219 | > self, 220 | > name: str, 221 | > kwargs: dict 222 | > ) ‑> Optional[opencensus.trace.span.Span] 223 | 224 | Traces a function 225 | 226 | ###### Parameters 227 | 228 | - **`name`** : str 229 | Name of the function used for tracing 230 | - **`name`** : kwargs 231 | The parameters of the function 232 | 233 | ###### Returns 234 | 235 | - Span 236 | A Span that can be used for customizing logging 237 | 238 | ### Class `Singleton` 239 | 240 | > 241 | > 242 | > class Singleton( 243 | > *args, 244 | > **kwargs 245 | > ) 246 | 247 | Create a singleton. 248 | 249 | #### Ancestors (in MRO) 250 | 251 | - [builtins.type](#module-`builtins.type`) 252 | 253 | ---- 254 | # Module `dbkcore.helpers` 255 | 256 | Various utilities for speed up development 257 | 258 | ## Functions 259 | 260 | ### Function `add_folder_in_current_directory` 261 | 262 | > 263 | > 264 | > def add_folder_in_current_directory( 265 | > folder_name: str 266 | > ) ‑> bool 267 | 268 | Add a folder in the current directory. 269 | 270 | ###### Parameters 271 | 272 | - **`folder_name`** : str 273 | New folder name 274 | 275 | ###### Returns 276 | 277 | - bool 278 | True if success 279 | 280 | ### Function `current_directory` 281 | 282 | > 283 | > 284 | > def current_directory() ‑> str 285 | 286 | Get current directory. 287 | 288 | ###### Returns 289 | 290 | - str 291 | The current directory path 292 | 293 | ### Function `is_json_serializable` 294 | 295 | > 296 | > 297 | > def is_json_serializable( 298 | > x: Any 299 | > ) ‑> bool 300 | 301 | Check if the object is serializable. 302 | 303 | ###### Parameters 304 | 305 | - **`x`** : Any 306 | Object to validate 307 | 308 | ###### Returns 309 | 310 | - bool 311 | True if success 312 | 313 | ---- 314 | 315 | 316 | # Module `dbkdev` 317 | 318 | ## Sub-modules 319 | 320 | - [dbkdev.core](#module-`dbkdev.core`) 321 | - [dbkdev.data\_steps](#module-`dbkdev.data_steps`) 322 | 323 | ---- 324 | # Module `dbkdev.core` 325 | 326 | ## Classes 327 | 328 | ### Class `DevelopmentClient` 329 | 330 | > 331 | > 332 | > class DevelopmentClient( 333 | > dbutils, 334 | > spark: pyspark.sql.session.SparkSession, 335 | > ide_environment: dbkdev.core.IdeEnvironment 336 | > ) 337 | 338 | Client to use for local Databricks’ local development 339 | 340 | Instantiates this object 341 | 342 | #### Parameters 343 | 344 | - **`dbutils`** : Dbutils 345 | The Dbutils instance to use 346 | - **`spark`** : SparkSession 347 | The SparkSession to use 348 | - **`ide_environment`** 349 | : [IdeEnvironment](#module-`dbkdev.core.IdeEnvironment "dbkdev.core.IdeEnvironment"`) 350 | The environment used 351 | - **`deployment_environment`** : DeploymentEnvironment 352 | The deployment environment 353 | 354 | #### Instance variables 355 | 356 | ##### Variable `dbutils` 357 | 358 | Type: `Any` 359 | 360 | ##### Variable `ide_environment` 361 | 362 | Type: `dbkdev.core.IdeEnvironment` 363 | 364 | ##### Variable `mount_name` 365 | 366 | Type: `str` 367 | 368 | Standard name of the root mount for the configured storage account and 369 | container 370 | 371 | ###### Returns 372 | 373 | - str 374 | \[description\] 375 | 376 | ##### Variable `mount_path` 377 | 378 | Type: `str` 379 | 380 | Standard mount path 381 | 382 | ###### Returns 383 | 384 | - str 385 | The path 386 | 387 | ##### Variable `spark` 388 | 389 | Type: `pyspark.sql.session.SparkSession` 390 | 391 
| #### Static methods 392 | 393 | ##### `Method get_instance` 394 | 395 | > 396 | > 397 | > def get_instance() 398 | 399 | #### Methods 400 | 401 | ##### Method `create_schema` 402 | 403 | > 404 | > 405 | > def create_schema( 406 | > self, 407 | > schema_databricks: str 408 | > ) 409 | 410 | Creates a schema in Databricks 411 | 412 | ###### Parameters 413 | 414 | - **`schema_databricks`** : str 415 | Name of the schema 416 | 417 | ##### Method `files` 418 | 419 | > 420 | > 421 | > def files( 422 | > self, 423 | > path: str 424 | > ) ‑> list 425 | 426 | ##### Method `list_databases` 427 | 428 | > 429 | > 430 | > def list_databases( 431 | > self 432 | > ) ‑> List[str] 433 | 434 | Gets the list of Databricks databases (a.k.a. schemas) 435 | 436 | ###### Returns 437 | 438 | - List\[str\] 439 | List of schemas 440 | 441 | ##### Method `list_mounts` 442 | 443 | > 444 | > 445 | > def list_mounts( 446 | > self 447 | > ) ‑> list 448 | 449 | ##### Method `list_tables` 450 | 451 | > 452 | > 453 | > def list_tables( 454 | > self, 455 | > schema: str 456 | > ) ‑> List[str] 457 | 458 | List the tables in the given schema 459 | 460 | ###### Parameters 461 | 462 | - **`schema`** : str 463 | The Databricks schema 464 | 465 | ###### Returns 466 | 467 | - List\[str\] 468 | List of tables 469 | 470 | ##### Method `load_temp_table` 471 | 472 | > 473 | > 474 | > def load_temp_table( 475 | > self, 476 | > table_name: str 477 | > ) ‑> pyspark.sql.dataframe.DataFrame 478 | 479 | ##### Method `mount_exists` 480 | 481 | > 482 | > 483 | > def mount_exists( 484 | > self, 485 | > mount_name: str 486 | > ) ‑> bool 487 | 488 | ##### Method `read_csv` 489 | 490 | > 491 | > 492 | > def read_csv( 493 | > self, 494 | > file_path: str 495 | > ) ‑> pyspark.sql.dataframe.DataFrame 496 | 497 | ##### Method `read_parquet` 498 | 499 | > 500 | > 501 | > def read_parquet( 502 | > self, 503 | > file_path: str 504 | > ) ‑> pyspark.sql.dataframe.DataFrame 505 | 506 | ##### Method `save_delta_table` 507 | 508 | > 509 | > 510 | > def save_delta_table( 511 | > self, 512 | > dataframe: pyspark.sql.dataframe.DataFrame, 513 | > schema: str, 514 | > table_name: str, 515 | > output_path: pathlib.Path, 516 | > partition_columns: List[str] = None, 517 | > mode: str = 'overwrite', 518 | > overwrite_schema: bool = False 519 | > ) 520 | 521 | Saves the dataframe as a delta table in an external location 522 | \#\#\#\#\#\# Parameters 523 | 524 | - **`dataframe`** : DataFrame 525 | The dataframe 526 | - **`schema`** : str 527 | Destination schema 528 | - **`table_name`** : str 529 | Destination schema 530 | - **`output_path`** : Path 531 | Folder where to save the dataframe 532 | - **`partition_columns`** : List\[str\] 533 | Columns to use for partitioning, default is None 534 | - **`mode`** : str 535 |   536 | 537 | e.g. 
append, overwrite, passed to dataframe.write.saveAsTable 538 | 539 | ##### Method `save_temp_table` 540 | 541 | > 542 | > 543 | > def save_temp_table( 544 | > self, 545 | > dataframe: pyspark.sql.dataframe.DataFrame, 546 | > table_name: str, 547 | > cache=True 548 | > ) 549 | 550 | ##### Method `set_dbkea` 551 | 552 | > 553 | > 554 | > def set_dbkea( 555 | > self, 556 | > dbkea_token: str 557 | > ) 558 | 559 | To use when the environment is LOCAL for using the dbutils secrets 560 | 561 | ###### Parameters 562 | 563 | - **`dbkea_token`** : str 564 | The token 565 | 566 | ##### Method `table_exists` 567 | 568 | > 569 | > 570 | > def table_exists( 571 | > self, 572 | > schema_name: str, 573 | > table_name: str 574 | > ) 575 | 576 | ### Class `DevelopmentEngine` 577 | 578 | > 579 | > 580 | > class DevelopmentEngine 581 | 582 | #### Static methods 583 | 584 | ##### `Method get_instance` 585 | 586 | > 587 | > 588 | > def get_instance() 589 | 590 | ### Class `IdeEnvironment` 591 | 592 | > 593 | > 594 | > class IdeEnvironment( 595 | > value, 596 | > names=None, 597 | > *, 598 | > module=None, 599 | > qualname=None, 600 | > type=None, 601 | > start=1 602 | > ) 603 | 604 | An enumeration. 605 | 606 | #### Ancestors (in MRO) 607 | 608 | - [builtins.str](#module-`builtins.str`) 609 | - [enum.Enum](#module-`enum.Enum`) 610 | 611 | #### Class variables 612 | 613 | ##### Variable `DATABRICKS` 614 | 615 | ##### Variable `LOCAL` 616 | 617 | ---- 618 | # Module `dbkdev.data_steps` 619 | 620 | ## Functions 621 | 622 | ### Function `apply_test` 623 | 624 | > 625 | > 626 | > def apply_test( 627 | > func 628 | > ) 629 | 630 | Execute test function after the initialize. 631 | 632 | ###### Notes 633 | 634 | [Example](https://stackoverflow.com/a/15196410) 635 | 636 | ### Function `log_output` 637 | 638 | > 639 | > 640 | > def log_output( 641 | > func 642 | > ) 643 | 644 | Decorator for executing test in sequence 645 | 646 | ###### Notes 647 | 648 | [Example](https://stackoverflow.com/a/15196410) 649 | 650 | ### Function `pre_apply_test` 651 | 652 | > 653 | > 654 | > def pre_apply_test( 655 | > func 656 | > ) 657 | 658 | Execute test function before the initialize. 659 | 660 | ###### Notes 661 | 662 | [Example](https://stackoverflow.com/a/15196410) 663 | 664 | ## Classes 665 | 666 | ### Class `DataDirection` 667 | 668 | > 669 | > 670 | > class DataDirection( 671 | > value, 672 | > names=None, 673 | > *, 674 | > module=None, 675 | > qualname=None, 676 | > type=None, 677 | > start=1 678 | > ) 679 | 680 | An enumeration. 
681 | 682 | #### Ancestors (in MRO) 683 | 684 | - [builtins.str](#module-`builtins.str`) 685 | - [enum.Enum](#module-`enum.Enum`) 686 | 687 | #### Class variables 688 | 689 | ##### Variable `IN` 690 | 691 | ##### Variable `OUT` 692 | 693 | ### Class `DataStep` 694 | 695 | > 696 | > 697 | > class DataStep( 698 | > spark: pyspark.sql.session.SparkSession, 699 | > run_id: str 700 | > ) 701 | 702 | Creates a datastep to be used in a pipeline 703 | 704 | #### Parameters 705 | 706 | - **`metaclass`** : \[type\], optional 707 | \[description\], by default abc.ABCMeta 708 | 709 | #### Raises 710 | 711 | - Exception 712 | \[description\] 713 | 714 | #### Instance variables 715 | 716 | ##### Variable `display_name` 717 | 718 | Type: `str` 719 | 720 | ##### Variable `output_data` 721 | 722 | Type: `dbkdev.data_steps.DataStepDataframe` 723 | 724 | #### Methods 725 | 726 | ##### Method `check_output` 727 | 728 | > 729 | > 730 | > def check_output( 731 | > self, 732 | > **kwargs 733 | > ) 734 | 735 | ##### Method `initialize` 736 | 737 | > 738 | > 739 | > def initialize( 740 | > self 741 | > ) 742 | 743 | Define the DataStep logic. 744 | 745 | ##### Method `pandas_read_csv` 746 | 747 | > 748 | > 749 | > def pandas_read_csv( 750 | > self, 751 | > path: pathlib.Path 752 | > ) ‑> dbkdev.data_steps.DataStepDataframe 753 | 754 | ##### Method `set_output_data` 755 | 756 | > 757 | > 758 | > def set_output_data( 759 | > self, 760 | > dataframe: Union[pyspark.sql.dataframe.DataFrame, pandas.core.frame.DataFrame], 761 | > name='', 762 | > cache: bool = False 763 | > ) 764 | 765 | ##### Method `spark_read_csv` 766 | 767 | > 768 | > 769 | > def spark_read_csv( 770 | > self, 771 | > path: pathlib.Path 772 | > ) ‑> dbkdev.data_steps.DataStepDataframe 773 | 774 | ##### Method `spark_read_parquet_path` 775 | 776 | > 777 | > 778 | > def spark_read_parquet_path( 779 | > self, 780 | > path: pathlib.Path, 781 | > cache=False 782 | > ) ‑> dbkdev.data_steps.DataStepDataframe 783 | 784 | ##### Method `spark_read_table` 785 | 786 | > 787 | > 788 | > def spark_read_table( 789 | > self, 790 | > name: str 791 | > ) ‑> dbkdev.data_steps.DataStepDataframe 792 | 793 | ##### Method `spark_read_temp_table` 794 | 795 | > 796 | > 797 | > def spark_read_temp_table( 798 | > self, 799 | > name: str 800 | > ) ‑> dbkdev.data_steps.DataStepDataframe 801 | 802 | ##### Method `test_is_dataframe_empty` 803 | 804 | > 805 | > 806 | > def test_is_dataframe_empty( 807 | > self, 808 | > df: pyspark.sql.dataframe.DataFrame 809 | > ) 810 | 811 | ##### Method `test_negative_values` 812 | 813 | > 814 | > 815 | > def test_negative_values( 816 | > self, 817 | > cols: List[str], 818 | > dt: dbkdev.data_steps.DataStepDataframe 819 | > ) 820 | 821 | ##### Method `test_null_values` 822 | 823 | > 824 | > 825 | > def test_null_values( 826 | > self, 827 | > cols: List[str], 828 | > dt: dbkdev.data_steps.DataStepDataframe 829 | > ) 830 | 831 | ##### Method `test_rows_diff` 832 | 833 | > 834 | > 835 | > def test_rows_diff( 836 | > self, 837 | > dt_1: dbkdev.data_steps.DataStepDataframe, 838 | > dt_2: dbkdev.data_steps.DataStepDataframe 839 | > ) 840 | 841 | ##### Method `test_rows_eq` 842 | 843 | > 844 | > 845 | > def test_rows_eq( 846 | > self, 847 | > dt_1: dbkdev.data_steps.DataStepDataframe, 848 | > dt_2: dbkdev.data_steps.DataStepDataframe 849 | > ) 850 | 851 | ##### Method `test_rows_geq` 852 | 853 | > 854 | > 855 | > def test_rows_geq( 856 | > self, 857 | > dt_1: dbkdev.data_steps.DataStepDataframe, 858 | > dt_2: dbkdev.data_steps.DataStepDataframe 859 | > ) 
860 | 861 | ##### Method `test_rows_leq` 862 | 863 | > 864 | > 865 | > def test_rows_leq( 866 | > self, 867 | > dt_1: dbkdev.data_steps.DataStepDataframe, 868 | > dt_2: dbkdev.data_steps.DataStepDataframe 869 | > ) 870 | 871 | ##### Method `tests` 872 | 873 | > 874 | > 875 | > def tests( 876 | > self 877 | > ) 878 | 879 | Define all the tests that this step must pass 880 | 881 | ### Class `DataStepDataframe` 882 | 883 | > 884 | > 885 | > class DataStepDataframe( 886 | > name: str, 887 | > dataframe: Union[pyspark.sql.dataframe.DataFrame, pandas.core.frame.DataFrame], 888 | > cache=False 889 | > ) 890 | 891 | Base class to use with any new object. It implements the method 892 | log which will be used for logging 893 | 894 | #### Ancestors (in MRO) 895 | 896 | - [dbkcore.core.BaseObject](#module-`dbkcore.core.BaseObject`) 897 | 898 | #### Instance variables 899 | 900 | ##### Variable `is_pandas` 901 | 902 | Type: `bool` 903 | 904 | ##### Variable `is_pyspark` 905 | 906 | Type: `bool` 907 | 908 | ##### Variable `rows` 909 | 910 | #### Methods 911 | 912 | ##### Method `columns_negative` 913 | 914 | > 915 | > 916 | > def columns_negative( 917 | > self 918 | > ) ‑> List[str] 919 | 920 | Identifies the columns with negative values 921 | 922 | ###### Returns 923 | 924 | - List\[str\] 925 | Column names 926 | 927 | ##### Method `columns_null` 928 | 929 | > 930 | > 931 | > def columns_null( 932 | > self 933 | > ) ‑> List[str] 934 | 935 | Identifies the columns with null values 936 | 937 | ###### Returns 938 | 939 | - List\[str\] 940 | Column names 941 | 942 | ##### Method `log` 943 | 944 | > 945 | > 946 | > def log( 947 | > self, 948 | > direction: dbkdev.data_steps.DataDirection 949 | > ) 950 | 951 | Specifies how to log the object 952 | 953 | ##### Method `log_in` 954 | 955 | > 956 | > 957 | > def log_in( 958 | > self 959 | > ) 960 | 961 | ##### Method `log_out` 962 | 963 | > 964 | > 965 | > def log_out( 966 | > self 967 | > ) 968 | 969 | ##### Method `to_pandas` 970 | 971 | > 972 | > 973 | > def to_pandas( 974 | > self 975 | > ) ‑> pandas.core.frame.DataFrame 976 | 977 | ---- 978 | 979 | 980 | # Module `acai_ml` 981 | 982 | ## Sub-modules 983 | 984 | - [acai\_ml.core](#module-`acai_ml.core`) 985 | 986 | ---- 987 | # Module `acai_ml.core` 988 | 989 | ## Classes 990 | 991 | ### Class `Engine` 992 | 993 | > 994 | > 995 | > class Engine 996 | 997 | This is the core of the framework. It configures the environment to interact with the remote Databricks cluster.
998 | 999 | Instantiate the current object 1000 | 1001 | #### Static methods 1002 | 1003 | ##### `Method get_instance` 1004 | 1005 | > 1006 | > 1007 | > def get_instance() 1008 | 1009 | Current singleton Engine 1010 | 1011 | ###### Returns 1012 | 1013 | - [Engine](#module-`acai_ml.core.Engine "acai_ml.core.Engine"`) 1014 | The Engine 1015 | 1016 | ##### `Method ide_environment` 1017 | 1018 | > 1019 | > 1020 | > def ide_environment() ‑> dbkdev.core.IdeEnvironment 1021 | 1022 | Current Ide Environment 1023 | 1024 | ###### Returns 1025 | 1026 | - IdeEnvironment 1027 | The Ide Environment 1028 | 1029 | ##### `Method is_ide_dataricks` 1030 | 1031 | > 1032 | > 1033 | > def is_ide_dataricks() ‑> bool 1034 | 1035 | Checks if the current environment is Databricks 1036 | 1037 | ###### Returns 1038 | 1039 | - bool 1040 | Check result 1041 | 1042 | ##### `Method is_ide_local` 1043 | 1044 | > 1045 | > 1046 | > def is_ide_local() ‑> bool 1047 | 1048 | Checks if the current environment is Local 1049 | 1050 | ###### Returns 1051 | 1052 | - bool 1053 | Check result 1054 | 1055 | #### Methods 1056 | 1057 | ##### Method `dbutils` 1058 | 1059 | > 1060 | > 1061 | > def dbutils( 1062 | > self 1063 | > ) 1064 | 1065 | Current dbutils 1066 | 1067 | ###### Returns 1068 | 1069 | - DBUtils 1070 | The DBUtils 1071 | 1072 | ##### Method `initialize_env` 1073 | 1074 | > 1075 | > 1076 | > def initialize_env( 1077 | > self 1078 | > ) 1079 | 1080 | Initializes the DevelopmentClient. That is, sets the dbutils and spark 1081 | context accordingly if the code is runt on cluster or locally. 1082 | 1083 | ##### Method `initialize_logger` 1084 | 1085 | > 1086 | > 1087 | > def initialize_logger( 1088 | > self, 1089 | > pipeline_name: str, 1090 | > appi_ik_scope: str = 'config', 1091 | > appi_ik_secret: str = 'APPI_IK' 1092 | > ) 1093 | 1094 | Initializes the logger 1095 | 1096 | ###### Parameters 1097 | 1098 | - **`pipeline_name`** : str 1099 | Name to use with the logger. 
It will be the base name used for all 1100 | the upcoming logs and tracing 1101 | - **`appi_ik_scope`** : str, optional 1102 | Databricks secret scope where the Application Insight key is stored, 1103 | by default “dds” 1104 | - **`appi_ik_secret`** : str, optional 1105 | Databricks secret name where the Application Insight key is stored, 1106 | by default “appiik” 1107 | 1108 | ###### Raises 1109 | 1110 | - ValueError 1111 | Unknown Ide Environment used 1112 | 1113 | ##### Method `run_notebook` 1114 | 1115 | > 1116 | > 1117 | > def run_notebook( 1118 | > self, 1119 | > notebook: str, 1120 | > args: Dict[~KT, ~VT], 1121 | > timeout=86400, 1122 | > error_raise=True 1123 | > ) 1124 | 1125 | ##### Method `run_notebook_with_retry` 1126 | 1127 | > 1128 | > 1129 | > def run_notebook_with_retry( 1130 | > self, 1131 | > notebook: str, 1132 | > args: Dict[~KT, ~VT], 1133 | > timeout=86400, 1134 | > max_retries=3 1135 | > ) 1136 | 1137 | Runs the specified notebook through dbutils 1138 | 1139 | ###### Parameters 1140 | 1141 | - **`notebook`** : str 1142 | Name or path of the notebook 1143 | - **`args`** : Dict 1144 | \[description\] 1145 | - **`timeout`** : int, optional 1146 | \[description\], by default 86400 1147 | - **`max_retries`** : int, optional 1148 | \[description\], by default 3 1149 | 1150 | ###### Returns 1151 | 1152 | \[type\] \[description\] 1153 | 1154 | ###### Raises 1155 | 1156 | - e 1157 | \[description\] 1158 | 1159 | ##### Method `spark` 1160 | 1161 | > 1162 | > 1163 | > def spark( 1164 | > self 1165 | > ) ‑> pyspark.sql.session.SparkSession 1166 | 1167 | Current spark context 1168 | 1169 | ###### Returns 1170 | 1171 | - SparkSession 1172 | Spark context 1173 | 1174 | ---- 1175 | -------------------------------------------------------------------------------- /src/pipelines/dbkframework/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | opencensus-ext-azure>=1.0.7 3 | typeguard>=2.12.0 4 | pandas>=1.2.4 5 | pyspark 6 | pydataset -------------------------------------------------------------------------------- /src/pipelines/dbkframework/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | current_file = Path(__file__).absolute() 6 | print(f'Current file path: {current_file}') 7 | # current_file_folder = Path(os.getcwd()) 8 | current_file_folder = Path(__file__).parent.absolute() 9 | print(f"Current folder: {current_file_folder}") 10 | 11 | path_readme = current_file_folder.joinpath('documentation.md') 12 | modules_root = current_file.parent.parent.parent.joinpath('modules') 13 | 14 | import sys 15 | sys.path.append(str(current_file.parent.parent.parent.joinpath('modules'))) 16 | from devmaint.docgenerator import create_adow_documentation 17 | 18 | package_name = current_file_folder.stem 19 | 20 | modules_to_use = ['dbkcore', 'dbkdev', 'acai_ml'] 21 | path_requirements = current_file_folder.joinpath('requirements.txt') 22 | package_dir = {} 23 | documentations = [] 24 | 25 | with open(path_requirements, "r") as fh: 26 | requirements = [l.strip() for l in fh.readlines()] 27 | 28 | requirements = [rq for rq in requirements if (rq) and (rq.startswith('#') is False)] 29 | 30 | packages = [] 31 | 32 | for module in modules_to_use: 33 | module_path = modules_root.joinpath(module) 34 | packages = packages + 
setuptools.find_namespace_packages(where=modules_root, include=[f'{module}*']) 35 | package_dir[module] = module_path 36 | doc = create_adow_documentation(str(module_path)) 37 | documentations.append(doc) 38 | 39 | documentation = '\n\n'.join(documentations) 40 | 41 | with open(str(path_readme), 'w', encoding="utf-8") as out: 42 | out.write(documentation) 43 | 44 | today = datetime.today() 45 | version = f'{today:%Y}{today:%m}{today:%d}_{today:%H}{today:%M}{today:%S}' 46 | 47 | setuptools.setup( 48 | name=package_name, 49 | version=version, 50 | author="Davide Fornelli", 51 | author_email="daforne@microsoft.com", 52 | description="Core library for logging and using proper base object", 53 | # long_description=documentation, 54 | long_description_content_type="text/markdown", 55 | packages=packages, 56 | package_dir=package_dir, 57 | install_requires=requirements, 58 | python_requires='~=3.7.6' 59 | ) 60 | -------------------------------------------------------------------------------- /src/setup/arm-templates/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "location": { 6 | "value": "northeurope" 7 | }, 8 | "resource_group": { 9 | "value": "rg-dbk-dev-001" 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /src/setup/arm-templates/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "location": { 6 | "type": "string" 7 | }, 8 | "resource_group": { 9 | "type": "string" 10 | }, 11 | "key_vault": { 12 | "defaultValue": "kv", 13 | "type": "String" 14 | }, 15 | "application_insights": { 16 | "defaultValue": "ai", 17 | "type": "String" 18 | }, 19 | "databricks_workspace": { 20 | "defaultValue": "dbkworkspace", 21 | "type": "String" 22 | }, 23 | "storage_account": { 24 | "defaultValue": "sa", 25 | "type": "String" 26 | }, 27 | "log_analytics_workspace": { 28 | "defaultValue": "law", 29 | "type": "String" 30 | } 31 | }, 32 | "variables": { 33 | 34 | }, 35 | "resources": [ 36 | { 37 | "type": "Microsoft.Resources/resourceGroups", 38 | "apiVersion": "2020-10-01", 39 | "location": "[parameters('location')]", 40 | "name": "[parameters('resource_group')]", 41 | "properties": {} 42 | }, 43 | { 44 | "name": "nestedDeployment1", 45 | "type": "Microsoft.Resources/deployments", 46 | "apiVersion": "2020-10-01", 47 | "resourceGroup": "[parameters('resource_group')]", 48 | "dependsOn": [ 49 | "[resourceId('Microsoft.Resources/resourceGroups/', parameters('resource_group'))]" 50 | ], 51 | "properties": { 52 | "expressionEvaluationOptions": { 53 | "scope": "inner" 54 | }, 55 | "mode": "Incremental", 56 | "parameters": { 57 | "resource_group": { 58 | "value": "[parameters('resource_group')]" 59 | }, 60 | "databricks_workspace": { 61 | "value": "[toLower(parameters('databricks_workspace'))]" 62 | }, 63 | "key_vault": { 64 | "value": "[toLower(parameters('key_vault'))]" 65 | }, 66 | "application_insights": { 67 | "value": "[tolower(parameters('application_insights'))]" 68 | }, 69 | "log_analytics_workspace": { 70 | "value": "[tolower(parameters('log_analytics_workspace'))]" 71 | }, 72 | "storage_account": { 73 | "value": 
"[tolower(parameters('storage_account'))]" 74 | } 75 | }, 76 | "template": { 77 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 78 | "contentVersion": "1.0.0.0", 79 | "parameters": { 80 | "resource_group": { 81 | "type": "string" 82 | }, 83 | "key_vault": { 84 | "type": "String" 85 | }, 86 | "application_insights": { 87 | "type": "String" 88 | }, 89 | "databricks_workspace": { 90 | "type": "String" 91 | }, 92 | "storage_account": { 93 | "type": "String" 94 | }, 95 | "log_analytics_workspace": { 96 | "type": "String" 97 | } 98 | }, 99 | "variables": { 100 | "var_dbk_workspace_name": "[tolower(concat(resourceGroup().name, '-', parameters('databricks_workspace')))]", 101 | "var_dbk_managedResourceGroupName": "[tolower(concat(variables('var_dbk_workspace_name'), '-databricks-rg-', uniqueString(variables('var_dbk_workspace_name'), resourceGroup().id)))]", 102 | "var_dbk_managedResourceGroupId": "[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('var_dbk_managedResourceGroupName'))]", 103 | "var_storage": { 104 | "storageAccounts": { 105 | "name": "[tolower(concat(replace(parameters('resource_group'), '-', ''), parameters('storage_account')))]", 106 | // "name": "[replace(replace(tolower(parameters('storageAccountNamePrefix')), '-',''),'.','')]", 107 | "type": "Standard_LRS" 108 | } 109 | }, 110 | "var_application_insights": { 111 | "law_name": "[tolower(concat(parameters('resource_group'), '-', parameters('log_analytics_workspace')))]", 112 | "name": "[tolower(concat(parameters('resource_group'), '-', parameters('application_insights')))]" 113 | } 114 | }, 115 | "resources": [ 116 | { 117 | "type": "Microsoft.Databricks/workspaces", 118 | "apiVersion": "2018-04-01", 119 | "name": "[variables('var_dbk_workspace_name')]", 120 | "location": "[resourceGroup().location]", 121 | "sku": { 122 | "name": "standard" 123 | }, 124 | "properties": { 125 | "managedResourceGroupId": "[variables('var_dbk_managedResourceGroupId')]" 126 | // "authorizations": [ 127 | // { 128 | // "principalId": "9a74af6f-d153-4348-988a-e2672920bee9", 129 | // "roleDefinitionId": "8e3af657-a8ff-443c-a75c-2fe8c4bcb635" // Owner 130 | // } 131 | // ] 132 | // "createdBy": {}, 133 | // "updatedBy": {}, 134 | // "createdDateTime": "2021-04-09T09:29:22.5851863Z" 135 | } 136 | }, 137 | { 138 | "type": "Microsoft.KeyVault/vaults", 139 | "apiVersion": "2020-04-01-preview", 140 | "name": "[concat(resourceGroup().name, parameters('key_vault'))]", 141 | "location": "[resourceGroup().location]", 142 | "properties": { 143 | "sku": { 144 | "family": "A", 145 | "name": "Standard" 146 | }, 147 | "tenantId": "[subscription().tenantId]", 148 | "accessPolicies": [], 149 | "enabledForDeployment": false, 150 | "enabledForDiskEncryption": false, 151 | "enabledForTemplateDeployment": false, 152 | "enableSoftDelete": true, 153 | "softDeleteRetentionInDays": 90, 154 | "enableRbacAuthorization": false, 155 | "vaultUri": "[concat('https://', parameters('key_vault'), '.vault.azure.net/')]" 156 | } 157 | }, 158 | { 159 | "type": "microsoft.operationalinsights/workspaces", 160 | "apiVersion": "2020-10-01", 161 | "name": "[variables('var_application_insights').law_name]", 162 | "location": "[resourceGroup().location]", 163 | "properties": { 164 | "sku": { 165 | "name": "pergb2018" 166 | }, 167 | "retentionInDays": 30, 168 | "features": { 169 | "legacy": 0, 170 | "searchVersion": 1, 171 | "enableLogAccessUsingOnlyResourcePermissions": true 172 | }, 173 | "workspaceCapping": { 174 | 
"dailyQuotaGb": -1 175 | }, 176 | "publicNetworkAccessForIngestion": "Enabled", 177 | "publicNetworkAccessForQuery": "Enabled" 178 | } 179 | }, 180 | { 181 | "type": "Microsoft.Storage/storageAccounts", 182 | "apiVersion": "2021-01-01", 183 | "name": "[variables('var_storage').storageAccounts.name]", 184 | "location": "[resourceGroup().location]", 185 | "sku": { 186 | "name": "Standard_RAGRS", 187 | "tier": "Standard" 188 | }, 189 | "kind": "StorageV2", 190 | "properties": { 191 | "minimumTlsVersion": "TLS1_2", 192 | "allowBlobPublicAccess": true, 193 | "allowSharedKeyAccess": true, 194 | "isHnsEnabled": true, 195 | "networkAcls": { 196 | "bypass": "AzureServices", 197 | "virtualNetworkRules": [], 198 | "ipRules": [], 199 | "defaultAction": "Allow" 200 | }, 201 | "supportsHttpsTrafficOnly": true, 202 | "encryption": { 203 | "services": { 204 | "file": { 205 | "keyType": "Account", 206 | "enabled": true 207 | }, 208 | "blob": { 209 | "keyType": "Account", 210 | "enabled": true 211 | } 212 | }, 213 | "keySource": "Microsoft.Storage" 214 | }, 215 | "accessTier": "Hot" 216 | } 217 | }, 218 | { 219 | "type": "microsoft.insights/components", 220 | "apiVersion": "2020-02-02-preview", 221 | "name": "[variables('var_application_insights').name]", 222 | "location": "[resourceGroup().location]", 223 | "dependsOn": [ 224 | "[resourceId('microsoft.operationalinsights/workspaces', variables('var_application_insights').law_name)]" 225 | ], 226 | "kind": "web", 227 | "properties": { 228 | "Application_Type": "web", 229 | "Flow_Type": "Redfield", 230 | "Request_Source": "IbizaAIExtension", 231 | "WorkspaceResourceId": "[resourceId('microsoft.operationalinsights/workspaces', variables('var_application_insights').law_name)]", 232 | "IngestionMode": "LogAnalytics", 233 | "publicNetworkAccessForIngestion": "Enabled", 234 | "publicNetworkAccessForQuery": "Enabled" 235 | } 236 | }, 237 | { 238 | "type": "Microsoft.Storage/storageAccounts/blobServices", 239 | "apiVersion": "2021-01-01", 240 | "name": "[concat(variables('var_storage').storageAccounts.name, '/default')]", 241 | "dependsOn": [ 242 | "[resourceId('Microsoft.Storage/storageAccounts', variables('var_storage').storageAccounts.name)]" 243 | ], 244 | "sku": { 245 | "name": "Standard_RAGRS", 246 | "tier": "Standard" 247 | }, 248 | "properties": { 249 | "cors": { 250 | "corsRules": [] 251 | }, 252 | "deleteRetentionPolicy": { 253 | "enabled": false 254 | } 255 | } 256 | } 257 | // { 258 | // "type": "Microsoft.Storage/storageAccounts/fileServices", 259 | // "apiVersion": "2021-01-01", 260 | // "name": "[concat(variables('var_storage').storageAccounts.name, '/default')]", 261 | // "dependsOn": [ 262 | // "[resourceId('Microsoft.Storage/storageAccounts', variables('var_storage').storageAccounts.name)]" 263 | // ], 264 | // "sku": { 265 | // "name": "Standard_RAGRS", 266 | // "tier": "Standard" 267 | // }, 268 | // "properties": { 269 | // "protocolSettings": { 270 | // "smb": {} 271 | // }, 272 | // "cors": { 273 | // "corsRules": [] 274 | // }, 275 | // "shareDeleteRetentionPolicy": { 276 | // "enabled": true, 277 | // "days": 7 278 | // } 279 | // } 280 | // }, 281 | // { 282 | // "type": "Microsoft.Storage/storageAccounts/queueServices", 283 | // "apiVersion": "2021-01-01", 284 | // "name": "[concat(variables('var_storage').storageAccounts.name, '/default')]", 285 | // "dependsOn": [ 286 | // "[resourceId('Microsoft.Storage/storageAccounts', variables('var_storage').storageAccounts.name)]" 287 | // ], 288 | // "properties": { 289 | // "cors": { 290 | // 
"corsRules": [] 291 | // } 292 | // } 293 | // }, 294 | // { 295 | // "type": "Microsoft.Storage/storageAccounts/tableServices", 296 | // "apiVersion": "2021-01-01", 297 | // "name": "[concat(variables('var_storage').storageAccounts.name, '/default')]", 298 | // "dependsOn": [ 299 | // "[resourceId('Microsoft.Storage/storageAccounts', variables('var_storage').storageAccounts.name)]" 300 | // ], 301 | // "properties": { 302 | // "cors": { 303 | // "corsRules": [] 304 | // } 305 | // } 306 | // } 307 | ], 308 | "outputs": {} 309 | } 310 | } 311 | } 312 | ] 313 | } -------------------------------------------------------------------------------- /src/setup/config/setup_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "applicationID":"deeadfbxxxd39049b450", 4 | "tenantID":"72f988bxxxab-2dxxxx7cd011db47", 5 | "subscriptionID":"89c37xxxxxx98e0-1cfb98c0262e", 6 | "resourceGroupName":"acltrsapadbkmlops99", 7 | "resourceGroupLocation":"NorthEurope" 8 | } 9 | -------------------------------------------------------------------------------- /src/setup/configureResources.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Prereqisite : 3 | 4 | Service Principal must be granted subscription contributor permission 5 | 6 | 1. setup_config.json should be filled with the proper details. ( already done during the first script execution ) 7 | 2. appsecret.txt should be having the client secret of the service principal. ( already done during the first script execution ) 8 | 3..\vault\DBKtoken.txt file should be filled with the Databricks Personal Access token. 9 | 3. change the directory path in the command line to the project file path. 10 | cd C:\Users\......\MLOpsBasic-Databricks\src\setup 11 | 12 | 13 | Post Execution Step 14 | ================================= 15 | 1. create .env file in root with the details from the output of the execution 16 | 17 | PYTHONPATH=/workspaces/MLOpsBasic-Databricks/src/modules 18 | APPI_IK="7936xxxx8497696" 19 | DATABRICKS_HOST=https://adb-dapi398220xxxxxb066e49b7-2.XX.azuredatabricks.net/ 20 | DATABRICKS_TOKEN=793xxxx8497696 21 | DATABRICKS_ORDGID=53d000xxxxb-9634-ae6a9658c775 22 | 23 | 2. DATABRICKS_HOST=https://adb-dapi3982xxxx6e94657b066e49b7-2.XX.azuredatabricks.net/ ==> change the "XX" with the correct version from the databricks workspace URL. 24 | 25 | #> 26 | 27 | Write-Verbose "PSScriptRoot is: $PSScriptRoot" 28 | $rootPath = (Get-Item -Path $PSScriptRoot).FullName 29 | Write-Verbose "config file path: $rootPath\config\setup_config.json" 30 | $config = Get-Content -Raw -Path "$rootPath\config\setup_config.json" | ConvertFrom-Json 31 | 32 | 33 | # $rootPath = $config.rootDirectoryPath 34 | # cd $rootpath 35 | 36 | $applicationID=$config.applicationID 37 | $appsecret = Get-Content -Path "$rootPath\vault\appsecret.txt" 38 | $tenantId =$config.tenantID 39 | $subscriptionID = $config.subscriptionID 40 | $resourceGroupname = $config.resourceGroupName 41 | $resourceGroupLocation = $config.resourceGroupLocation 42 | 43 | 44 | $dbktoken = Get-Content -Path "$rootPath\vault\DBKtoken.txt" 45 | 46 | # DBK Cluster Creation 47 | cd $rootPath 48 | cd util 49 | $returnResult = .\Deploy-DBCluster.ps1 -ResourceGroupName $resourceGroupname -Location $resourceGroupLocation -BearerToken $dbktoken -Verbose 50 | $clusterID = $returnResult | select -Last 1 51 | 52 | cd.. 
53 | # Login to Databricks 54 | $resourceGroupLocation = $resourceGroupLocation.replace(' ','') 55 | 56 | 57 | $DBAPIRootUrl = "https://"+$resourceGroupLocation+".azuredatabricks.net" 58 | $DBAPIKey = $dbktoken 59 | 60 | [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12 61 | 62 | $ClustersAPIListUrl = $DBAPIRootUrl.Trim('/') + "/api/2.0/workspace/list" 63 | 64 | $headers = @{ 65 | Authorization = "Bearer $DBAPIKey" 66 | "Content-Type" = "application/json" 67 | } 68 | 69 | $Path= "/" 70 | $parameters = @{ 71 | path = $Path 72 | } 73 | 74 | $response = Invoke-WebRequest -Uri $ClustersAPIListUrl -Method GET -Headers $headers -Body $parameters 75 | $orgID = $response.Headers.'X-Databricks-Org-Id' 76 | 77 | $appInsightName = ((Get-AzApplicationInsights -ResourceGroupName $resourceGroupname) | Where-Object {$_.Name -eq $resourceGroupname+"-ai"}) 78 | $instrumentationKey = $appInsightName.InstrumentationKey 79 | 80 | 81 | $output = 'PYTHONPATH=/workspaces/dstoolkit-ml-ops-for-databricks/src/modules 82 | APPI_IK={0} 83 | DATABRICKS_HOST=https://adb-{1}.XX.azuredatabricks.net/ 84 | DATABRICKS_TOKEN={2} 85 | DATABRICKS_ORDGID={3}' -f $instrumentationKey, $orgID,$dbktoken,$orgID 86 | 87 | Write-Host $output 88 | -------------------------------------------------------------------------------- /src/setup/deployResources.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Prerequisite: 3 | Service Principal must be granted subscription contributor permission 4 | 5 | 1. setup_config.json should be filled with the proper details. 6 | 2. .\vault\appsecret.txt should contain the client secret of the service principal 7 | 3. Change the directory path in the command line to the project file path. 8 | cd C:\Users\......\MLOpsBasic-Databricks\src\setup 9 | #> 10 | 11 | Write-Verbose "PSScriptRoot is: $PSScriptRoot" 12 | $rootPath = (Get-Item -Path $PSScriptRoot).FullName 13 | Write-Verbose "config file path: $rootPath\config\setup_config.json" 14 | $config = Get-Content -Raw -Path "$rootPath\config\setup_config.json" | ConvertFrom-Json 15 | 16 | 17 | # $rootPath = $config.rootDirectoryPath 18 | # cd $rootpath 19 | 20 | $applicationID=$config.applicationID 21 | $appsecret = Get-Content -Path "$rootPath\vault\appsecret.txt" 22 | $tenantId =$config.tenantID 23 | $subscriptionID = $config.subscriptionID 24 | $resourceGroupname = $config.resourceGroupName 25 | $resourceGroupLocation = $config.resourceGroupLocation 26 | 27 | # Install modules.
28 | 29 | if (!(Get-Module -Name "Az.Accounts" -ListAvailable)){ 30 | Install-Module -Name "Az.Accounts" 31 | Import-Module -Name "Az.Accounts" 32 | 33 | } 34 | 35 | if (!(Get-Module -Name "Az.ApplicationInsights" -ListAvailable)){ 36 | Install-Module -Name "Az.ApplicationInsights" 37 | Import-Module -Name "Az.ApplicationInsights" 38 | 39 | } 40 | 41 | 42 | if (!(Get-Module -Name "Az.Databricks" -ListAvailable)){ 43 | Install-Module -Name "Az.Databricks" 44 | Import-Module -Name "Az.Databricks" 45 | 46 | } 47 | 48 | $PWord= ConvertTo-SecureString -String $appsecret -AsPlainText -Force 49 | $Credential1 = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $applicationID, $PWord 50 | $info= Connect-AzAccount -ServicePrincipal -Credential $Credential1 -TenantId $tenantid -Subscription $subscriptionID 51 | 52 | # Create the Resource Group 53 | if (!( Get-AzResourceGroup -Name $resourceGroupname -ErrorVariable notPresent -ErrorAction SilentlyContinue )){ 54 | New-AzResourceGroup -Location $resourceGroupLocation -Name $resourceGroupname 55 | } 56 | 57 | # Task 1: Deploy the Resource Group 58 | $templatefileLocation = $rootPath + "\arm-templates\template.json" 59 | 60 | # Task 2: Deploy the Resource 61 | $deploymentDetails = New-AzDeployment -Name "DBKadnResourceCreation" ` 62 | -Location $resourceGroupLocation -resource_group $resourceGroupname ` 63 | -TemplateFile $templatefileLocation ` 64 | -locationFromTemplate $resourceGroupLocation 65 | 66 | $deploymentDetails 67 | 68 | # if ($deploymentDetails.ProvisioningState -eq "Succeeded"){ 69 | # $dbkName = (Get-AzDatabricksWorkspace -ResourceGroupName "AccleratorDBKMLOps1").Name 70 | # $appInsightName = (Get-AzApplicationInsights -ResourceGroupName "AccleratorDBKMLOps1") 71 | # } 72 | 73 | -------------------------------------------------------------------------------- /src/setup/util/DBCluster-Configuration.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 1, 3 | "cluster_name": "devcluster", 4 | "spark_version": "7.3.x-cpu-ml-scala2.12", 5 | "spark_conf": { 6 | "spark.databricks.delta.preview.enabled": "true" 7 | }, 8 | "azure_attributes": { 9 | "first_on_demand": 1, 10 | "availability": "ON_DEMAND_AZURE", 11 | "spot_bid_max_price": -1 12 | }, 13 | "node_type_id": "Standard_DS3_v2", 14 | "driver_node_type_id": "Standard_DS3_v2", 15 | "ssh_public_keys": [], 16 | "custom_tags": {}, 17 | "spark_env_vars": {}, 18 | "autotermination_minutes": 20, 19 | "enable_elastic_disk": true, 20 | "cluster_source": "API", 21 | "init_scripts": [], 22 | "cluster_id": "0519-195053-tough408" 23 | } -------------------------------------------------------------------------------- /src/setup/util/Deploy-DBCluster.ps1: -------------------------------------------------------------------------------- 1 | 2 | param 3 | ( 4 | 5 | [Parameter(Position = 0, Mandatory = $True, HelpMessage = "Specify the ResourceGroupName.")] 6 | [String] $ResourceGroupName, 7 | [Parameter(Position = 1, Mandatory = $True, HelpMessage = "Specify the Location.")] 8 | [String] $Location, 9 | [Parameter(Position = 2, Mandatory = $True, HelpMessage = "Specify the BearerToken.")] 10 | [String] $BearerToken # TODO: This should come from DevOps task 11 | ) 12 | 13 | $VerbosePreference = 'Continue' 14 | 15 | Write-Verbose "PSScriptRoot is: $PSScriptRoot" 16 | $ScriptFolderPath = (Get-Item -Path $PSScriptRoot).FullName 17 | Write-Verbose "parameter file path: $ScriptFolderPath" 18 | 19 | $clusterFilePath = 
"$ScriptFolderPath\DBCluster-Configuration.json" 20 | 21 | $clusterId = $null 22 | $clusterName = (Get-Content -Path $clusterFilePath | ConvertFrom-Json).cluster_name 23 | $clusterDefintion = Get-Content -Path $clusterFilePath 24 | 25 | $resourceGroupLocation = $Location.replace(' ','') 26 | $DBAPIRootUrl = "https://"+$resourceGroupLocation+".azuredatabricks.net" 27 | $DBAPIKey = $BearerToken 28 | 29 | [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12 30 | 31 | $ClustersAPIListUrl = $DBAPIRootUrl.Trim('/') + "/api/2.0/clusters/list" 32 | 33 | $headers = @{ 34 | Authorization = "Bearer $DBAPIKey" 35 | "Content-Type" = "application/json" 36 | } 37 | 38 | $Path= "/" 39 | $parameters = @{ 40 | path = $Path 41 | } 42 | 43 | $response = Invoke-WebRequest -Uri $ClustersAPIListUrl -Method GET -Headers $headers -Body $parameters 44 | 45 | $responseObj = $response.Content | ConvertFrom-Json 46 | $clusterid = "" 47 | foreach ( $c in $responseObj.clusters){ 48 | if($c.cluster_name -eq $clusterName){ 49 | $clusterid = $c.cluster_id 50 | } 51 | } 52 | if($clusterid){ 53 | Write-Host "The cluster is already present" 54 | } 55 | else{ 56 | Write-Host "new cluster to be created" 57 | 58 | $ClustersAPIListUrl = $DBAPIRootUrl.Trim('/') + "/api/2.0/clusters/create" 59 | 60 | $headers = @{ 61 | Authorization = "Bearer $DBAPIKey" 62 | "Content-Type" = "application/json" 63 | } 64 | 65 | $Path= "/" 66 | $parameters = @{ 67 | path = $Path 68 | } 69 | 70 | $response = Invoke-WebRequest -Uri $ClustersAPIListUrl -Method POST -Headers $headers -Body $clusterDefintion 71 | $clusterid = ($response.Content|ConvertFrom-Json).cluster_id 72 | } 73 | return $clusterid -------------------------------------------------------------------------------- /src/setup/util/Deploy-DBCluster_using_CLI.ps1: -------------------------------------------------------------------------------- 1 | 2 | param 3 | ( 4 | 5 | [Parameter(Position = 0, Mandatory = $True, HelpMessage = "Specify the ResourceGroupName.")] 6 | [String] $ResourceGroupName, 7 | [Parameter(Position = 1, Mandatory = $True, HelpMessage = "Specify the Location.")] 8 | [String] $Location, 9 | [Parameter(Position = 2, Mandatory = $True, HelpMessage = "Specify the BearerToken.")] 10 | [String] $BearerToken # TODO: This should come from DevOps task 11 | ) 12 | 13 | #$Environment = "Dev" 14 | #$ResourceGroupName = "RS-DEV-WE-03" 15 | #$Location = "westeurope" 16 | #$BearerToken = "d" 17 | 18 | #$psISE.CurrentFile.FullPath 19 | 20 | # This switch needs to be enabled to print verbose messages 21 | $VerbosePreference = 'Continue' 22 | 23 | Write-Verbose "PSScriptRoot is: $PSScriptRoot" 24 | $ScriptFolderPath = (Get-Item -Path $PSScriptRoot).FullName 25 | Write-Verbose "parameter file path: $ScriptFolderPath" 26 | 27 | $clusterFilePath = "$ScriptFolderPath\DBCluster-Configuration.json" 28 | $clusterId = $null 29 | 30 | # Install Libraries 31 | python -m pip install --upgrade pip 32 | #python -m pip install wheel 33 | #python -m pip install setuptools 34 | python -m pip install databricks-cli 35 | 36 | #Removing the space from the Location is there is any 37 | 38 | $Location = $Location.replace(' ','') 39 | # Login to databricks 40 | @" 41 | https://$Location.azuredatabricks.net 42 | $BearerToken 43 | "@ | databricks configure --token 44 | 45 | # Create Interactive clusters 46 | # Check if the cluster exist 47 | $clusterName = (Get-Content -Path $clusterFilePath | ConvertFrom-Json).cluster_name 48 | $clusters = (databricks clusters list 
--output="JSON" | ConvertFrom-Json).clusters | Where-Object { $_.cluster_name -eq $clusterName } 49 | 50 | if ($null -ne $clusters) 51 | { 52 | $clusterId = $clusters.cluster_id 53 | } 54 | if($clusterid -ne $null){ 55 | Write-Verbose $clusterId 56 | } 57 | 58 | if ($clusterId) 59 | { 60 | Write-Verbose "Cluster already exist with ID $clusterId" 61 | if ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -ne "RUNNING") 62 | { 63 | Write-Verbose "Cluster state is terminated starting cluster: $clusterId" 64 | databricks clusters start --cluster-id $clusterId 65 | 66 | # Start the cluster and poll until its state changes to Running 67 | while ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -eq "PENDING") 68 | { 69 | Write-Verbose "Waiting for Databrick cluster id $($clusterId) to get started, sleep for 30 seconds" 70 | Start-Sleep -Seconds 30 71 | } 72 | 73 | if ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -eq "RUNNING") 74 | { 75 | Write-Verbose "Databrick cluster id $($clusterId) is now running" 76 | } 77 | else 78 | { 79 | Write-Verbose "Databrick cluster id $($clusterId) creation failed. exiting script" 80 | exit 81 | } 82 | } 83 | } 84 | else 85 | { 86 | #Create a fixed node cluster 87 | $clusterId = (databricks clusters create --json-file $clusterFilePath | ConvertFrom-Json).cluster_id 88 | if($clusterid -ne $null){ 89 | Write-Verbose "cluster id $clusterId" 90 | } 91 | 92 | while ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -eq "PENDING") 93 | { 94 | Write-Verbose "Waiting for Databrick cluster id $($clusterId) to created, sleep for 30 seconds" 95 | Start-Sleep -Seconds 30 96 | } 97 | 98 | if ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -eq "RUNNING") 99 | { 100 | Write-Verbose "Databrick cluster id $($clusterId) is now running" 101 | } 102 | else 103 | { 104 | Write-Verbose "Databrick cluster id $($clusterId) creation failed. 
exiting script" 105 | exit 106 | } 107 | 108 | } 109 | return $clusterId 110 | -------------------------------------------------------------------------------- /src/tutorial/README.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | # Scripts 4 | 5 | ## Create cluster 6 | ```bash 7 | cd /workspaces/MLOpsBasic-Databricks/src/tutorial && \ 8 | python scripts/create_cluster.py -c cluster_config.json 9 | ``` 10 | 11 | ## Local configuration 12 | ```bash 13 | cd /workspaces/MLOpsBasic-Databricks/src/tutorial && \ 14 | python scripts/local_config.py -c cluster_config.json 15 | ``` 16 | 17 | ## Secrets configuration 18 | ```bash 19 | cd /workspaces/MLOpsBasic-Databricks/src/tutorial && \ 20 | python scripts/set_secrets.py -c cluster_config.json --scope 'test_scope' --secret_name 'test_scret_name' --secret_value 'test_secrete_value' 21 | ``` -------------------------------------------------------------------------------- /src/tutorial/cluster_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 2, 3 | "cluster_name": "devcluster", 4 | "spark_version": "7.3.x-cpu-ml-scala2.12", 5 | "spark_conf": { 6 | "spark.databricks.delta.preview.enabled": "true", 7 | "spark.sql.execution.arrow.enabled": "true" 8 | }, 9 | "node_type_id": "Standard_DS3_v2", 10 | "driver_node_type_id": "Standard_DS3_v2", 11 | "ssh_public_keys": [], 12 | "custom_tags": {}, 13 | "spark_env_vars": { 14 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 15 | }, 16 | "autotermination_minutes": 60, 17 | "enable_elastic_disk": true 18 | } -------------------------------------------------------------------------------- /src/tutorial/create_databricks_secrets.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dbkcore.core import Log 3 | from scripts import set_secrets 4 | 5 | appi_key_env = 'APPI_IK' 6 | set_secrets.main(scope='config', secret_name=appi_key_env, secret_value=os.environ[appi_key_env]) -------------------------------------------------------------------------------- /src/tutorial/deploy.py: -------------------------------------------------------------------------------- 1 | """Configure Databricks cluster.""" 2 | 3 | from pathlib import Path 4 | 5 | # import sys 6 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 7 | import os 8 | import json 9 | from dbkcore.core import Log 10 | from scripts import create_cluster 11 | from scripts import install_dbkframework 12 | from scripts import set_secrets 13 | from scripts import local_config 14 | import argparse 15 | 16 | Log(name=Path(__file__).stem) 17 | 18 | 19 | def command_exec(command, ignore=False): 20 | """ 21 | Execute shell command. 22 | 23 | Parameters 24 | ---------- 25 | command : str 26 | Command to execute 27 | ignore : bool, optional 28 | Ignore exception, by default False 29 | 30 | Raises 31 | ------ 32 | Exception 33 | Raises exception if command failes 34 | """ 35 | Log.get_instance().log_info(f'Running command -> {command}') 36 | if not ignore: 37 | if os.system(command) != 0: 38 | raise Exception(f'Failed to execute: {command}') 39 | 40 | 41 | def parse_args(args_list=None): 42 | """ 43 | Parse command line arguments. 
44 | 45 | Parameters 46 | ---------- 47 | args_list : [type], optional 48 | Argument list, by default None 49 | 50 | Returns 51 | ------- 52 | ArgumentParser 53 | Arguments parsed 54 | """ 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('-c', '--config_file', help="Full path of cluster's json configuration", type=str, required=True) 57 | args_parsed = parser.parse_args(args_list) 58 | return args_parsed 59 | 60 | 61 | def main(cluster_config_file): 62 | """ 63 | Execute the script. 64 | 65 | Parameters 66 | ---------- 67 | cluster_config_file : str 68 | Path of the configuration file 69 | 70 | Raises 71 | ------ 72 | Exception 73 | Raises when script failes 74 | """ 75 | appi_key_env = 'APPI_IK' 76 | 77 | create_cluster.main(cluster_config_file=cluster_config_file) 78 | local_config.main(cluster_config_file=cluster_config_file) 79 | set_secrets.main(scope='config', secret_name=appi_key_env, secret_value=os.environ[appi_key_env]) 80 | install_dbkframework.main(cluster_config_file=cluster_config_file) 81 | 82 | 83 | if __name__ == "__main__": 84 | args = parse_args() 85 | main(cluster_config_file=args.config_file) 86 | -------------------------------------------------------------------------------- /src/tutorial/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/tutorial/scripts/__init__.py -------------------------------------------------------------------------------- /src/tutorial/scripts/create_cluster.py: -------------------------------------------------------------------------------- 1 | """Create a cluster in Databricks.""" 2 | import sys 3 | from pathlib import Path 4 | 5 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 6 | import json 7 | from dbkenv.core import DatabricksResourceManager 8 | from dbkenv.core import Configuration 9 | from dbkenv.core import ResourceClient 10 | from dbkenv.core import Log 11 | import argparse 12 | 13 | 14 | 15 | 16 | Log(name=Path(__file__).stem) 17 | 18 | 19 | def parse_args(args_list=None): 20 | """ 21 | Parse command line arguments. 22 | 23 | Parameters 24 | ---------- 25 | args_list : [type], optional 26 | Argument list, by default None 27 | 28 | Returns 29 | ------- 30 | ArgumentParser 31 | Arguments parsed 32 | """ 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('-c', '--config_file', help="Full path of cluster's json configuration", type=str, required=True) 35 | args_parsed = parser.parse_args(args_list) 36 | return args_parsed 37 | 38 | 39 | def main(cluster_config_file): 40 | """ 41 | Execute the script. 
42 | 43 | Parameters 44 | ---------- 45 | cluster_config_file : str 46 | Path of the configuration file 47 | 48 | Raises 49 | ------ 50 | Exception 51 | Raises when script failes 52 | """ 53 | configuration = Configuration(file_load=True) 54 | # cluster_config_file = str(Path(__file__).parent.joinpath('unittest_cluster.json')) 55 | 56 | with open(cluster_config_file.strip(), 'r') as cl: 57 | cluster_configuration = json.load(cl) 58 | 59 | cluster_name = cluster_configuration['cluster_name'] 60 | # instantiate the logger 61 | 62 | client = ResourceClient( 63 | host=configuration.DATABRICKS_HOST, 64 | personal_token=configuration.DATABRICKS_TOKEN 65 | ) 66 | drm = DatabricksResourceManager( 67 | client=client, 68 | cluster_name=cluster_name, 69 | cluster_configuration=cluster_configuration 70 | ) 71 | 72 | drm.cluster.create_cluster_and_wait() 73 | 74 | 75 | if __name__ == "__main__": 76 | args = parse_args() 77 | main(cluster_config_file=args.config_file) 78 | -------------------------------------------------------------------------------- /src/tutorial/scripts/framework_testing/remote_analysis.py: -------------------------------------------------------------------------------- 1 | """Example of framework usage""" 2 | 3 | import random 4 | from acai_ml.core import Engine 5 | import pandas as pd 6 | from pydataset import data 7 | from pathlib import Path 8 | from dbkcore.core import trace 9 | from dbkcore.core import Log 10 | from dbkdev.data_steps import DataStep, DataStepDataframe 11 | from dbkdev.data_steps import apply_test 12 | from sklearn.model_selection import ParameterSampler 13 | from sklearn.utils.fixes import loguniform 14 | from pyspark.sql import functions as F 15 | import numpy as np 16 | from sklearn.model_selection import cross_val_score 17 | from sklearn import svm 18 | 19 | 20 | class Step_loadData(DataStep): 21 | """Load the defined dataset.""" 22 | 23 | def test(self): 24 | """Apply data tests.""" 25 | self.test_is_dataframe_empty(df=self.output_data.dataframe) 26 | self.test_null_values( 27 | cols=['Sepal.Length', 'Sepal.Width'], 28 | df=self.output_data.dataframe 29 | ) 30 | 31 | @apply_test 32 | @trace 33 | def initialize(self, name_dataset: str): 34 | """ 35 | Initialize the DataStep. 
36 | 37 | Parameters 38 | ---------- 39 | name_dataset : str 40 | Name of the dataset to load from pydataset package 41 | """ 42 | p_df = data(name_dataset) 43 | p_df.columns = [c.replace('.', '') for c in p_df.columns] 44 | dt = self.spark.createDataFrame(p_df) 45 | self.set_output_data(dt) 46 | 47 | 48 | class Step_crossValidate(DataStep): 49 | """Run multiple models in parallel.""" 50 | 51 | def test(self): 52 | pass 53 | 54 | @trace(attrs_refact=['appi_ik']) 55 | def initialize( 56 | self, 57 | dt: DataStepDataframe, 58 | pipeline_name: str, 59 | appi_ik: str, 60 | n_iter: int 61 | ): 62 | param_grid = { 63 | 'C': loguniform(1e0, 1e3), 64 | 'kernel': ['linear', 'rbf'], 65 | 'class_weight': ['balanced', None] 66 | } 67 | rng = np.random.RandomState(0) 68 | param_list = list( 69 | ParameterSampler( 70 | param_grid, 71 | n_iter=n_iter, 72 | random_state=rng 73 | ) 74 | ) 75 | # p_dt = Engine.get_instance().spark().createDataFrame(pd.DataFrame(param_list)).\ 76 | # withColumn('id', F.monotonically_increasing_id()) 77 | p_dt = self.spark.createDataFrame(pd.DataFrame(param_list)).\ 78 | withColumn('id', F.monotonically_increasing_id()) 79 | dt_train = dt.dataframe.crossJoin( 80 | p_dt 81 | ) 82 | 83 | udf_schema = dt_train.select( 84 | 'id', 85 | F.lit(0.0).alias('score') 86 | ).schema 87 | 88 | def pudf_train(dt_model): 89 | param_id = dt_model['id'].unique()[0] 90 | param_c = dt_model['C'].unique()[0] 91 | param_class_weight = dt_model['class_weight'].unique()[0] 92 | param_kernel = dt_model['kernel'].unique()[0] 93 | 94 | logging_custom_dimensions = { 95 | 'id': str(param_id), 96 | 'C': str(param_c), 97 | 'class_weight': param_class_weight, 98 | 'kernel': param_kernel 99 | } 100 | 101 | Log(pipeline_name, appi_ik) 102 | 103 | try: 104 | 105 | # Raising randomly exception 106 | if random.randint(0, 20) > 15: 107 | raise 'Random exception' 108 | 109 | dt_x = dt_model[ 110 | [ 111 | 'SepalLength', 112 | 'SepalWidth', 113 | 'PetalLength', 114 | 'PetalWidth' 115 | ] 116 | ] 117 | y = dt_model['Species'] 118 | clf = svm.SVC( 119 | kernel=param_kernel, 120 | C=param_c, 121 | class_weight=param_class_weight, 122 | random_state=42 123 | ) 124 | scores = cross_val_score(clf, dt_x, y, cv=5, scoring='f1_macro') 125 | score = scores.mean() 126 | dt_out = pd.DataFrame( 127 | { 128 | 'id': [param_id], 129 | 'score': [score] 130 | } 131 | ) 132 | Log.get_instance().log_info("Training:success", custom_dimension=logging_custom_dimensions) 133 | except Exception: 134 | Log.get_instance().log_error("Training:failed", custom_dimension=logging_custom_dimensions) 135 | dt_out = pd.DataFrame( 136 | { 137 | 'id': [param_id], 138 | 'score': [-1] 139 | } 140 | ) 141 | return dt_out 142 | 143 | ''' 144 | dt_model = dt_train.where(F.col('id') == 17179869184).toPandas() 145 | ''' 146 | dt_cross_evals = dt_train.\ 147 | groupBy(['id']).\ 148 | applyInPandas(pudf_train, schema=udf_schema).\ 149 | cache() 150 | dt_cross_evals.count() 151 | self.set_output_data(dt_cross_evals) 152 | 153 | 154 | Engine() 155 | Engine().get_instance().initialize_env() 156 | # pipeline_name = Path(__file__).stem 157 | pipeline_name = "Remote Testing" 158 | Engine().get_instance().initialize_logger(pipeline_name=pipeline_name) 159 | # Engine().get_instance().spark().conf.set("spark.sql.execution.arrow.enabled", "true") 160 | 161 | run_id = 'test_run_id' 162 | 163 | step_loadData = Step_loadData( 164 | spark=Engine.get_instance().spark(), 165 | run_id=run_id 166 | ) 167 | 168 | step_loadData.initialize( 169 | name_dataset='iris' 170 | ) 171 | 
172 | step_crossValidate = Step_crossValidate( 173 | spark=Engine.get_instance().spark(), 174 | run_id=run_id 175 | ) 176 | 177 | step_crossValidate.initialize( 178 | dt=step_loadData.output_data, 179 | pipeline_name=pipeline_name, 180 | appi_ik=Engine().get_instance().appi_ik, 181 | n_iter=1000 182 | ) 183 | 184 | step_crossValidate.output_data.dataframe.toPandas() 185 | -------------------------------------------------------------------------------- /src/tutorial/scripts/install_dbkframework.py: -------------------------------------------------------------------------------- 1 | """Build and installs the dbkframework.""" 2 | 3 | from pathlib import Path 4 | 5 | # import sys 6 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 7 | import os 8 | import json 9 | from dbkcore.core import Log 10 | from dbkenv.core import ResourceClient 11 | from dbkenv.core import Configuration 12 | from dbkenv.core import DatabricksResourceManager 13 | import argparse 14 | 15 | Log(name=Path(__file__).stem) 16 | 17 | 18 | def command_exec(command, ignore=False): 19 | """ 20 | Execute shell command. 21 | 22 | Parameters 23 | ---------- 24 | command : str 25 | Command to execute 26 | ignore : bool, optional 27 | Ignore exception, by default False 28 | 29 | Raises 30 | ------ 31 | Exception 32 | Raises exception if command failes 33 | """ 34 | Log.get_instance().log_info(f'Running command -> {command}') 35 | if not ignore: 36 | if os.system(command) != 0: 37 | raise Exception(f'Failed to execute: {command}') 38 | 39 | 40 | def parse_args(args_list=None): 41 | """ 42 | Parse command line arguments. 43 | 44 | Parameters 45 | ---------- 46 | args_list : [type], optional 47 | Argument list, by default None 48 | 49 | Returns 50 | ------- 51 | ArgumentParser 52 | Arguments parsed 53 | """ 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('-c', '--config_file', help="Full path of cluster's json configuration", type=str, required=True) 56 | args_parsed = parser.parse_args(args_list) 57 | return args_parsed 58 | 59 | 60 | def main(cluster_config_file): 61 | """ 62 | Execute the script. 
63 | 64 | Parameters 65 | ---------- 66 | cluster_config_file : str 67 | Path of the configuration file 68 | 69 | Raises 70 | ------ 71 | Exception 72 | Raises when script failes 73 | """ 74 | configuration = Configuration(file_load=True) 75 | with open(cluster_config_file.strip(), 'r') as cl: 76 | cluster_configuration = json.load(cl) 77 | 78 | cluster_name = cluster_configuration['cluster_name'] 79 | 80 | client = ResourceClient( 81 | host=configuration.DATABRICKS_HOST, 82 | personal_token=configuration.DATABRICKS_TOKEN 83 | ) 84 | drm = DatabricksResourceManager( 85 | client=client, 86 | cluster_name=cluster_name, 87 | cluster_configuration=cluster_configuration 88 | ) 89 | 90 | cluster_id = drm.cluster.cluster_id 91 | 92 | drm.cluster.start_cluster_and_wait() 93 | 94 | modules_to_deploy = [ 95 | 'dbkframework' 96 | ] 97 | 98 | pipelines_folder = Path(__file__).\ 99 | parent.\ 100 | parent.\ 101 | parent.\ 102 | absolute().\ 103 | joinpath('pipelines') 104 | 105 | for module in modules_to_deploy: 106 | 107 | package_folder = pipelines_folder.joinpath(module) 108 | dist_folder = package_folder.joinpath('dist') 109 | 110 | setup_file = package_folder.joinpath('setup.py') 111 | 112 | command_string = f"cd {str(package_folder)} && python {str(setup_file)} sdist bdist_wheel" 113 | res = os.system(command_string) 114 | 115 | if res != 0: 116 | raise Exception(f'Failed to build {module}') 117 | 118 | wheel = sorted([v for v in dist_folder.glob('*.whl')], key=lambda i: i.stat().st_ctime, reverse=True)[0] 119 | dbk_whl_name = wheel.name 120 | dbk_whl_root = 'dbfs:/FileStore/dev/artifacts/' 121 | dbk_whl_path = f'{dbk_whl_root}{dbk_whl_name}' 122 | 123 | command_exec(f'databricks fs rm {dbk_whl_root}', ignore=True) 124 | command_exec(f'databricks fs cp -r {wheel} {dbk_whl_path}') 125 | 126 | command_exec(f'databricks libraries uninstall --cluster-id {cluster_id} --whl {dbk_whl_path}') 127 | command_exec(f'databricks libraries install --cluster-id {cluster_id} --whl {dbk_whl_path}') 128 | 129 | command_exec(f'databricks clusters restart --cluster-id {cluster_id}') 130 | 131 | 132 | if __name__ == "__main__": 133 | args = parse_args() 134 | main(cluster_config_file=args.config_file) 135 | -------------------------------------------------------------------------------- /src/tutorial/scripts/local_config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 5 | import json 6 | from dbkcore.core import Log 7 | from dbkenv.core import ResourceClient 8 | from dbkenv.core import Configuration 9 | from dbkenv.core import DatabricksResourceManager 10 | from dbkenv.local import DatabricksLocal 11 | import argparse 12 | 13 | 14 | 15 | 16 | Log(name=Path(__file__).stem) 17 | 18 | 19 | def parse_args(args_list=None): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-c', '--config_file', help="Full path of cluster's json configuration", type=str, required=True) 22 | args_parsed = parser.parse_args(args_list) 23 | return args_parsed 24 | 25 | 26 | def main(cluster_config_file): 27 | 28 | configuration = Configuration(file_load=True) 29 | # cluster_config_file = str(Path(__file__).parent.joinpath('unittest_cluster.json')) 30 | 31 | with open(cluster_config_file.strip(), 'r') as cl: 32 | cluster_configuration = json.load(cl) 33 | 34 | cluster_name = cluster_configuration['cluster_name'] 35 | 36 | client = ResourceClient( 37 | 
host=configuration.DATABRICKS_HOST, 38 | personal_token=configuration.DATABRICKS_TOKEN 39 | ) 40 | drm = DatabricksResourceManager( 41 | client=client, 42 | cluster_name=cluster_name, 43 | cluster_configuration=cluster_configuration 44 | ) 45 | 46 | cluster_id = drm.cluster.cluster_id 47 | 48 | local_config = DatabricksLocal( 49 | host=configuration.DATABRICKS_HOST, 50 | databricks_token=configuration.DATABRICKS_TOKEN, 51 | cluster_id=cluster_id, 52 | org_id=configuration.DATABRICKS_ORDGID 53 | ) 54 | local_config.initialize() 55 | 56 | 57 | if __name__ == "__main__": 58 | args = parse_args() 59 | main(cluster_config_file=args.config_file) 60 | -------------------------------------------------------------------------------- /src/tutorial/scripts/set_secrets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add secret to Databricks. 3 | """ 4 | import sys 5 | from pathlib import Path 6 | 7 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 8 | from dbkenv.core import Configuration 9 | from dbkenv.core import ResourceClient 10 | from dbkenv.core import Secret 11 | from dbkenv.core import Log 12 | import argparse 13 | 14 | 15 | Log(name=Path(__file__).stem) 16 | 17 | 18 | def parse_args(args_list=None): 19 | """ 20 | Parse command line arguments. 21 | 22 | Parameters 23 | ---------- 24 | args_list : [type], optional 25 | [description], by default None 26 | 27 | Returns 28 | ------- 29 | ArgumentParser 30 | Parsed arguments 31 | """ 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--scope', help="Scope to use", type=str, required=True) 34 | parser.add_argument('--secret_name', help="Name of the secret", type=str, required=True) 35 | parser.add_argument('--secret_value', help="Value of the secret", type=str, required=True) 36 | args_parsed = parser.parse_args(args_list) 37 | return args_parsed 38 | 39 | 40 | def main( 41 | scope: str, 42 | secret_name: str, 43 | secret_value: str 44 | ): 45 | """ 46 | Run main function. 47 | 48 | Parameters 49 | ---------- 50 | scope : str 51 | Scope to use 52 | secret_name : str 53 | Name of the secret 54 | secret_value : str 55 | Value of the secret 56 | """ 57 | configuration = Configuration(file_load=True) 58 | 59 | client = ResourceClient( 60 | host=configuration.DATABRICKS_HOST, 61 | personal_token=configuration.DATABRICKS_TOKEN 62 | ) 63 | secret_client = Secret( 64 | client=client 65 | ) 66 | 67 | scopes = secret_client.scopes() 68 | if scope not in scopes: 69 | secret_client.add_scope( 70 | scope=scope 71 | ) 72 | 73 | secret_client.add_secret( 74 | scope=scope, 75 | secret_name=secret_name, 76 | secret_value=secret_value 77 | ) 78 | 79 | 80 | if __name__ == "__main__": 81 | args = parse_args() 82 | main( 83 | scope=args.scope, 84 | secret_name=args.secret_name, 85 | secret_value=args.secret_value 86 | ) 87 | # main( 88 | # scope='test_scope', 89 | # secret_name='test_name', 90 | # secret_value='test_value' 91 | # ) 92 | -------------------------------------------------------------------------------- /workspace.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | } 6 | ], 7 | "settings": { 8 | "python.venvPath": "/usr/local/lib/python3.7/site-packages/pyspark/jars", 9 | "python.testing.pytestArgs": [ 10 | "src" 11 | ], 12 | "python.testing.unittestEnabled": false, 13 | "python.testing.nosetestsEnabled": false, 14 | "python.testing.pytestEnabled": true, 15 | "python.envFile": "${workspaceFolder}/.env", 16 | "python.analysis.extraPaths": [ 17 | "src/modules" 18 | ], 19 | "workbench.colorCustomizations": { 20 | "activityBar.activeBackground": "#93fcdc", 21 | "activityBar.activeBorder": "#fa45d4", 22 | "activityBar.background": "#93e6fc", 23 | "activityBar.foreground": "#15202b", 24 | "activityBar.inactiveForeground": "#15202b99", 25 | "activityBarBadge.background": "#fa45d4", 26 | "activityBarBadge.foreground": "#15202b", 27 | "statusBar.background": "#93fcdc", 28 | "statusBar.foreground": "#15202b", 29 | "statusBarItem.hoverBackground": "#2fd0fa", 30 | "titleBar.activeBackground": "#93fcdc", 31 | "titleBar.activeForeground": "#15202b", 32 | "titleBar.inactiveBackground": "#93fcdc99", 33 | "titleBar.inactiveForeground": "#15202b99" 34 | }, 35 | "peacock.remoteColor": "#93fcdc" 36 | }, 37 | "extensions": { 38 | "recommendations": [ 39 | "ms-python.python", 40 | "visualstudioexptteam.vscodeintellicode", 41 | "ms-python.vscode-pylance", 42 | "ms-azuretools.vscode-docker", 43 | "ms-vscode-remote.remote-containers" 44 | ] 45 | } 46 | } --------------------------------------------------------------------------------
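For reference, the tutorial assets above can also be driven end to end through `src/tutorial/deploy.py`, which chains `create_cluster`, `local_config`, `set_secrets` (storing the `APPI_IK` Application Insights key in the `config` scope), and `install_dbkframework`. The snippet below is only an illustrative sketch, not a file from the repository: it assumes the Databricks settings read by `Configuration(file_load=True)` (such as `DATABRICKS_HOST` and `DATABRICKS_TOKEN`) are already available in your environment, and the `APPI_IK` value shown is a placeholder.

```bash
# Illustrative sketch: run the tutorial deployment end to end.
# Assumes the DATABRICKS_HOST / DATABRICKS_TOKEN (and org id) values consumed by
# Configuration(file_load=True) are already configured; APPI_IK is a placeholder.
export APPI_IK="<application-insights-instrumentation-key>"

cd /workspaces/MLOpsBasic-Databricks/src/tutorial && \
python deploy.py -c cluster_config.json
```

The `-c` argument points at the same `cluster_config.json` used by the individual commands in the tutorial README.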