├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .gitignore ├── .pylintrc ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── dev.env ├── docs ├── README.md └── images │ ├── AppInsightConnectionString.jpg │ ├── AzureResources.JPG │ ├── DatabricksNotebookExecution.JPG │ ├── DatabricksORGIDandHOSTID.JPG │ ├── DatabricksTokenGeneration.jpg │ ├── DevContainer.jpg │ ├── DockerImageLoad.jpg │ ├── InstallExtensions.jpg │ ├── MLOps_for_databricks_Solution_Acclerator_logo.JPG │ ├── OutputOfTheConfigurationStep.jpg │ ├── Overview.JPG │ ├── PipelineSteps.JPG │ ├── PowershellScreen.jpg │ ├── SecretsFileImage.jpg │ ├── SuccessfulClusterCreation.JPG │ ├── Verify_Python_Interpreter.jpg │ ├── cluster-upload-wheel.jpg │ ├── databricks-connect-pass.jpg │ ├── final.jpg │ ├── map01.png │ ├── map02.png │ ├── map03.png │ ├── map04.png │ ├── map05.png │ ├── map06.png │ ├── map07.png │ ├── pythonversion.jpg │ └── workspaceselection.jpg ├── requirements.txt ├── src ├── README.md ├── modules │ ├── acai_ml │ │ ├── __init__.py │ │ └── core.py │ ├── dbkcore │ │ ├── __init__.py │ │ ├── core.py │ │ ├── helpers.py │ │ └── requirements.txt │ ├── dbkdev │ │ ├── __init__.py │ │ ├── core.py │ │ ├── data_steps.py │ │ └── requirements.txt │ ├── dbkenv │ │ ├── __init__.py │ │ ├── core.py │ │ ├── local.py │ │ └── requirements.txt │ ├── devmaint │ │ ├── __init__.py │ │ ├── command_line.py │ │ ├── docgenerator.py │ │ └── requirements.txt │ └── tests │ │ ├── dbkcore │ │ └── test_logger.py │ │ ├── dbkenv │ │ ├── content │ │ │ ├── py_file.py │ │ │ └── unittest_notebook.py │ │ ├── test_cluster.py │ │ └── unittest_cluster.json │ │ └── pytest.ini ├── pipelines │ └── dbkframework │ │ ├── documentation.md │ │ ├── requirements.txt │ │ └── setup.py ├── setup │ ├── arm-templates │ │ ├── parameters.json │ │ └── template.json │ ├── config │ │ └── setup_config.json │ ├── configureResources.ps1 │ ├── deployResources.ps1 │ └── util │ │ ├── DBCluster-Configuration.json │ │ ├── Deploy-DBCluster.ps1 │ │ └── Deploy-DBCluster_using_CLI.ps1 └── tutorial │ ├── README.md │ ├── cluster_config.json │ ├── create_databricks_secrets.py │ ├── deploy.py │ └── scripts │ ├── __init__.py │ ├── create_cluster.py │ ├── framework_testing │ └── remote_analysis.py │ ├── install_dbkframework.py │ ├── local_config.py │ └── set_secrets.py └── workspace.code-workspace /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.166.0/containers/python-3/.devcontainer/base.Dockerfile 2 | 3 | # [Choice] Python version: 3, 3.9, 3.8, 3.7, 3.6 4 | ARG VARIANT="3" 5 | FROM mcr.microsoft.com/vscode/devcontainers/python:0-${VARIANT} 6 | 7 | # [Option] Install Node.js 8 | ARG INSTALL_NODE="true" 9 | ARG NODE_VERSION="lts/*" 10 | RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi 11 | 12 | # [Optional] Uncomment this section to install additional OS packages. 
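# The RUN block below adds the Debian "stretch" security repository because OpenJDK 8 is no longer
# available in the default package sources of newer Debian base images; openjdk-8-jdk is required by the
# databricks-connect client installed further down, and pandoc supports the pypandoc dependency used for
# documentation generation.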
13 | RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 14 | && apt-get -y install --no-install-recommends software-properties-common \ 15 | && apt-add-repository 'deb http://security.debian.org/debian-security stretch/updates main' \ 16 | && apt-get update \ 17 | && apt-get -y install --no-install-recommends openjdk-8-jdk pandoc 18 | 19 | RUN pip3 --disable-pip-version-check --no-cache-dir install databricks-connect==7.3.* 20 | 21 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. 22 | COPY requirements.txt /tmp/pip-tmp/ 23 | RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ 24 | && rm -rf /tmp/pip-tmp 25 | 26 | # [Optional] Uncomment this line to install global node packages. 27 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.166.0/containers/python-3 3 | { 4 | "name": "Python 3", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "context": "..", 8 | "args": { 9 | // Update 'VARIANT' to pick a Python version: 3, 3.6, 3.7, 3.8, 3.9 10 | "VARIANT": "3.7", 11 | // Options 12 | "INSTALL_NODE": "false", 13 | "NODE_VERSION": "lts/*" 14 | } 15 | }, 16 | 17 | // Set *default* container specific settings.json values on container create. 18 | "settings": { 19 | "terminal.integrated.shell.linux": "/bin/bash", 20 | "python.pythonPath": "/usr/local/bin/python", 21 | "python.venvPath": "/usr/local/lib/python3.7/site-packages/pyspark/jars", 22 | "python.envFile": ".env", 23 | // "python.linting.enabled": true, 24 | "python.linting.pylintEnabled": false, 25 | "python.linting.pydocstyleEnabled": true, 26 | "python.linting.flake8Enabled": true, 27 | "python.linting.flake8Args": [ 28 | "--ignore=E501, E402" 29 | ], //["--ignore=E501,E123"] 30 | "python.linting.enabled": true, 31 | "python.linting.pylamaEnabled": false, 32 | "python.linting.pylamaArgs": [ 33 | "--ignore=E501" 34 | ], 35 | "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", 36 | "python.formatting.blackPath": "/usr/local/py-utils/bin/black", 37 | "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", 38 | "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", 39 | "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", 40 | "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", 41 | "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", 42 | "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", 43 | "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint", 44 | "python.analysis.extraPaths": [ 45 | "src/modules" 46 | ], 47 | "python.testing.pytestArgs": [ 48 | "src" 49 | ], 50 | "python.testing.unittestEnabled": false, 51 | "python.testing.nosetestsEnabled": false, 52 | "python.testing.pytestEnabled": true 53 | }, 54 | 55 | // Add the IDs of extensions you want installed when the container is created. 
56 | "extensions": [ 57 | "ms-python.python", 58 | "visualstudioexptteam.vscodeintellicode", 59 | "ms-python.vscode-pylance", 60 | "ms-azuretools.vscode-docker", 61 | "ms-vscode-remote.remote-containers", 62 | "irongeek.vscode-env", 63 | "njpwerner.autodocstring" 64 | ], 65 | "runArgs": [ 66 | "--env-file", 67 | ".env" 68 | ], 69 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 70 | // "forwardPorts": [], 71 | 72 | // Use 'postCreateCommand' to run commands after the container is created. 73 | // "postCreateCommand": "pip3 install --user -r requirements.txt", 74 | 75 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 76 | "remoteUser": "vscode" 77 | } 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/** 2 | notes.md 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | venv/ 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | *.vscode 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | .DS_Store 112 | 113 | #custom 114 | config.json 115 | 116 | #appsecret.txt 117 | *appsecret.txt 118 | 119 | #DBKToken.txt 120 | *DBKtoken.txt 121 | *setup_config.json -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable=C0114, # Missing module docstring 3 | C0115, # Missing class docstring 4 | C0116, # Missing function docstring 5 | R0801, # Similar lines 6 | W0511 # TODO's 7 | 8 | [BASIC] 9 | good-names=i, j, 10 | k, v, 11 | f, 12 | ex, 13 | Run, 14 | _, 15 | df, 16 | ws, sp, 17 | X, y, X_train, X_test 18 | 19 | [FORMAT] 20 | max-line-length=120 21 | max-module-lines=1000 22 | 23 | [SIMILARITIES] 24 | ignore-comments=yes 25 | ignore-docstrings=yes 26 | ignore-imports=yes 27 | 
min-similarity-lines=4 28 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Banner](docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG) 2 | 3 | # About this repository 4 | 5 | This repository contains the Databricks development framework for delivering any Data Engineering projects, and machine learning projects based on the Azure Technologies. 6 | 7 | # Details of the accelerator 8 | 9 | The accelerator contains few of the core features of Databricks development which can be extended or reused in any implementation projects with Databricks. 10 | 11 | ![overview](docs/images/Overview.JPG) 12 | 13 | - Logging Framework using the [Opensensus Azure Monitor Exporters](https://github.com/census-instrumentation/opencensus-python/tree/master/contrib/opencensus-ext-azure) 14 | - Support for Databricks development from VS Code IDE using the [Databricks Connect](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/databricks-connect#visual-studio-code) feature. 15 | - continuous development with [Python Local Packaging](https://packaging.python.org/tutorials/packaging-projects/) 16 | - Implementation of the Databricks utilities in VS Code such as dbutils, notebook execution, secret handling. 17 | - Example Model file which uses the framework end to end. 
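As a quick orientation, the sketch below shows how the logging pieces in `src/modules/dbkcore` are intended to be used (with `src/modules` on the `PYTHONPATH`). The pipeline name and function are illustrative; passing an Application Insight connection string to `Log` additionally exports the telemetry to Azure, as covered later in this guide.

``` python
from dbkcore.core import Log, trace

# Instantiate the singleton logger once per run. Without a connection string the
# messages are only printed locally; in this accelerator the connection string
# normally comes from the APPI_IK variable or a Databricks secret.
Log("tutorial_pipeline")

@trace  # logs module, qualified name and elapsed time of each call
def prepare_data(rows: int = 10):
    Log.get_instance().log_info(f"Preparing {rows} rows")
    return list(range(rows))

prepare_data(rows=5)
```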
18 | 19 | 20 | # Prerequisites 21 | 22 | To successfully complete your solution, you will need to have access to and or provisioned the following: 23 | 24 | - Access to an Azure subscription 25 | - Service Principal (valid Client ID and secret ) which has the contributor permission the subscription. We are going to create the resource group using the service principal. 26 | - VS Code installed. 27 | - Docker Desktop Installed. 28 | 29 | # Create the Service Principal 30 | 31 | - [Instruction to create the service principal](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#register-an-application-with-azure-ad-and-create-a-service-principal) 32 | - [Instruction to assign role to the service principal access over the Subscription](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#assign-a-role-to-the-application). Please provide **contributor** access over the subscription. 33 | - [Instruction to Get application ID and tenant ID for the application you registered](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#get-tenant-and-app-id-values-for-signing-in) 34 | - [Instruction to create application secret](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#option-2-create-a-new-application-secret). The application Secret is needed at the later part of this setup. Please copy the **value** and store it in a notepad for now. 35 | 36 | # Getting Started 37 | 38 | The below sections provide the step by step approach to set up the solution. As part of this solution, we need the following resources to be provisioned in a resource group. 39 | 40 | 1. Azure Databricks 41 | 2. Application Insight Instance. 42 | 3. A log analytics workspace for the App Insight. 43 | 4. Azure Key Vault to store the secrets. 44 | 5. A Storage Account. 45 | 46 | ## Section 1: Docker image load in VS Code 47 | 48 | ![map01](docs/images/map01.png) 49 | 1. Clone the Repository : https://github.com/microsoft/dstoolkit-ml-ops-for-databricks/pulls 50 | 2. Install Docker Desktop. In this solution, the Visual Code uses the docker image as a remote container to run the solution. 51 | 3. Create .env file in the root folder, and keep the file blank for now. (root folder is the parent folder of the project) 52 | 4. In the repo, open the workspace. File: workspace.ode-workspace. 53 | 54 | > Once you click the file, you will get the "Open Workspace" button at right bottom corner in the code editor. Click it to open the solution into the vscode workspace. 55 | 56 | ![workspaceselection](docs/images/workspaceselection.jpg) 57 | 58 | 5. We need to connect to the [docker image as remote container in vs code](https://code.visualstudio.com/docs/remote/attach-container#_attach-to-a-docker-container). In the code repository, we have ./.devcontainer folder that has required docker image file and docker configuration file. Once we load the repo in the vscode, we generally get the prompt. Select "Reopen in Container". Otherwise we can go to the VS code command palette ( ctrl+shift+P in windows), and select the option "Remote-Containers: Rebuild and Reopen in Containers" 59 | 60 | ![DockerImageLoad](docs/images/DockerImageLoad.jpg) 61 | 62 | 6. In the background, it is going to build a docker image. We need to wait for sometime to complete build. the docker image will basically contain the a linux environment which has python 3.7 installed. 
Please have a look at the configuration file(.devcontainer\devcontainer.json) for more details. 63 | 7. Once it is loaded. we will be able to see the python interpreter is loaded successfully. Incase it does not show, we need to load the interpreter manually. To do that, click on the select python interpreter => Entire workspace => /usr/local/bin/python 64 | 65 | ![pythonversion](docs/images/pythonversion.jpg) 66 | 67 | 8. You will be prompted with installing the required extension on the right bottom corner. Install the extensions by clicking on the prompts. 68 | 69 | ![InstallExtensions](docs/images/InstallExtensions.jpg) 70 | 71 | 9. Once the steps are completed, you should be able to see the python extensions as below: 72 | 73 | ![pythonversion](docs/images/pythonversion.jpg) 74 | 75 | ## Section 2: Databricks environment creation 76 | 77 | ![map02](docs/images/map02.png) 78 | 79 | The objectives of this section are: 80 | 81 | - Create the required resources. 82 | 1. Azure Databricks 83 | 2. Application Insight Instance. 84 | 3. A log analytics workspace for the App Insight. 85 | 4. Azure Key Vault to store the secrets. 86 | 5. A Storage Account. 87 | 88 | - Create the .env file for the local development. 89 | 90 | > You don't need to create the environment again if you already had a databricks environment. You can directly create the .env file ( Section 4 ) with the details of your environment. 91 | 92 | 1. Go to **src/setup/config/setup_config.json**, and complete the json files with the values; according to your environment. The service principal should be having the contributor access over the subscription you are using. Or if you choose to create the resource group manually, or reuse an existing resource group, then it should have the contributor access on the resource group itself. 93 | 94 | > These details would be used to connect to the Azure Subscription for the resource creation. 95 | 96 | ``` json 97 | { 98 | 99 | "applicationID":"deeadfb5-27xxxaad3-9fd39049b450", 100 | "tenantID":"72f988bf-8xxxxx2d7cd011db47", 101 | "subscriptionID":"89c37dd8xxxx-1cfb98c0262e", 102 | "resourceGroupName":"AccleratorDBKMLOps2", 103 | "resourceGroupLocation":"NorthEurope" 104 | } 105 | ``` 106 | 107 | 2. create the file and provide the client ID secret in this file : **src/setup/vault/appsecret.txt** 108 | 109 | > Incase you are not able to create the file from the solution, you can directly go to the file explorer to create the file. 110 | > 111 | > NOTE: DBToken.txt will be created in the later section, please ignore it for now. 112 | 113 | At the end of the secret files creation, the folder structure will like below: 114 | 115 | ![SecretsFileImage](docs/images/SecretsFileImage.jpg) 116 | 117 | 3. Open the Powershell ISE in your local machine. We are going to run the Powershell script to create the required resources. The name of the resources are basically having a prefix to the resourcegroup name. 118 | 4. set the root path of the Powershell terminal till setup, and execute the deployResource.ps1 119 | 120 | ``` powershell 121 | cd "C:\Users\projects\New folder\MLOpsBasic-Databricks\src\setup" 122 | .\deployResources.ps1 123 | ``` 124 | 125 | > If you receive the below error, execute the command [Set-ExecutionPolicy RemoteSigned] 126 | 127 | ``` cmd 128 | >.\deployResources.ps1 : File C:\Users\projects\New 129 | folder\MLOpsBasic-Databricks\src\setup\deployResources.ps1 cannot be loaded because running scripts is disabled on this. 
130 | ``` 131 | > if you get the error module is not found, and if Powershell ISE is not able to recognize any specific Powershell command, then Install the Powershell Az Module. [Instructions](https://docs.microsoft.com/en-us/powershell/azure/install-az-ps?view=azps-6.4.0) 132 | ``` cmd 133 | Install-Module Az 134 | ``` 135 | 136 | ![PowershellScreen](docs/images/PowershellScreen.jpg) 137 | 138 | Post successful execution of the script, we can see the resources created successfully in the Azure Subscription. 139 | 140 | ![AzureResources](docs/images/AzureResources.JPG) 141 | 142 | 143 | ## Section 3: Databricks cluster creation 144 | 145 | ![map03](docs/images/map03.png) 146 | 147 | 1. To create the databricks cluster we need to have personal Access token created. Go to the Databricks workspace, and get the personal access token from the user setting, and save it in the file src/setup/vault/DBKtoken.txt 148 | 149 | ![DatabricksTokenGeneration](docs/images/DatabricksTokenGeneration.jpg) 150 | 151 | 2. Run the following command 152 | 153 | ``` cmd 154 | cd "C:\Users\projects\New folder\MLOpsBasic-Databricks\src\setup" 155 | 156 | .\configureResources.ps1 157 | ``` 158 | 159 | 3. At the end of the script execution, we will be able to see the databricks cluster has been created successfully.the config file: src\setup\util\DBCluster-Configuration.json is being used to create the cluster. 160 | 161 | ![SuccessfulClusterCreation](docs/images/SuccessfulClusterCreation.JPG) 162 | 163 | 4. Copy the output of the script and paste it to the .env file which we had created previously. Please note that the values of the variables will be different as per your environment configuration. the later section (Section 4) describes the creation of .env file in detail. 164 | 165 | ![OutputOfTheConfigurationStep](docs/images/OutputOfTheConfigurationStep.jpg) 166 | 167 | ## Section 4: Create the .env file 168 | 169 | ![map04](docs/images/map04.png) 170 | 171 | We need to manually change the databricks host and appI_IK values. Other values should be "as is" from the output of the previous script. 172 | 173 | - PYTHONPATH: /workspaces/dstoolkit-ml-ops-for-databricks/src/modules [This is full path to the module folder in the repository.] 174 | - APPI_IK: connection string of the application insight 175 | - DATABRICKS_HOST: The URL of the databricks workspace. 176 | - DATABRICKS_TOKEN: Databricks Personal Access Token which was generated in the previous step. 177 | - DATABRICKS_ORDGID: OrgID of the databricks that can be fetched from the databricks URL. 178 | 179 | ![DatabricksORGIDandHOSTID](docs/images/DatabricksORGIDandHOSTID.JPG) 180 | 181 | Application Insight Connection String 182 | 183 | ![AppInsightConnectionString](docs/images/AppInsightConnectionString.jpg) 184 | 185 | At the end, our .env file is going to look as below. You can copy the content and change the values according to your environment. 186 | 187 | ``` conf 188 | PYTHONPATH=/workspaces/dstoolkit-ml-ops-for-databricks/src/modules 189 | APPI_IK=InstrumentationKey=e6221ea6xxxxxxf-8a0985a1502f;IngestionEndpoint=https://northeurope-2.in.applicationinsights.azure.com/ 190 | DATABRICKS_HOST=https://adb-7936878321001673.13.azuredatabricks.net 191 | DATABRICKS_TOKEN= 192 | DATABRICKS_ORDGID=7936878321001673 193 | ``` 194 | 195 | ## Section 5: Configure the Databricks connect 196 | 197 | ![map05](docs/images/map05.png) 198 | 199 | 1. In this step we are going to configure the databricks connect for VS code to connect to databricks. 
Run the below command for that from the docker (VS Code) terminal. 200 | 201 | ``` bash 202 | $ python "src/tutorial/scripts/local_config.py" -c "src/tutorial/cluster_config.json" 203 | ``` 204 | 205 | >Note: If you get any error saying that "ModelNotFound : No module names dbkcore". Try to reload the VS code window and see if you are getting prompt right bottom corner saying that configuration file changes, rebuild the docker image. Rebuild it and then reload the window. Post that you would not be getting any error. Also, check if the python interpreter is being selected properly. They python interpreter path should be **/usr/local/bin/python ** 206 | 207 | ![Verify_Python_Interpreter](docs/images/Verify_Python_Interpreter.jpg) 208 | 209 | ### Verify 210 | 211 | 1. You will be able to see the message All tests passed. 212 | 213 | ![databricks-connect-pass](docs/images/databricks-connect-pass.jpg) 214 | 215 | ## Section 6: Wheel creation and workspace upload 216 | 217 | ![map06](docs/images/map06.png) 218 | 219 | In this section, we will create the private python package and upload it to the databricks environment. 220 | 221 | 1. Run the below command: 222 | 223 | ``` bash 224 | python src/tutorial/scripts/install_dbkframework.py -c "src/tutorial/cluster_config.json" 225 | ``` 226 | 227 | Post Execution of the script, we will be able to see the module to be installed. 228 | 229 | ![cluster-upload-wheel](docs/images/cluster-upload-wheel.jpg) 230 | 231 | ## Section 7: Using the framework 232 | 233 | ![map07](docs/images/map07.png) 234 | 235 | We have a pipeline that performs the data preparation, unit testing, logging, training of the model. 236 | 237 | 238 | ![PipelineSteps](docs/images/PipelineSteps.JPG) 239 | 240 | 241 | ### Execution from Local VS Code 242 | 243 | To check if the framework is working fine or not, let's execute this file : **src/tutorial/scripts/framework_testing/remote_analysis.py** . It is better to execute is using the interactive window. As the Interactive window can show the pandas dataframe which is the output of the script. Otherwise the script can be executed from the Terminal as well. 244 | To run the script from the interactive window, select the whole script => right click => run the selection in the interactive window. 245 | 246 | Post running the script, we will be able to see the data in the terminal. 247 | 248 | ![final](docs/images/final.jpg) 249 | 250 | ### Execution from Databricks 251 | 252 | In order to run the same notebook in the databricks, we just need to create a databricks secrets for the application insight connection string. 
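In the framework, `Engine.initialize_logger` (src/modules/acai_ml/core.py) reads this secret from the scope `config` under the name `APPI_IK` by default. Purely as an illustration of what the script referenced next automates, an equivalent manual setup with the Databricks CLI would look roughly like this:

``` bash
# Illustration only: scope and key names follow the defaults in acai_ml/core.py
databricks secrets create-scope --scope config
databricks secrets put --scope config --key APPI_IK --string-value "<Application Insight connection string>"
```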
253 | 254 | For this, we can execute the below query: 255 | 256 | ``` bash 257 | python src/tutorial/create_databricks_secrets.py 258 | 259 | ``` 260 | 261 | After copying the content of the remote_analysis.py in the databricks notebook, we get the output as below: 262 | 263 | ![DatabricksNotebookExecution](docs/images/DatabricksNotebookExecution.JPG) 264 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /dev.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=/workspaces/MLOpsBasic-Databricks/src/modules 2 | APPI_IK="" 3 | DATABRICKS_HOST="" 4 | DATABRICKS_TOKEN="" 5 | DATABRICKS_ORDGID="" 6 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /docs/images/AppInsightConnectionString.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/AppInsightConnectionString.jpg -------------------------------------------------------------------------------- /docs/images/AzureResources.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/AzureResources.JPG -------------------------------------------------------------------------------- /docs/images/DatabricksNotebookExecution.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DatabricksNotebookExecution.JPG -------------------------------------------------------------------------------- /docs/images/DatabricksORGIDandHOSTID.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DatabricksORGIDandHOSTID.JPG 
-------------------------------------------------------------------------------- /docs/images/DatabricksTokenGeneration.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DatabricksTokenGeneration.jpg -------------------------------------------------------------------------------- /docs/images/DevContainer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DevContainer.jpg -------------------------------------------------------------------------------- /docs/images/DockerImageLoad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/DockerImageLoad.jpg -------------------------------------------------------------------------------- /docs/images/InstallExtensions.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/InstallExtensions.jpg -------------------------------------------------------------------------------- /docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG -------------------------------------------------------------------------------- /docs/images/OutputOfTheConfigurationStep.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/OutputOfTheConfigurationStep.jpg -------------------------------------------------------------------------------- /docs/images/Overview.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/Overview.JPG -------------------------------------------------------------------------------- /docs/images/PipelineSteps.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/PipelineSteps.JPG -------------------------------------------------------------------------------- /docs/images/PowershellScreen.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/PowershellScreen.jpg -------------------------------------------------------------------------------- /docs/images/SecretsFileImage.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/SecretsFileImage.jpg 
-------------------------------------------------------------------------------- /docs/images/SuccessfulClusterCreation.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/SuccessfulClusterCreation.JPG -------------------------------------------------------------------------------- /docs/images/Verify_Python_Interpreter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/Verify_Python_Interpreter.jpg -------------------------------------------------------------------------------- /docs/images/cluster-upload-wheel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/cluster-upload-wheel.jpg -------------------------------------------------------------------------------- /docs/images/databricks-connect-pass.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/databricks-connect-pass.jpg -------------------------------------------------------------------------------- /docs/images/final.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/final.jpg -------------------------------------------------------------------------------- /docs/images/map01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map01.png -------------------------------------------------------------------------------- /docs/images/map02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map02.png -------------------------------------------------------------------------------- /docs/images/map03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map03.png -------------------------------------------------------------------------------- /docs/images/map04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map04.png -------------------------------------------------------------------------------- /docs/images/map05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map05.png -------------------------------------------------------------------------------- /docs/images/map06.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map06.png -------------------------------------------------------------------------------- /docs/images/map07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/map07.png -------------------------------------------------------------------------------- /docs/images/pythonversion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/pythonversion.jpg -------------------------------------------------------------------------------- /docs/images/workspaceselection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/docs/images/workspaceselection.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | opencensus-ext-azure>=1.0.7 3 | # databricks-connect==7.3.* 4 | databricks-cli>=0.14.3 5 | typeguard>=2.12.0 6 | pytest>=6.2.3 7 | jupyter>=1.0.0 8 | python-dotenv>=0.17.0 9 | pypandoc>=1.4 10 | pdoc3>=0.7.4 11 | pandas>=1.2.4 12 | setuptools>=56.0.0 13 | pydataset>=0.2.0 14 | scikit-learn>=0.24.1 15 | PyArrow>=0.15.1 -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /src/modules/acai_ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/acai_ml/__init__.py -------------------------------------------------------------------------------- /src/modules/acai_ml/core.py: -------------------------------------------------------------------------------- 1 | from dbkdev.core import DevelopmentClient 2 | from dbkdev.core import DevelopmentEngine 3 | from dbkcore.core import Log 4 | from dbkcore.core import trace 5 | from dbkcore.core import Singleton 6 | from dbkdev.core import IdeEnvironment 7 | from pyspark.sql import SparkSession 8 | from typing import Dict 9 | 10 | 11 | class Engine(metaclass=Singleton): 12 | 13 | """ 14 | This is the core of the framework. 15 | It configures the environment to interact with the remote Databricks. 16 | """ 17 | 18 | def __init__(self): 19 | """ 20 | Instantiate the current object 21 | 22 | """ 23 | self.__ide_environment = DevelopmentEngine().get_instance().ide_environment 24 | self.appi_ik = None 25 | 26 | def initialize_env(self): 27 | """ 28 | Initializes the DevelopmentClient. 29 | That is, sets the dbutils and spark context accordingly if the code is runt on cluster or locally. 
30 | """ 31 | DevelopmentClient( 32 | dbutils=DevelopmentEngine().get_instance().dbutils, 33 | spark=DevelopmentEngine().get_instance().spark, 34 | ide_environment=self.__ide_environment 35 | ) 36 | 37 | # def initialize_import(self): 38 | # # Setting pipeline module path 39 | # if self.__ide_environment == IdeEnvironment.DATABRICKS: 40 | # import sys 41 | # sys.path.append(str(self.pipelines_lib_path)) 42 | 43 | def initialize_logger( 44 | self, 45 | pipeline_name: str, 46 | appi_ik_scope: str = 'config', 47 | appi_ik_secret: str = 'APPI_IK' 48 | ): 49 | """ 50 | Initializes the logger 51 | 52 | Parameters 53 | ---------- 54 | pipeline_name : str 55 | Name to use with the logger. It will be the base name used for all the upcoming logs and tracing 56 | appi_ik_scope : str, optional 57 | Databricks secret scope where the Application Insight key is stored, by default "dds" 58 | appi_ik_secret : str, optional 59 | Databricks secret name where the Application Insight key is stored, by default "appiik" 60 | 61 | Raises 62 | ------ 63 | ValueError 64 | Unknown Ide Environment used 65 | """ 66 | # Configuring application insight key 67 | if self.__ide_environment == IdeEnvironment.LOCAL: 68 | from dbkenv.core import Configuration 69 | configurations = Configuration() 70 | self.appi_ik = configurations.APPINSIGHT_CONNECTIONSTRING 71 | elif self.__ide_environment == IdeEnvironment.DATABRICKS: 72 | self.appi_ik = DevelopmentEngine().get_instance().dbutils.secrets.get(appi_ik_scope, appi_ik_secret) 73 | else: 74 | raise ValueError(f'ide_environment unknown: {self.__ide_environment}') 75 | # Instantiating logger 76 | Log(pipeline_name, self.appi_ik) 77 | 78 | def spark(self) -> SparkSession: 79 | """ 80 | Current spark context 81 | 82 | Returns 83 | ------- 84 | SparkSession 85 | Spark context 86 | """ 87 | return DevelopmentClient().get_instance().spark 88 | 89 | def dbutils(self): 90 | """ 91 | Current dbutils 92 | 93 | Returns 94 | ------- 95 | DBUtils 96 | The DBUtils 97 | """ 98 | return DevelopmentClient().get_instance().dbutils 99 | 100 | @classmethod 101 | def get_instance(cls): 102 | """ 103 | Current singleton Engine 104 | 105 | Returns 106 | ------- 107 | Engine 108 | The Engine 109 | """ 110 | return Engine() 111 | 112 | @staticmethod 113 | def ide_environment() -> IdeEnvironment: 114 | """ 115 | Current Ide Environment 116 | 117 | Returns 118 | ------- 119 | IdeEnvironment 120 | The Ide Environment 121 | """ 122 | return DevelopmentClient().get_instance().ide_environment 123 | 124 | @staticmethod 125 | def is_ide_dataricks() -> bool: 126 | """ 127 | Checks if the current environment is Databricks 128 | 129 | Returns 130 | ------- 131 | bool 132 | Check result 133 | """ 134 | return DevelopmentClient().get_instance().ide_environment == IdeEnvironment.DATABRICKS 135 | 136 | @staticmethod 137 | def is_ide_local() -> bool: 138 | """ 139 | Checks if the current environment is Local 140 | 141 | Returns 142 | ------- 143 | bool 144 | Check result 145 | """ 146 | return DevelopmentClient().get_instance().ide_environment == IdeEnvironment.LOCAL 147 | 148 | def run_notebook_with_retry(self, notebook: str, args: Dict, timeout=86400, max_retries=3): 149 | """ 150 | Runs the specified notebook through dbutils 151 | 152 | Parameters 153 | ---------- 154 | notebook : str 155 | Name or path of the notebook 156 | args : Dict 157 | [description] 158 | timeout : int, optional 159 | [description], by default 86400 160 | max_retries : int, optional 161 | [description], by default 3 162 | 163 | Returns 164 | 
------- 165 | [type] 166 | [description] 167 | 168 | Raises 169 | ------ 170 | e 171 | [description] 172 | """ 173 | num_retries = 0 174 | while True: 175 | try: 176 | return DevelopmentClient().get_instance().dbutils.notebook.run(notebook, timeout, args) 177 | except Exception as e: 178 | if num_retries > max_retries: 179 | raise e 180 | else: 181 | print("Retrying error"), e 182 | num_retries += 1 183 | 184 | @trace 185 | # TODO: rename check 186 | def run_notebook(self, notebook: str, args: Dict, timeout=86400, error_raise=True): 187 | try: 188 | res = DevelopmentClient().get_instance().dbutils.notebook.run(notebook, timeout, args) 189 | except Exception as e: 190 | res = f"Notebook {notebook} failed" 191 | Log().get_instance().log_error(res) 192 | if error_raise: 193 | raise e 194 | return res 195 | -------------------------------------------------------------------------------- /src/modules/dbkcore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/dbkcore/__init__.py -------------------------------------------------------------------------------- /src/modules/dbkcore/core.py: -------------------------------------------------------------------------------- 1 | from opencensus.ext.azure.trace_exporter import AzureExporter 2 | from opencensus.trace.samplers import AlwaysOnSampler 3 | from opencensus.trace.tracer import Tracer 4 | from opencensus.trace.span import Span 5 | from opencensus.ext.azure.log_exporter import AzureLogHandler 6 | import logging 7 | from logging import Logger 8 | from abc import abstractmethod 9 | from typeguard import typechecked 10 | from .helpers import is_json_serializable 11 | from datetime import datetime 12 | import functools as _functools 13 | from typing import Any, List, Union 14 | from collections import OrderedDict 15 | import json 16 | 17 | 18 | 19 | 20 | 21 | class Singleton(type): 22 | """Create a singleton.""" 23 | 24 | _instances = OrderedDict() 25 | 26 | def __call__(cls, *args, **kwargs): 27 | """ 28 | Instantiate the singleton. 29 | 30 | Returns 31 | ------- 32 | any 33 | Parameters of the singleton 34 | """ 35 | if cls not in cls._instances: 36 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 37 | return cls._instances[cls] 38 | 39 | 40 | class Log(metaclass=Singleton): 41 | """Helper class for Application Insight Logger.""" 42 | 43 | def __init__(self, name: str, connection_string: str = None): 44 | """ 45 | Create a new Log object. 46 | 47 | Parameters 48 | ---------- 49 | name : str 50 | Name used by the logger for tracing 51 | connection_string : [type], optional 52 | Application Insight's connection string 53 | """ 54 | self.name = name 55 | self.__connection_string = connection_string 56 | 57 | # config_integration.trace_integrations(['logging']) 58 | # [Documentation](https://docs.microsoft.com/it-it/azure/azure-monitor/app/opencensus-python#logs) 59 | # [Documentation](https://docs.microsoft.com/it-it/azure/azure-monitor/app/opencensus-python#trace) 60 | self.__logger = self._get_logger() 61 | self.__tracer = self._get_tracer() 62 | 63 | def _get_logger(self) -> Logger: 64 | """ 65 | Create the logger with an Azure Handler for Application Insight. 
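The AzureLogHandler is attached only when a connection string was provided; otherwise the logger stays local (the log_* helpers still print every message to stdout).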
66 | 67 | Returns 68 | ------- 69 | Logger 70 | Current logger 71 | """ 72 | logger = logging.getLogger(name=self.name) 73 | logger.setLevel(logging.DEBUG) 74 | if self.__connection_string: 75 | handler = AzureLogHandler(connection_string=self.__connection_string) 76 | # handler.export_interval = 1 77 | # handler.max_batch_size = 1 78 | # handler.setFormatter(logging.Formatter('%(traceId)s:%(spanId)s:%(message)s')) 79 | logger.addHandler(handler) 80 | return logger 81 | 82 | def _get_tracer(self) -> Tracer: 83 | """ 84 | Create the Opencencus Tracer with Azure Exporter. 85 | 86 | Returns 87 | ------- 88 | Tracer 89 | Opencencus Tracer 90 | """ 91 | if self.__connection_string: 92 | tracer = Tracer( 93 | exporter=AzureExporter(connection_string=self.__connection_string), 94 | sampler=AlwaysOnSampler() 95 | ) 96 | else: 97 | tracer = None 98 | return tracer 99 | 100 | @classmethod 101 | def get_instance(cls): 102 | """Current instance""" 103 | return Log() 104 | 105 | @typechecked 106 | def trace_function(self, name: str, kwargs: dict) -> Union[Span, None]: 107 | """ 108 | Traces a function 109 | 110 | Parameters 111 | ---------- 112 | name : str 113 | Name of the function used for tracing 114 | 115 | name : kwargs 116 | The parameters of the function 117 | 118 | Returns 119 | ------- 120 | Span 121 | A Span that can be used for customizing logging 122 | """ 123 | tracer = self.__tracer 124 | if tracer: 125 | span = self.__tracer.span(name=name) 126 | if kwargs: 127 | for key, value in kwargs.items(): 128 | # if hasattr(value, 'to_json_logger'): 129 | # value = value.to_json_logger() 130 | if not is_json_serializable(value): 131 | value = str(value) 132 | span.add_attribute(key, value) 133 | # self.log_info(f"TRACING:{key}:{value}") 134 | else: 135 | span = None 136 | return span 137 | 138 | @property 139 | def tracer(self) -> Tracer: 140 | """ 141 | Tracer that will be used. 142 | 143 | Returns 144 | ------- 145 | Tracer 146 | The tracer 147 | """ 148 | return self.__tracer 149 | 150 | @property 151 | def logger(self) -> Logger: 152 | """ 153 | Logger that will be used. 154 | 155 | Returns 156 | ------- 157 | Logger 158 | This logger 159 | """ 160 | return self.__logger 161 | 162 | def __log_message(self, message: str, prefix: str) -> str: 163 | if prefix: 164 | msg = f'{prefix}:{message}' 165 | else: 166 | msg = f'{message}' 167 | # res = f"{self.name}:{msg}" 168 | return msg 169 | 170 | def log_info(self, message: str, prefix="", custom_dimension: dict = None): 171 | """ 172 | Log a message as info. 173 | 174 | Parameters 175 | ---------- 176 | message : str 177 | The message 178 | """ 179 | msg = self.__log_message(message=message, prefix=prefix) 180 | local_msg = f'{msg}\nDetails: {json.dumps(custom_dimension, indent=4)}' if custom_dimension else msg 181 | print(f'INFO:{local_msg}') 182 | properties = {'custom_dimensions': custom_dimension} 183 | self.__logger.info(msg, extra=properties) 184 | 185 | def log_debug(self, message: str, prefix="", custom_dimension: dict = None): 186 | """ 187 | Log a message as debug. 
188 | 189 | Parameters 190 | ---------- 191 | message : str 192 | The message 193 | """ 194 | msg = self.__log_message(message=message, prefix=prefix) 195 | local_msg = f'{msg}|Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg 196 | print(f'DEBUG:{local_msg}') 197 | # logging.debug(msg=f'{msg}| Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg) 198 | properties = {'custom_dimensions': custom_dimension if custom_dimension else {}} 199 | self.__logger.debug(msg=msg, extra=properties) 200 | 201 | def log_warning(self, message: str, prefix="", custom_dimension: dict = None): 202 | """ 203 | Log a message as warning. 204 | 205 | Parameters 206 | ---------- 207 | message : str 208 | The message 209 | """ 210 | msg = self.__log_message(message=message, prefix=prefix) 211 | local_msg = f'{msg}|Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg 212 | print(f'WARNING:{local_msg}') 213 | # logging.warning(msg=f'{msg}| Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg) 214 | properties = {'custom_dimensions': custom_dimension if custom_dimension else {}} 215 | self.__logger.warning(msg=msg, extra=properties) 216 | 217 | def log_error(self, message: str, include_stack=True, prefix="", custom_dimension: dict = None): 218 | """ 219 | Log a message as error. 220 | 221 | Parameters 222 | ---------- 223 | message : str 224 | The message 225 | """ 226 | msg = self.__log_message(message=message, prefix=prefix) 227 | local_msg = f'{msg}|Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg 228 | print(f'ERROR:{local_msg}') 229 | # logging.error(msg=f'{msg}| Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg, exc_info=include_stack) 230 | properties = {'custom_dimensions': custom_dimension if custom_dimension else {}} 231 | self.__logger.error(msg=msg, exc_info=include_stack, extra=properties) 232 | 233 | def log_critical(self, message: str, prefix="", custom_dimension: dict = None): 234 | """ 235 | Log a message as critical. 236 | 237 | Parameters 238 | ---------- 239 | message : str 240 | The message 241 | """ 242 | msg = self.__log_message(message=message, prefix=prefix) 243 | local_msg = f'{msg}|Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg 244 | print(f'CRITICAL:{local_msg}') 245 | properties = {'custom_dimensions': custom_dimension if custom_dimension else {}} 246 | logging.critical(msg=f'{msg}| Custom dimensions: {json.dumps(custom_dimension)}' if custom_dimension else msg) 247 | self.__logger.critical(msg=msg, extra=properties) 248 | 249 | 250 | @typechecked 251 | def trace(original_function: Any = None, *, attrs_refact: List[str] = None): 252 | """ 253 | Log the function call. 
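Each wrapped call is reported through Log with its module, qualified name and elapsed time as custom dimensions; keyword arguments listed in `attrs_refact` are masked with '***' before being logged.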
254 | 255 | Parameters 256 | ---------- 257 | original_function : Any, optional 258 | Function to trace, by default None 259 | attrs_refact : List[str], optional 260 | List of parameters to hide from logging, by default None 261 | """ 262 | 263 | def __log(func, fn_k, *args, **kwargs): 264 | start = datetime.utcnow() 265 | # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:START @ {start}", custom_dimension=fn_k) 266 | res = func(*args, **kwargs) 267 | end = datetime.utcnow() 268 | elapsed = end - start 269 | # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:COMPLETE @ {end}:ELAPSED {elapsed}", custom_dimension=fn_k) 270 | fn_k['elapsed'] = str(elapsed) 271 | fn_k['module'] = str(func.__module__) 272 | fn_k['qualname'] = str(func.__qualname__) 273 | Log.get_instance().log_info(f"Executed function {func.__module__}.{func.__qualname__}", custom_dimension=fn_k) 274 | return res 275 | 276 | """Decorator for tracing functions (link)[https://stackoverflow.com/a/24617244]""" 277 | def _decorate(func): 278 | @_functools.wraps(func) 279 | def wrapper(*args, **kwargs): 280 | fn_k = {} 281 | # if not attrs_refact: 282 | # fn_k = kwargs 283 | # else: 284 | for key, value in kwargs.items(): 285 | v = value if is_json_serializable(value) else 'not serializable' 286 | if attrs_refact: 287 | if key in attrs_refact: 288 | v = '***' 289 | fn_k[key] = v 290 | # if key not in attrs_refact: 291 | # fn_k[key] = value 292 | # else: 293 | # fn_k[key] = '***' 294 | if Log.get_instance().tracer: 295 | with Log.get_instance().trace_function( 296 | name=func.__name__, 297 | kwargs=fn_k 298 | ): 299 | return __log(func, fn_k, *args, **kwargs) 300 | # start = datetime.utcnow() 301 | # # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:START @ {start}", custom_dimension=fn_k) 302 | # res = func(*args, **kwargs) 303 | # end = datetime.utcnow() 304 | # elapsed = end - start 305 | # # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:COMPLETE @ {end}:ELAPSED {elapsed}", custom_dimension=fn_k) 306 | # fn_k['elapsed'] = str(elapsed) 307 | # fn_k['module'] = str(func.__module__) 308 | # fn_k['qualname'] = str(func.__qualname__) 309 | # Log.get_instance().log_info(f"Executed function {func.__module__}.{func.__qualname__}", custom_dimension=fn_k) 310 | # return res 311 | else: 312 | return __log(func, fn_k, *args, **kwargs) 313 | # start = datetime.utcnow() 314 | # # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:START @ {start}", custom_dimension=fn_k) 315 | # res = func(*args, **kwargs) 316 | # end = datetime.utcnow() 317 | # elapsed = end - start 318 | # # Log.get_instance().log_info(f"MODULE:{func.__module__}:FN:{func.__qualname__}:COMPLETE @ {end}:ELAPSED {elapsed}", custom_dimension=fn_k) 319 | # fn_k['elapsed'] = str(elapsed) 320 | # fn_k['module'] = str(func.__module__) 321 | # fn_k['qualname'] = str(func.__qualname__) 322 | # Log.get_instance().log_info(f"Executed function {func.__module__}.{func.__qualname__}", custom_dimension=fn_k) 323 | return wrapper 324 | 325 | if original_function: 326 | return _decorate(original_function) 327 | 328 | return _decorate 329 | 330 | 331 | @typechecked 332 | # class BaseObject(ABC): 333 | # TODO: if works, remove ABC class 334 | class BaseObject(): 335 | """ 336 | Base class to use with any object new object. 
337 | It implements the method log which will be used for logging 338 | 339 | """ 340 | 341 | @abstractmethod 342 | def log(self, prefix="", suffix=""): 343 | """ 344 | Specifices how to log the object 345 | """ 346 | pass 347 | 348 | @classmethod 349 | def class_name(cls) -> str: 350 | return cls.__name__.lower() 351 | -------------------------------------------------------------------------------- /src/modules/dbkcore/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various utilities for speed up development 3 | """ 4 | import os 5 | from pathlib import Path 6 | import json 7 | from typing import Any 8 | 9 | 10 | 11 | 12 | def current_directory() -> str: 13 | """ 14 | Get current directory. 15 | 16 | Returns 17 | ------- 18 | str 19 | The current directory path 20 | """ 21 | return os.path.dirname(os.path.realpath(__file__)) 22 | 23 | 24 | def add_folder_in_current_directory(folder_name: str) -> bool: 25 | """ 26 | Add a folder in the current directory. 27 | 28 | Parameters 29 | ---------- 30 | folder_name : str 31 | New folder name 32 | 33 | Returns 34 | ------- 35 | bool 36 | True if success 37 | """ 38 | output_folder = os.path.join(current_directory(), folder_name) 39 | os.makedirs(output_folder) 40 | return True 41 | 42 | 43 | def is_json_serializable(x: Any) -> bool: 44 | """ 45 | Check if the object is serializable. 46 | 47 | Parameters 48 | ---------- 49 | x : Any 50 | Object to validate 51 | 52 | Returns 53 | ------- 54 | bool 55 | True if success 56 | """ 57 | try: 58 | json.dumps(x) 59 | return True 60 | except Exception: 61 | return False 62 | 63 | 64 | 65 | # TODO: remove 66 | # def load_envs(current_file: Path): 67 | # """ 68 | # Helper function for local development 69 | 70 | # Parameters 71 | # ---------- 72 | # current_file : Path 73 | # Paht of the current file 74 | # """ 75 | # from dotenv import load_dotenv 76 | 77 | # root_folder = "analytics" 78 | # found_env = False 79 | # base_env = current_file.parent.absolute() 80 | 81 | # while not found_env: 82 | # matches = [f for f in base_env.glob("*.env")] 83 | # # print(matches) 84 | # if matches: 85 | # env_file = [f for f in base_env.glob("*.env")][0] 86 | # print(env_file) 87 | # load_dotenv(env_file, override=True, verbose=True) 88 | # found_env = True 89 | # print("Environment file found") 90 | # elif base_env.name == root_folder: 91 | # break 92 | # else: 93 | # base_env = base_env.parent 94 | 95 | 96 | -------------------------------------------------------------------------------- /src/modules/dbkcore/requirements.txt: -------------------------------------------------------------------------------- 1 | opencensus-ext-azure>=1.0.2 2 | typeguard==2.7.1 -------------------------------------------------------------------------------- /src/modules/dbkdev/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/dbkdev/__init__.py -------------------------------------------------------------------------------- /src/modules/dbkdev/core.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame, SparkSession 2 | from dbkcore.core import trace, Log 3 | import re 4 | from pathlib import Path 5 | from typing import Any, List 6 | from typeguard import typechecked 7 | from dbkcore.core import Singleton 8 | from enum import Enum 9 | import 
pkg_resources 10 | 11 | 12 | 13 | 14 | 15 | class IdeEnvironment(str, Enum): 16 | LOCAL = "local" 17 | DATABRICKS = "databricks" 18 | 19 | 20 | @typechecked 21 | class DevelopmentClient(metaclass=Singleton): 22 | """ 23 | Client to use for local Databricks' local development 24 | """ 25 | 26 | # @trace 27 | def __init__( 28 | self, 29 | dbutils, 30 | spark: SparkSession, 31 | ide_environment: IdeEnvironment 32 | # deployment_environment: DeploymentEnvironment 33 | ): 34 | """ 35 | Instantiates this object 36 | 37 | Parameters 38 | ---------- 39 | dbutils : Dbutils 40 | The Dbutils instance to use 41 | spark : SparkSession 42 | The SparkSession to use 43 | ide_environment : IdeEnvironment 44 | The environment used 45 | deployment_environment : DeploymentEnvironment 46 | The deployment environment 47 | """ 48 | self.__spark = spark 49 | self.__dbutils = dbutils 50 | self.__ide_environment = ide_environment 51 | 52 | @property 53 | def spark(self) -> SparkSession: 54 | return self.__spark 55 | 56 | @property 57 | def dbutils(self) -> Any: 58 | return self.__dbutils 59 | 60 | @property 61 | def __storage_account_name(self) -> str: 62 | res = self.dbutils.secrets.get( 63 | scope="storage", 64 | key="name" 65 | ) 66 | return res 67 | 68 | @property 69 | def __blob_container_name(self) -> str: 70 | res = self.dbutils.secrets.get( 71 | scope="storage", 72 | key="container" 73 | ) 74 | return res 75 | 76 | @property 77 | def __storage_account_access_key(self) -> str: 78 | res = self.dbutils.secrets.get( 79 | scope="storage", 80 | key="key" 81 | ) 82 | return res 83 | 84 | @property 85 | def ide_environment(self) -> IdeEnvironment: 86 | return self.__ide_environment 87 | 88 | @trace(attrs_refact=['dbkea_token']) 89 | def set_dbkea(self, dbkea_token: str): 90 | """ 91 | To use when the environment is LOCAL for using the dbutils secrets 92 | 93 | Parameters 94 | ---------- 95 | dbkea_token : str 96 | The token 97 | """ 98 | self.__dbkea_token = dbkea_token 99 | self.dbutils.secrets.setToken(dbkea_token) 100 | 101 | @property 102 | def mount_name(self) -> str: 103 | """ 104 | Standard name of the root mount for the configured storage account and container 105 | 106 | Returns 107 | ------- 108 | str 109 | [description] 110 | """ 111 | res = "{}_{}".format(self.__storage_account_name, self.__blob_container_name) 112 | return res 113 | 114 | @property 115 | def mount_path(self) -> str: 116 | """ 117 | Standard mount path 118 | 119 | Returns 120 | ------- 121 | str 122 | The path 123 | """ 124 | res = 'dbfs:/mnt/{}/'.format(self.mount_name) 125 | return res 126 | 127 | @trace 128 | def read_csv(self, file_path: str) -> DataFrame: 129 | # blob_path = self.__blob_path(file_path) 130 | blob_df = self.spark.read.format("csv").\ 131 | option("inferSchema", "true").\ 132 | option("header", "true").\ 133 | option("delimiter", ",").\ 134 | option("charset", "utf-8").load(file_path) 135 | return blob_df 136 | 137 | @trace 138 | def read_parquet(self, file_path: str) -> DataFrame: 139 | return self.spark.read.parquet(file_path) 140 | 141 | @trace 142 | def save_temp_table( 143 | self, 144 | dataframe: DataFrame, 145 | # schema: str, 146 | table_name: str, 147 | cache=True 148 | ): 149 | # TODO: Documentation 150 | 151 | # self.create_schema(schema) 152 | # dbk_table_name = f"{schema}_{table_name}" 153 | Log.get_instance().log_info(f"Creating temp table: {table_name}") 154 | if cache: 155 | dataframe.cache().createOrReplaceGlobalTempView(table_name) 156 | else: 157 | 
dataframe.createOrReplaceGlobalTempView(table_name) 158 | 159 | @trace 160 | def load_temp_table( 161 | self, 162 | # schema: str, 163 | table_name: str 164 | ) -> DataFrame: 165 | # TODO: Documentation 166 | 167 | Log.get_instance().log_info(f"Loading temp table: {table_name}") 168 | # self.create_schema(schema) 169 | # dbk_table_name = f"{schema}.{table_name}" 170 | global_temp_db = self.spark.conf.get("spark.sql.globalTempDatabase") 171 | # dt = self.spark.conf.get(f"spark.sql.{dbk_table_name}") 172 | dt = self.spark.read.table(f"{global_temp_db}.{table_name}") 173 | return dt 174 | 175 | @trace 176 | def save_delta_table( 177 | self, 178 | dataframe: DataFrame, 179 | schema: str, 180 | table_name: str, 181 | output_path: Path, 182 | partition_columns: List[str] = None, 183 | mode: str = 'overwrite', 184 | overwrite_schema: bool = False 185 | ): 186 | """ 187 | Saves the dataframe as a delta table in an external location 188 | Parameters 189 | ---------- 190 | dataframe : DataFrame 191 | The dataframe 192 | schema : str 193 | Destination schema 194 | table_name : str 195 | Destination schema 196 | output_path : Path 197 | Folder where to save the dataframe 198 | partition_columns: List[str] 199 | Columns to use for partitioning, default is None 200 | mode: str 201 | e.g. append, overwrite, passed to dataframe.write.saveAsTable 202 | """ 203 | self.create_schema(schema) 204 | dbk_table_name = f"{schema}.{table_name}" 205 | # mnt_path = str(Path('mnt', mount_name, root_path, schema, table_name)) 206 | # path = f"dbfs:/{mnt_path}" 207 | path = output_path.joinpath(dbk_table_name.replace('.', '_')) 208 | self._save_table( 209 | dataframe=dataframe, 210 | table_name=dbk_table_name, 211 | path=str(path), 212 | format='delta', 213 | mode=mode, 214 | partition_columns=partition_columns, 215 | overwrite_schema=overwrite_schema 216 | ) 217 | 218 | @trace 219 | def _save_table( 220 | self, 221 | dataframe: DataFrame, 222 | table_name: str, 223 | path: str, 224 | format: str, 225 | mode: str, 226 | partition_columns: List[str] = None, 227 | overwrite_schema=False 228 | ): 229 | """ 230 | Saves the given dataframe into a delta table 231 | 232 | Parameters 233 | ---------- 234 | dataframe : DataFrame 235 | Dataframe to save 236 | schema : str 237 | Schema into save 238 | table_name : str 239 | Name of the table 240 | """ 241 | # TODO: Update documentation 242 | if table_name is None or "": 243 | raise Exception("Table name missing") 244 | if not path: 245 | raise Exception("Path missing") 246 | # Create hive table from a dataframe (for the final ETL process) 247 | # self.spark.sql("DROP TABLE IF EXISTS {}".format(table_name)) 248 | dataframe.write.saveAsTable(table_name, mode=mode, format=format, path=path, partitionBy=partition_columns, overwriteSchema=overwrite_schema) 249 | 250 | @trace 251 | def table_exists( 252 | self, 253 | schema_name: str, 254 | table_name: str 255 | ): 256 | return table_name in [t.name for t in self.spark.catalog.listTables(schema_name)] 257 | 258 | @trace 259 | def list_tables(self, schema: str) -> List[str]: 260 | """ 261 | List the tables in the given schema 262 | 263 | Parameters 264 | ---------- 265 | schema : str 266 | The Databricks schema 267 | 268 | Returns 269 | ------- 270 | List[str] 271 | List of tables 272 | """ 273 | df = self.spark.sql("show tables in {}".format(schema)).toPandas() 274 | table_name = df["tableName"] # ! 
Could be wrong 275 | return list(table_name) 276 | 277 | @trace 278 | def list_databases(self) -> List[str]: 279 | """ 280 | Gets the list of Databricks databases (a.k.a. schemas) 281 | 282 | Returns 283 | ------- 284 | List[str] 285 | List of schemas 286 | """ 287 | df = self.spark.sql("show schemas").toPandas() 288 | databases = df["databaseName"].tolist() 289 | return databases 290 | 291 | @trace 292 | def create_schema(self, schema_databricks: str): 293 | """ 294 | Creates a schema in Databricks 295 | 296 | Parameters 297 | ---------- 298 | schema_databricks : str 299 | Name of the schema 300 | """ 301 | self.spark.sql("CREATE SCHEMA IF NOT EXISTS {}".format(schema_databricks)) 302 | 303 | @trace 304 | def mount_exists(self, mount_name: str) -> bool: 305 | mounts = self.list_mounts() 306 | names = [m.name.replace('/', '') for m in mounts] 307 | res = True if mount_name in names else False 308 | return res 309 | 310 | @trace 311 | def list_mounts(self) -> list: 312 | mounts = self.files('/mnt') 313 | return mounts 314 | 315 | @trace 316 | def files(self, path: str) -> list: 317 | files = self.dbutils.fs.ls(path) 318 | return files 319 | 320 | @classmethod 321 | def get_instance(cls): 322 | return DevelopmentClient() 323 | 324 | 325 | class DevelopmentEngine(metaclass=Singleton): 326 | 327 | def __init__(self): 328 | self.spark = self.__get_spark() 329 | dbutils, ide_environment = self.__get_dbutils(self.spark) 330 | self.dbutils = dbutils 331 | self.ide_environment = ide_environment 332 | 333 | def __get_spark(self) -> SparkSession: 334 | MAX_MEMORY = "5g" 335 | spark = SparkSession.builder.\ 336 | config("spark.executor.memory", MAX_MEMORY).\ 337 | config("spark.driver.memory", MAX_MEMORY).\ 338 | config("spark.driver.maxResultSize", MAX_MEMORY).\ 339 | getOrCreate() 340 | return spark 341 | # import IPython 342 | # user_ns = IPython.get_ipython().user_ns 343 | # if "spark" in user_ns: 344 | # return user_ns["spark"] 345 | # else: 346 | # from pyspark.sql import SparkSession 347 | # user_ns["spark"] = SparkSession.builder.getOrCreate() 348 | # return user_ns["spark"] 349 | 350 | def __get_dbutils(self, spark: SparkSession): 351 | try: 352 | from pyspark.dbutils import DBUtils 353 | dbutils = DBUtils(spark) 354 | env = IdeEnvironment.LOCAL if "databricks-connect" in [i.key for i in pkg_resources.working_set] else IdeEnvironment.DATABRICKS 355 | except ImportError: 356 | import IPython 357 | dbutils = IPython.get_ipython().user_ns["dbutils"] 358 | env = IdeEnvironment.DATABRICKS 359 | return (dbutils, env) 360 | 361 | @classmethod 362 | def get_instance(cls): 363 | return DevelopmentEngine() 364 | -------------------------------------------------------------------------------- /src/modules/dbkdev/data_steps.py: -------------------------------------------------------------------------------- 1 | from dbkcore.core import BaseObject, trace, Log 2 | from pyspark.sql.dataframe import DataFrame as PyDataFrame 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.utils import AnalysisException 6 | from enum import Enum 7 | # from abc import ABC, abstractmethod 8 | from abc import abstractmethod 9 | import pandas as pd 10 | from pandas import DataFrame as PdDataFrame 11 | from pandas.api.types import is_numeric_dtype 12 | from typing import Union, List 13 | from typeguard import typechecked 14 | from pathlib import Path 15 | import functools as _functools 16 | 17 | 18 | 19 | 20 | class DataDirection(str, Enum): 21 | IN = "in" 22 | OUT = "out" 23 | 24 
| 25 | class DataStepDataframe(BaseObject): 26 | 27 | # TODO: Change name to name_or_path 28 | 29 | def __init__(self, name: str, dataframe: Union[PyDataFrame, PdDataFrame], cache=False): 30 | self.name = name 31 | self.dataframe = dataframe 32 | # Note: it has been removed for spark since the dataframe has to be read 33 | self.__rows = None 34 | self.columns_count = len(dataframe.columns) if isinstance(dataframe, PyDataFrame) else dataframe.shape[1] 35 | self.columns_names = dataframe.columns 36 | self.cache = cache 37 | self.__columns_negative = None 38 | self.__columns_null = None 39 | 40 | def to_pandas(self) -> PdDataFrame: 41 | if self.is_pandas: 42 | return self.dataframe 43 | elif self.is_pyspark: 44 | return self.dataframe.toPandas() 45 | 46 | @property 47 | def rows(self): 48 | if not self.__rows: 49 | # self.__rows = self.dataframe.cache().count() if self.cache else dataframe.count() 50 | # TODO: improve me 51 | if self.cache: 52 | self.__rows = self.dataframe.cache().count() if isinstance(self.dataframe, PyDataFrame) else self.dataframe.shape[0] 53 | else: 54 | self.__rows = self.dataframe.count() if isinstance(self.dataframe, PyDataFrame) else self.dataframe.shape[0] 55 | return self.__rows 56 | 57 | @trace 58 | def columns_negative(self) -> List[str]: 59 | """ 60 | Identifies the columns with negative values 61 | 62 | Returns 63 | ------- 64 | List[str] 65 | Column names 66 | """ 67 | columns = [] 68 | 69 | if not self.__columns_negative: 70 | if isinstance(self.dataframe, PyDataFrame): 71 | for column in self.columns_names: 72 | count = 0 73 | try: 74 | count = self.dataframe.filter((F.col(column) < 0)).count() 75 | except AnalysisException: 76 | pass 77 | if count > 0: 78 | columns.append(column) 79 | elif isinstance(self.dataframe, pd.DataFrame): 80 | for column in self.columns_names: 81 | if is_numeric_dtype(self.dataframe[column]): 82 | dt_filtered = self.dataframe[self.dataframe[column] < 0] 83 | count = dt_filtered.shape[0] 84 | if count > 0: 85 | columns.append(column) 86 | self.__columns_negative = columns 87 | return self.__columns_negative 88 | 89 | @trace 90 | def columns_null(self) -> List[str]: 91 | """ 92 | Identifies the columns with null values 93 | 94 | Returns 95 | ------- 96 | List[str] 97 | Column names 98 | """ 99 | if not self.__columns_null: 100 | columns = [] 101 | if isinstance(self.dataframe, PyDataFrame): 102 | for column in self.columns_names: 103 | count = self.dataframe.filter(F.col(column).isNull()).count() 104 | if count > 0: 105 | columns.append(column) 106 | elif isinstance(self.dataframe, pd.DataFrame): 107 | nan_cols = self.dataframe.columns[self.dataframe.isna().any()].tolist() 108 | columns.extend(nan_cols) 109 | self.__columns_null = columns 110 | return self.__columns_null 111 | 112 | @property 113 | def is_pandas(self) -> bool: 114 | return isinstance(self.dataframe, PdDataFrame) 115 | 116 | @property 117 | def is_pyspark(self) -> bool: 118 | return isinstance(self.dataframe, PyDataFrame) 119 | 120 | def log_in(self): 121 | self.log(direction=DataDirection.IN) 122 | 123 | def log_out(self): 124 | self.log(direction=DataDirection.OUT) 125 | 126 | def log(self, direction: DataDirection): 127 | dt_name = self.name 128 | 129 | # dt_tag_prefix = "DT:{}".format(direction.upper(), dt_name) 130 | # dt_name_tag = "{}:NAME".format(dt_tag_prefix) 131 | 132 | # dt_rows_tag = "{}:ROWS:COUNT".format(dt_tag_prefix) 133 | # if isinstance(self.dataframe, PyDataFrame): 134 | # dt_rows = self.dataframe.count() 135 | # elif isinstance(self.dataframe, 
PdDataFrame): 136 | # dt_rows = self.dataframe.shape[0] 137 | 138 | # dt_columns_tag = "{}:COLUMNS:COUNT".format(dt_tag_prefix) 139 | # if isinstance(self.dataframe, PyDataFrame): 140 | # dt_columns = len(self.dataframe.columns) 141 | # elif isinstance(self.dataframe, PdDataFrame): 142 | # dt_columns = self.dataframe.shape[1] 143 | 144 | # dt_columns_names_tag = "{}:COLUMNS:NAMES".format(dt_tag_prefix) 145 | # dt_columns_names = ', '.join(self.dataframe.columns) 146 | 147 | # Log.get_instance().log_info(f"{dt_name_tag}:{dt_name}", prefix=direction, custom_dimension=dimensions) 148 | # Log.get_instance().log_info(f"{dt_rows_tag}:{dt_rows}", prefix=direction) 149 | # Log.get_instance().log_info(f"{dt_columns_tag}:{dt_columns}", prefix=direction) 150 | # Log.get_instance().log_info(f"{dt_columns_names_tag}:{dt_columns_names}", prefix=direction) 151 | 152 | dimensions = { 153 | 'dataset_name': dt_name, 154 | 'rows_count': self.rows, 155 | 'columns_count': self.columns_count, 156 | 'columns_name': self.columns_names, 157 | 'direction': direction 158 | } 159 | Log.get_instance().log_info(f"Processed dataset {dt_name} with {direction.upper()} direction", custom_dimension=dimensions) 160 | 161 | 162 | def apply_test(func): 163 | """ 164 | Execute test function after the initialize. 165 | 166 | Notes 167 | ----- 168 | [Example](https://stackoverflow.com/a/15196410) 169 | """ 170 | @_functools.wraps(func) 171 | def wrapper(self, *args, **kwargs): 172 | res = func(self, *args, **kwargs) 173 | self.tests() 174 | return res 175 | return wrapper 176 | 177 | 178 | def pre_apply_test(func): 179 | """ 180 | Execute test function before the initialize. 181 | 182 | Notes 183 | ----- 184 | [Example](https://stackoverflow.com/a/15196410) 185 | """ 186 | @_functools.wraps(func) 187 | def wrapper(self, *args, **kwargs): 188 | self.tests() 189 | res = func(self, *args, **kwargs) 190 | return res 191 | return wrapper 192 | 193 | 194 | def log_output(func): 195 | """ 196 | Decorator for executing test in sequence 197 | 198 | Notes 199 | ----- 200 | [Example](https://stackoverflow.com/a/15196410) 201 | """ 202 | @_functools.wraps(func) 203 | def wrapper(self, *args, **kwargs): 204 | res = func(self, *args, **kwargs) 205 | self.tests() 206 | return res 207 | return wrapper 208 | 209 | 210 | # TODO: if works, remove ABC class 211 | # class DataStep(ABC): 212 | @typechecked 213 | class DataStep(): 214 | """ 215 | Creates a datastep to be used in a pipeline 216 | 217 | Parameters 218 | ---------- 219 | metaclass : [type], optional 220 | [description], by default abc.ABCMeta 221 | 222 | Raises 223 | ------ 224 | Exception 225 | [description] 226 | """ 227 | 228 | @trace 229 | def __init__( 230 | self, 231 | spark: SparkSession, 232 | run_id: str 233 | ): 234 | self.spark = spark 235 | self.run_id = run_id 236 | 237 | @property 238 | def display_name(self) -> str: 239 | res = type(self).__name__ 240 | return res 241 | 242 | @trace 243 | def spark_read_table(self, name: str) -> DataStepDataframe: 244 | dt = self.spark.read.table(name) 245 | datastep_dataframe = DataStepDataframe(name=name, dataframe=dt) 246 | datastep_dataframe.log(DataDirection.IN) 247 | return datastep_dataframe 248 | 249 | @trace 250 | def spark_read_temp_table(self, name: str) -> DataStepDataframe: 251 | global_temp_db = self.spark.conf.get(f"spark.sql.globalTempDatabase") 252 | dt = self.spark.read.table(f'{global_temp_db}.{name}') 253 | datastep_dataframe = DataStepDataframe(name=name, dataframe=dt) 254 | datastep_dataframe.log_in() 255 | return 
datastep_dataframe 256 | 257 | @trace 258 | def spark_read_parquet_path(self, path: Path, cache=False) -> DataStepDataframe: 259 | path_str = str(path) 260 | dt = self.spark.read.parquet(path_str) 261 | datastep_dataframe = DataStepDataframe(name=path_str, dataframe=dt, cache=cache) 262 | datastep_dataframe.log_in() 263 | return datastep_dataframe 264 | 265 | @trace 266 | def pandas_read_csv(self, path: Path) -> DataStepDataframe: 267 | datastep_dataframe = self.spark_read_csv(path) 268 | datastep_dataframe.dataframe = datastep_dataframe.dataframe.toPandas() 269 | datastep_dataframe.log_in() 270 | return datastep_dataframe 271 | 272 | @trace 273 | def spark_read_csv(self, path: Path) -> DataStepDataframe: 274 | path_str = str(path) 275 | dt = self.spark.read.format("csv").\ 276 | option("inferSchema", "true").\ 277 | option("header", "true").\ 278 | option("delimiter", ",").\ 279 | option("charset", "utf-8").load(path_str) 280 | datastep_dataframe = DataStepDataframe(name=path_str, dataframe=dt) 281 | datastep_dataframe.log_in() 282 | return datastep_dataframe 283 | 284 | @trace 285 | def test_rows_leq(self, dt_1: DataStepDataframe, dt_2: DataStepDataframe): 286 | assert dt_1.rows < dt_2.rows,\ 287 | "ROWS CHECK: {dt_1_name} ({dt_1_rows}) < {dt_2_name} ({dt_2_rows})".format( 288 | dt_1_name=dt_1.name, 289 | dt_1_rows=dt_1.rows, 290 | dt_2_name=dt_2.name, 291 | dt_2_rows=dt_2.rows) 292 | print("Asserted") 293 | 294 | @trace 295 | def test_rows_eq(self, dt_1: DataStepDataframe, dt_2: DataStepDataframe): 296 | assert dt_1.rows == dt_2.rows,\ 297 | "ROWS CHECK: {dt_1_name} ({dt_1_rows}) == {dt_2_name} ({dt_2_rows})".format( 298 | dt_1_name=dt_1.name, 299 | dt_1_rows=dt_1.rows, 300 | dt_2_name=dt_2.name, 301 | dt_2_rows=dt_2.rows) 302 | print("Asserted") 303 | 304 | @trace 305 | def test_rows_geq(self, dt_1: DataStepDataframe, dt_2: DataStepDataframe): 306 | assert dt_1.rows >= dt_2.rows,\ 307 | "ROWS CHECK: {dt_1_name} ({dt_1_rows}) >= {dt_2_name} ({dt_2_rows})".format( 308 | dt_1_name=dt_1.name, 309 | dt_1_rows=dt_1.rows, 310 | dt_2_name=dt_2.name, 311 | dt_2_rows=dt_2.rows) 312 | print("Asserted") 313 | 314 | @trace 315 | def test_rows_diff(self, dt_1: DataStepDataframe, dt_2: DataStepDataframe): 316 | assert dt_1.rows != dt_2.rows,\ 317 | "ROWS CHECK: {dt_1_name} ({dt_1_rows}) >= {dt_2_name} ({dt_2_rows})".format( 318 | dt_1_name=dt_1.name, 319 | dt_1_rows=dt_1.rows, 320 | dt_2_name=dt_2.name, 321 | dt_2_rows=dt_2.rows) 322 | print("Asserted") 323 | 324 | @trace 325 | def test_negative_values(self, cols: List[str], dt: DataStepDataframe): 326 | for col in cols: 327 | assert col not in dt.columns_negative(), f"Dataset {dt.name} has columns with negative values -> {col}" 328 | 329 | @trace 330 | def test_null_values(self, cols: List[str], dt: DataStepDataframe): 331 | for col in cols: 332 | assert col not in dt.columns_null(), f"Dataset {dt.name} has columns with null values -> {col}" 333 | 334 | @trace 335 | def test_is_dataframe_empty(self, df: PyDataFrame): 336 | count = df.count() 337 | assert count > 0, "the dataframe count is zero" 338 | 339 | @property 340 | def output_data(self) -> DataStepDataframe: 341 | return self.__output_data 342 | 343 | @trace 344 | def check_output(self, **kwargs): 345 | if kwargs: 346 | for key, value in kwargs.items(): 347 | if isinstance(value, (PyDataFrame, PdDataFrame)): 348 | msg = "Output Pandas or PySpark dataframe must be encapsulated into DataStepDataframe" 349 | Log.get_instance().log_error(msg) 350 | raise ValueError(msg) 351 | elif 
isinstance(value, DataStepDataframe): 352 | value.log(direction=DataDirection.OUT) 353 | else: 354 | Log.get_instance().log_info(f'{key}:{value}') 355 | # setattr(self, key, value) 356 | 357 | @trace 358 | def set_output_data(self, dataframe: Union[PyDataFrame, PdDataFrame], name='', cache: bool = False): 359 | name = self.display_name if not name else name 360 | self.__output_data = DataStepDataframe( 361 | name=name, 362 | dataframe=dataframe, 363 | cache=cache) 364 | self.__output_data.log_out() 365 | 366 | @trace 367 | @abstractmethod 368 | def initialize(self): 369 | """ 370 | Define the DataStep logic. 371 | """ 372 | pass 373 | 374 | @trace 375 | @abstractmethod 376 | def tests(self): 377 | """ 378 | Define all the the tests that this step must pass 379 | """ 380 | pass 381 | -------------------------------------------------------------------------------- /src/modules/dbkdev/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark 2 | pandas>=0.24.2 -------------------------------------------------------------------------------- /src/modules/dbkenv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/dbkenv/__init__.py -------------------------------------------------------------------------------- /src/modules/dbkenv/core.py: -------------------------------------------------------------------------------- 1 | import databricks_cli.sdk as _dbkcli 2 | import databricks_cli.sdk.service as _dss 3 | from dbkcore.core import trace 4 | from dbkcore.core import Log 5 | # from dbkcore.core import Log 6 | import os as _os 7 | import base64 as _base64 8 | import tempfile as _tempfile 9 | import time as _time 10 | import enum as _enum 11 | import typing as _typing 12 | from dotenv import load_dotenv 13 | 14 | 15 | 16 | 17 | 18 | class Configuration(): 19 | """Retrieve the keys used from the package from the local environment.""" 20 | 21 | def __init__(self, file_load: bool = False): 22 | """ 23 | Initialize the configuration class. 24 | 25 | Parameters 26 | ---------- 27 | file_load : bool, optional 28 | Search .env file and loads it, by default False 29 | """ 30 | 31 | if file_load: 32 | load_dotenv(override=True) 33 | # self._APPINSIGHT_CONNECTIONSTRING = _os.getenv("APPI_IK") 34 | # self.DATABRICKS_HOST = _os.getenv('DATABRICKS_HOST') 35 | # self.DATABRICKS_TOKEN = _os.getenv('DATABRICKS_TOKEN') 36 | # self.DATABRICKS_ORDGID = _os.getenv('DATABRICKS_ORDGID') 37 | # self.AZURE_SUBSCRIPTIONID = _os.getenv('AZURE_SUBSCRIPTIONID') 38 | # self.RESOURCEGROUP_NAME = _os.getenv('RESOURCEGROUP_NAME') 39 | # self.RESOURCEGROUP_REGION = _os.getenv('RESOURCEGROUP_REGION') 40 | 41 | @property 42 | def APPINSIGHT_CONNECTIONSTRING(self) -> str: 43 | """ 44 | Application insight connection string. 45 | 46 | Returns 47 | ------- 48 | str 49 | The connection string 50 | """ 51 | res = _os.environ["APPI_IK"] 52 | return res 53 | 54 | @property 55 | def DATABRICKS_HOST(self) -> str: 56 | """ 57 | Databricks host url. 58 | 59 | Returns 60 | ------- 61 | str 62 | The host url 63 | """ 64 | res = _os.environ["DATABRICKS_HOST"] 65 | return res 66 | 67 | @property 68 | def DATABRICKS_TOKEN(self) -> str: 69 | """ 70 | Databricks personal roken. 
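The value is read from the `DATABRICKS_TOKEN` environment variable, e.g. exported in the shell or kept in a `.env` file that `Configuration(file_load=True)` loads; an illustrative entry would be `DATABRICKS_TOKEN=<personal-access-token>`.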
71 | 72 | Returns 73 | ------- 74 | str 75 | The token 76 | """ 77 | res = _os.environ["DATABRICKS_TOKEN"] 78 | return res 79 | 80 | @property 81 | def DATABRICKS_ORDGID(self) -> str: 82 | """ 83 | Databricks organization id. 84 | 85 | Returns 86 | ------- 87 | str 88 | The id 89 | """ 90 | res = _os.environ["DATABRICKS_ORDGID"] 91 | return res 92 | 93 | 94 | class DatabricksEnvironment(str, _enum.Enum): 95 | """ 96 | Describes the type of environment used 97 | """ 98 | LOCAL = 'local' 99 | DATABRICKS = 'databricks' 100 | 101 | 102 | class Package(): 103 | """ 104 | Rapresents the python package installed in databricks 105 | 106 | Reference 107 | --------- 108 | 109 | Documentation [link](https://docs.databricks.com/dev-tools/api/latest/libraries.html#example-response) 110 | """ 111 | 112 | @trace 113 | def __init__(self, origin: str, package: str, repo: str): 114 | """ 115 | Creates this object 116 | 117 | Parameters 118 | ---------- 119 | origin : str 120 | Origin of the package 121 | package : str 122 | Name and version of the package 123 | repo : str 124 | The repository of the package 125 | """ 126 | super().__init__() 127 | self.origin = origin 128 | self.package = package 129 | self.repo = repo 130 | 131 | @trace 132 | def to_api_json(self): 133 | return { 134 | self.origin: { 135 | 'package': self.package, 136 | 'repo': self.repo 137 | } 138 | } 139 | 140 | 141 | class ResourceClient(): 142 | """ 143 | Client used to interact with a Databricks cluster 144 | """ 145 | 146 | @trace(attrs_refact=['personal_token']) 147 | def __init__(self, host: str, personal_token: str): 148 | """ 149 | Instantiates this object 150 | 151 | Parameters 152 | ---------- 153 | host : str 154 | Host of the cluster 155 | personal_token : str 156 | Databricks personal token 157 | """ 158 | super().__init__() 159 | self.host = host 160 | self.personal_token = personal_token 161 | self.__api_client = None 162 | 163 | @property 164 | def apiClient(self) -> _dbkcli.ApiClient: 165 | """ 166 | Creates the Databricks API client 167 | 168 | Returns 169 | ------- 170 | ApiClient 171 | The client 172 | """ 173 | if not self.__api_client: 174 | self.__api_client = _dbkcli.ApiClient( 175 | host=self.host, 176 | token=self.personal_token 177 | ) 178 | return self.__api_client 179 | 180 | 181 | class Cluster(): 182 | """ 183 | Manages a Databricks cluster 184 | """ 185 | 186 | @trace 187 | def __init__( 188 | self, 189 | client: ResourceClient, 190 | cluster_name: str, 191 | cluster_configuration: dict 192 | ): 193 | """ 194 | Instantiates this object 195 | 196 | Parameters 197 | ---------- 198 | client : ResourceClient 199 | A ResourceClient object 200 | cluster_name : str 201 | Name of the cluster 202 | cluster_configuration : dict 203 | Dictionary that contains the cluster configuration 204 | appinsight_instrumentation_key : str 205 | Application Insights' instrumentation key 206 | """ 207 | self.client = client 208 | self.__cluster_name = cluster_name 209 | self.cluster_configuration = cluster_configuration 210 | # self.appinsight_instrumentation_key = appinsight_instrumentation_key 211 | self.__cluster_service = None 212 | 213 | @property 214 | def cluster_service(self) -> _dss.ClusterService: 215 | if not self.__cluster_service: 216 | self.__cluster_service = _dss.ClusterService(self.client.apiClient) 217 | return self.__cluster_service 218 | 219 | @trace 220 | def create_cluster_and_wait(self, redeploy=False) -> bool: 221 | """ 222 | Creates the cluster and waits for its done 223 | 224 | Parameters 225 | 
---------- 226 | redeploy: bool 227 | Redeploy the cluster 228 | """ 229 | if self.cluster_configuration: 230 | deploy = False 231 | if self.cluster_exists(): 232 | if redeploy: 233 | print("Cluster {} exists. Dropping and recreating".format(self.cluster_name)) 234 | self.delete_cluster() 235 | deploy = True 236 | else: 237 | deploy = False 238 | else: 239 | deploy = True 240 | 241 | if deploy: 242 | spark_env_vars = self.cluster_configuration["spark_env_vars"] 243 | # spark_env_vars["APPINSIGHT_CONNECTIONSTRING"] = self.appinsight_instrumentation_key 244 | spark_env_vars["EXECUTER"] = f"DATABRICKS_{self.cluster_name}" 245 | self.cluster_service.create_cluster( 246 | num_workers=self.cluster_configuration.get("num_workers"), 247 | autoscale=self.cluster_configuration.get("autoscale"), 248 | cluster_name=self.cluster_configuration.get("cluster_name"), 249 | spark_version=self.cluster_configuration.get("spark_version"), 250 | spark_conf=self.cluster_configuration.get("spark_conf"), 251 | node_type_id=self.cluster_configuration.get("node_type_id"), 252 | driver_node_type_id=self.cluster_configuration.get("driver_node_type_id"), 253 | spark_env_vars=spark_env_vars, 254 | autotermination_minutes=self.cluster_configuration.get("autotermination_minutes"), 255 | enable_elastic_disk=self.cluster_configuration.get("enable_elastic_disk") 256 | ) 257 | 258 | searched_times = 0 259 | while not self.cluster_exists(): 260 | searched_times += 1 261 | _time.sleep(10) 262 | if searched_times > 10: 263 | raise Exception("Cluster failed to deploy") 264 | self.start_cluster_and_wait() 265 | else: 266 | print("Can't deploy since cluster configuration is missing") 267 | return True 268 | 269 | @trace 270 | def databricks_list_clusters(self) -> _typing.List[str]: 271 | """ 272 | List clusters in Databricks 273 | 274 | Returns 275 | ------- 276 | List[str] 277 | List of clusters 278 | """ 279 | return self.cluster_service.list_clusters() 280 | 281 | @property 282 | def cluster_name(self) -> str: 283 | if self.cluster_configuration: 284 | return self.cluster_configuration.get("cluster_name") 285 | else: 286 | return self.__cluster_name 287 | 288 | @trace 289 | def cluster_started(self) -> bool: 290 | """ 291 | Checks if the cluster is started 292 | 293 | Returns 294 | ------- 295 | bool 296 | True if started 297 | """ 298 | started = False 299 | if self.cluster_state() == "RUNNING": 300 | started = True 301 | return started 302 | 303 | @trace 304 | def cluster_exists(self) -> bool: 305 | """ 306 | Checks is the cluster exists 307 | 308 | Returns 309 | ------- 310 | bool 311 | True if exists 312 | """ 313 | exists = False 314 | if self.cluster_id: 315 | exists = True 316 | return exists 317 | 318 | @trace 319 | def install_package(self, packages: _typing.List[Package]) -> str: 320 | """ 321 | Installs the given packages 322 | 323 | Parameters 324 | ---------- 325 | packages : List[Package] 326 | The packages to install 327 | 328 | Returns 329 | ------- 330 | str 331 | Result from Databricks API call 332 | """ 333 | mls = _dss.ManagedLibraryService(self.client.apiClient) 334 | pkgs = [p.to_api_json() for p in packages] 335 | res = mls.install_libraries(self.cluster_id, pkgs) 336 | return res 337 | 338 | @property 339 | def cluster_id(self) -> str: 340 | """ 341 | Retrieves cluster's id 342 | 343 | Returns 344 | ------- 345 | str 346 | Id of the cluster 347 | """ 348 | cs = self.cluster_service 349 | cluster_list = cs.list_clusters() 350 | id = None 351 | if cluster_list: 352 | matches = [c['cluster_id'] for c in 
cluster_list["clusters"] if c['cluster_name'] == self.cluster_name] 353 | if matches: 354 | id = matches[0] 355 | self.__cluster_id = id 356 | return self.__cluster_id 357 | 358 | @trace 359 | def delete_cluster_and_wait(self) -> bool: 360 | """ 361 | Deletes the cluster and waits for completion 362 | 363 | """ 364 | cs = self.cluster_service 365 | id = self.cluster_id 366 | 367 | is_deleted = False if id else True 368 | 369 | if not is_deleted: 370 | cs.permanent_delete_cluster(id) 371 | 372 | requests = 0 373 | seconds_interval = 20 374 | timeout_requests = 20 375 | 376 | is_deleted = True 377 | while self.cluster_exists(): 378 | requests += 1 379 | _time.sleep(seconds_interval) 380 | message = "Waiting from {} seconds. Timeout at {}".format( 381 | seconds_interval * requests, 382 | timeout_requests * seconds_interval 383 | ) 384 | 385 | Log.get_instance().log_info(message=message) 386 | 387 | if requests > 20: 388 | is_deleted = False 389 | return is_deleted 390 | 391 | @trace 392 | def cluster_state(self) -> str: 393 | """ 394 | Checks cluster state 395 | 396 | Returns 397 | ------- 398 | str 399 | State of the cluster 400 | """ 401 | cs = self.cluster_service 402 | cluster_state = cs.get_cluster(self.cluster_id)["state"] 403 | return cluster_state 404 | 405 | @trace 406 | def start_cluster_and_wait(self) -> bool: 407 | """ 408 | Starts the cluster and wait for it completion 409 | 410 | Returns 411 | ------- 412 | bool 413 | True if started 414 | """ 415 | cluster_id = self.cluster_id 416 | cs = self.cluster_service 417 | 418 | if self.cluster_state() == "RUNNING": 419 | return "Already running" 420 | elif self.cluster_state() == "PENDING": 421 | _time.sleep(20) 422 | # Waiting cluster to start 423 | requests = 0 424 | seconds_interval = 20 425 | timeout_requests = 20 426 | while self.cluster_state() == "PENDING": 427 | requests += 1 428 | _time.sleep(seconds_interval) 429 | message = "Waiting from {} seconds. Timeout at {}".format( 430 | seconds_interval * requests, 431 | timeout_requests * seconds_interval 432 | ) 433 | Log.get_instance().log_info(message=message) 434 | if requests > 20: 435 | raise Exception("Cluster not started") 436 | return True 437 | elif self.cluster_state() in ["TERMINATED", "TERMINATING"]: 438 | cs.start_cluster(cluster_id) 439 | _time.sleep(20) 440 | # Waiting cluster to start 441 | requests = 0 442 | seconds_interval = 20 443 | timeout_requests = 20 444 | while self.cluster_state() == "PENDING": 445 | requests += 1 446 | _time.sleep(seconds_interval) 447 | message = "Waiting from {} seconds. 
Timeout at {}".format( 448 | seconds_interval * requests, 449 | timeout_requests * seconds_interval 450 | ) 451 | Log.get_instance().log_info(message=message) 452 | if requests > 20: 453 | raise Exception("Cluster not started") 454 | return True 455 | else: 456 | raise Exception("Unmanaged state: {}".format(self.cluster_state())) 457 | 458 | 459 | class Jobs(): 460 | """ 461 | Rapresents a Databricks Job 462 | """ 463 | @trace 464 | def __init__( 465 | self, 466 | client: ResourceClient 467 | ): 468 | """ 469 | Instantiates this object 470 | 471 | Parameters 472 | ---------- 473 | client : ResourceClient 474 | A client 475 | """ 476 | self.client = client 477 | self.__jobs_service = _dss.JobsService(self.client.apiClient) 478 | 479 | @trace 480 | def run_notebook_and_wait( 481 | self, 482 | destination_path: str, 483 | cluster_id: str, 484 | delete_run=False, 485 | ) -> str: 486 | """ 487 | Run a notebooks and waits for its completion 488 | 489 | Parameters 490 | ---------- 491 | destination_path : str 492 | Notebooks path 493 | cluster_id : str 494 | Cluster's id 495 | delete_run : bool, optional 496 | Deletes the run onces it's completed, by default False 497 | 498 | Returns 499 | ------- 500 | str 501 | Result from Databricks API call 502 | """ 503 | djs = self.__jobs_service 504 | destination_path_dbfs = destination_path 505 | base = _os.path.basename(destination_path) 506 | filename = _os.path.splitext(base)[0] 507 | 508 | job = djs.create_job( 509 | name=filename, 510 | existing_cluster_id=cluster_id, 511 | notebook_task={"notebook_path": destination_path_dbfs} 512 | ) 513 | 514 | job_id = job["job_id"] 515 | run = djs.run_now(job_id=job_id) 516 | run_id = run['run_id'] 517 | run_status = djs.get_run(run_id) 518 | # run_state = run_status['state'] 519 | 520 | while run_status['state']["life_cycle_state"] != "TERMINATED": 521 | _time.sleep(10) 522 | run_status = djs.get_run(run_id) 523 | if run_status['state']["life_cycle_state"] == 'INTERNAL_ERROR': 524 | raise Exception(run_status['state']["life_cycle_state"]) 525 | 526 | output = None 527 | 528 | if run_status['state']['result_state'] == 'SUCCESS': 529 | output = djs.get_run_output(run_id).get('notebook_output').get('result') 530 | elif run_status['state']['result_state'] == 'FAILED': 531 | output = "FAILED" 532 | 533 | if delete_run: 534 | djs.delete_job(job_id) 535 | djs.delete_run(run_id) 536 | return output 537 | 538 | 539 | class Secret(): 540 | """ 541 | Manages Databricks' secrets 542 | """ 543 | 544 | @trace 545 | def __init__( 546 | self, 547 | client: ResourceClient 548 | ): 549 | """ 550 | Instantiates this object. 
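A minimal usage sketch (host and token values are placeholders):

    client = ResourceClient(host='https://adb-123.azuredatabricks.net', personal_token='<token>')
    secrets = Secret(client)
    secrets.add_scope('storage')
    secrets.add_secret(scope='storage', secret_name='name', secret_value='<storage-account-name>')

The scope and key names above mirror the ones read back by `dbkdev.core.DevelopmentClient` ('storage' scope with 'name', 'container' and 'key' secrets).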
551 | 552 | Parameters 553 | ---------- 554 | client : ResourceClient 555 | A ResourceClient object 556 | """ 557 | self.client = client 558 | self.__secret_service = None 559 | 560 | @property 561 | def secret_service(self): 562 | if not self.__secret_service: 563 | self.__secret_service = _dss.SecretService(self.client.apiClient) 564 | return self.__secret_service 565 | 566 | @trace 567 | def delete_scope(self, scope: str) -> str: 568 | """ 569 | Deletes the given scope 570 | 571 | Parameters 572 | ---------- 573 | scope : str 574 | Scope's name 575 | 576 | Returns 577 | ------- 578 | str 579 | Result from Databricks API call 580 | """ 581 | dbs = self.secret_service 582 | res = dbs.delete_scope(scope) 583 | return res 584 | 585 | @trace 586 | def add_scope(self, scope: str) -> str: 587 | """ 588 | Creates the scope 589 | 590 | Parameters 591 | ---------- 592 | scope : str 593 | Scope's name 594 | 595 | Returns 596 | ------- 597 | str 598 | Result from Databricks API call 599 | """ 600 | res = self.secret_service.create_scope( 601 | scope, 602 | initial_manage_principal='users' 603 | ) 604 | return res 605 | 606 | @trace(attrs_refact=['secret_value']) 607 | def add_secret(self, scope: str, secret_name: str, secret_value: str) -> str: 608 | """ 609 | Adds a secret to the given scope. 610 | If a secret already exists with the same name, it will be overwritten. 611 | 612 | Note 613 | ---- 614 | The server encrypts the secret using the secret scope’s encryption settings before storing it. You must have WRITE or MANAGE permission on the secret scope. 615 | 616 | Parameters 617 | ---------- 618 | scope : str 619 | Name of the scope 620 | secret_name : str 621 | Name of the secret 622 | secret_value : str 623 | Value of the secret 624 | 625 | Returns 626 | ------- 627 | str 628 | Result from Databricks API call 629 | """ 630 | dbs = self.secret_service 631 | res = dbs.put_secret( 632 | scope=scope, 633 | key=secret_name, 634 | string_value=secret_value, 635 | bytes_value=None 636 | ) 637 | return res 638 | 639 | def scopes(self) -> _typing.List[str]: 640 | """ 641 | Retrieve list of scopes. 642 | 643 | Returns 644 | ------- 645 | List[str] 646 | List of scopes 647 | """ 648 | dbs = self.secret_service 649 | sc = dbs.list_scopes() 650 | if sc: 651 | scopes = [s["name"] for s in sc["scopes"]] 652 | else: 653 | scopes = [] 654 | return scopes 655 | 656 | @trace(attrs_refact=['secrets']) 657 | def create_scope_secrets(self, scope: str, secrets: dict): 658 | """ 659 | Insert a secret under the provided scope with the given name. 660 | If the scope already exists, it will be dropped and recreated. 661 | If a secret already exists with the same name, it will be overwritten. 662 | 663 | Notes 664 | ----- 665 | The server encrypts the secret using the secret scope’s encryption settings before storing it. You must have WRITE or MANAGE permission on the secret scope. 
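For illustration, the `secrets` argument maps secret names to values, e.g. `{'name': '<account>', 'container': '<container>', 'key': '<access-key>'}` (placeholders only), matching the 'storage' scope consumed by `dbkdev.core.DevelopmentClient`.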
666 | 667 | Parameters 668 | ---------- 669 | scope : str 670 | Name of the scope 671 | secret_names : [str] 672 | Secrets names that must be searched in the Key Vault 673 | """ 674 | 675 | scopes = self.scopes() 676 | dbs = self.secret_service 677 | # The only way to update a secret is to delete it and reload it 678 | if scope in scopes: 679 | dbs.delete_scope(scope) 680 | 681 | self.add_scope(scope, dbs) 682 | for name, value in secrets.items(): 683 | self.add_secret( 684 | scope=scope, 685 | name=name, 686 | value=value 687 | ) 688 | 689 | @trace 690 | def list_scopes_secrets(self) -> _typing.Dict: 691 | """ 692 | Returns all the scopes and their secrets 693 | 694 | Returns 695 | ------- 696 | dict 697 | The secrets {"secret": ...} 698 | """ 699 | dbs = self.secret_service 700 | scopes = dbs.list_scopes()["scopes"] 701 | sc_names = [sc["name"] for sc in scopes] 702 | res = {} 703 | for name in sc_names: 704 | sc_secrets = dbs.list_secrets(name) 705 | secrets = None 706 | if sc_secrets: 707 | secrets = [k["key"] for k in sc_secrets["secrets"]] 708 | res[name] = secrets 709 | return res 710 | 711 | 712 | class Workspace(): 713 | """ 714 | Manages a Databricks Workspace 715 | """ 716 | 717 | def __init__( 718 | self, 719 | client: ResourceClient 720 | ): 721 | """ 722 | Instantiates this object 723 | 724 | Parameters 725 | ---------- 726 | client : ResourceClient 727 | A ResourceClient object 728 | """ 729 | self.client = client 730 | self.__workspace_service = None 731 | 732 | @property 733 | def workspace_service(self) -> _dss.WorkspaceService: 734 | """ 735 | The Databricks Workspace Service 736 | 737 | Returns 738 | ------- 739 | WorkspaceService 740 | Workspace service from Databricks API 741 | """ 742 | if not self.__workspace_service: 743 | self.__workspace_service = _dss.WorkspaceService(self.client.apiClient) 744 | return self.__workspace_service 745 | 746 | @trace 747 | def upload_content( 748 | self, 749 | destination_path: str, 750 | content: str, 751 | format="SOURCE", # TODO: Rename to content_format 752 | language="PYTHON", 753 | overwrite=True 754 | ) -> str: 755 | """ 756 | Uploads content to the workspace 757 | 758 | Parameters 759 | ---------- 760 | destination_path : str 761 | Destination of the file 762 | content : str 763 | File's content as string 764 | format : str, optional 765 | Databricks file format, by default "SOURCE" 766 | overwrite : bool, optional 767 | Overwrite the file if exists, by default True 768 | 769 | Returns 770 | ------- 771 | str 772 | Result from Databricks API call 773 | """ 774 | base64_bytes = _base64.b64encode(content.encode("utf-8")) 775 | base64_string = base64_bytes.decode('utf-8') 776 | dws = self.workspace_service 777 | file_dir = _os.path.dirname(destination_path) 778 | dws.mkdirs(file_dir) 779 | res = dws.import_workspace( 780 | path=destination_path, 781 | content=base64_string, 782 | format=format, 783 | language=language, 784 | overwrite=overwrite 785 | ) 786 | return res 787 | 788 | @trace 789 | def list_content(self, destination_folder: str) -> _typing.List[str]: 790 | """ 791 | Lists the content in the given folder 792 | 793 | Parameters 794 | ---------- 795 | destination_folder : str 796 | Folder to check 797 | 798 | Returns 799 | ------- 800 | List[str] 801 | List of files 802 | """ 803 | dws = self.workspace_service 804 | content = dws.list(destination_folder) 805 | return content 806 | 807 | @trace 808 | def delete_content(self, destination_path: str) -> str: 809 | """ 810 | Deletes the content at the given path 811 | 812 | 
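This is the same call `DatabricksResourceManager.run_python_code_on_notebook` uses to clean up after itself: it uploads a temporary notebook with `upload_content`, runs it, then removes it again with `delete_content(temp_file_path)`.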
Parameters 813 | ---------- 814 | destination_path : str 815 | Path of the file 816 | 817 | Returns 818 | ------- 819 | str 820 | Result from Databricks API call 821 | """ 822 | dws = self.workspace_service 823 | res = dws.delete(path=destination_path) 824 | return res 825 | 826 | @trace 827 | def make_dir(self, path_dir: str) -> str: 828 | """ 829 | Makes a directory in the given folder 830 | 831 | Parameters 832 | ---------- 833 | path_dir : str 834 | Base directory to which add the folder 835 | 836 | Returns 837 | ------- 838 | str 839 | Result from Databricks API call 840 | """ 841 | dws = self.workspace_service 842 | res = dws.mkdirs(path_dir) 843 | return res 844 | 845 | 846 | class DatabricksResourceManager(): 847 | """ 848 | The orchestrator for managing the Databricks resources with ease 849 | """ 850 | 851 | @trace 852 | def __init__( 853 | self, 854 | client: ResourceClient, 855 | cluster_name: str, 856 | cluster_configuration: dict, 857 | log_to_appi: bool = False 858 | ): 859 | """ 860 | Instantiates this object 861 | 862 | Parameters 863 | ---------- 864 | client : ResourceClient 865 | A ResourceClient object 866 | cluster_name : str 867 | Name of the cluster 868 | cluster_configuration : Dict, optional 869 | The configuration of the cluster, by default None 870 | log_to_appi: bool 871 | Log to application insights 872 | """ 873 | self.client = client 874 | 875 | self.cluster = Cluster( 876 | client=client, 877 | cluster_name=cluster_name, 878 | cluster_configuration=cluster_configuration 879 | # appinsight_instrumentation_key="", 880 | ) 881 | 882 | self.jobs = Jobs(client=self.client) 883 | self.secret = Secret(client=self.client) 884 | self.workspace = Workspace(client=self.client) 885 | self.log_to_appi = log_to_appi 886 | 887 | @trace 888 | def __dkea_token(self) -> str: 889 | """ 890 | Returns the DKEA token needed for using dbutils secrets locally. 
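In a local databricks-connect session the returned value is typically handed to `dbkdev.core.DevelopmentClient.set_dbkea`, which calls `dbutils.secrets.setToken(...)` so that secret scopes can be read outside the workspace (a usage sketch inferred from the two modules, not an officially documented flow).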
891 | 892 | Returns 893 | ------- 894 | str 895 | The token 896 | """ 897 | code = ''' 898 | v = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get() 899 | k = v.replace("Some(", "").replace(")", "") 900 | key = k[:4] + '-' + k[4:] 901 | 902 | dbutils.notebook.exit(key) 903 | ''' 904 | output_notebook = self.run_python_code_on_notebook(code) 905 | dkea = output_notebook.replace('-', '') 906 | return dkea 907 | 908 | @trace 909 | def run_python_code_on_notebook(self, code: str) -> str: 910 | """ 911 | Runs python code as a notebook 912 | 913 | Parameters 914 | ---------- 915 | code : str 916 | The code to execute 917 | 918 | Returns 919 | ------- 920 | str 921 | Results from Databricks API call 922 | """ 923 | # TODO: Add _and_wait in the name 924 | temp_name = "{}.py".format(next(_tempfile._get_candidate_names())) 925 | defult_tmp_dir = _tempfile._get_default_tempdir() 926 | temp_file_path = _os.path.join(defult_tmp_dir, temp_name) 927 | self.workspace.upload_content(temp_file_path, code) 928 | output = self.jobs.run_notebook_and_wait( 929 | destination_path=temp_file_path, 930 | cluster_id=self.cluster.cluster_id, 931 | delete_run=True 932 | ) 933 | self.workspace.delete_content(temp_file_path) 934 | return output 935 | 936 | @trace 937 | def run_notebook_and_wait( 938 | self, 939 | destination_path: str, 940 | delete_run=False 941 | ): 942 | return self.jobs.run_notebook_and_wait( 943 | destination_path=destination_path, 944 | cluster_id=self.cluster.cluster_id, 945 | delete_run=delete_run 946 | ) 947 | -------------------------------------------------------------------------------- /src/modules/dbkenv/local.py: -------------------------------------------------------------------------------- 1 | import pyspark.databricks_connect as _dbc 2 | 3 | 4 | 5 | 6 | class DatabricksLocal: 7 | """ 8 | Sets up the local environment to use a remote instance of Databricks 9 | """ 10 | def __init__( 11 | self, 12 | host: str, 13 | databricks_token: str, 14 | cluster_id: str, 15 | org_id: str, 16 | port=15001 17 | ): 18 | """ 19 | Instantiates this object. 
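A minimal sketch of the expected call (all values are placeholders):

    DatabricksLocal(
        host='https://adb-123.azuredatabricks.net',
        databricks_token='<personal-access-token>',
        cluster_id='<cluster-id>',
        org_id='<organization-id>',
    ).initialize()

`initialize()` writes the databricks-connect configuration and then runs its connection test.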
20 | 21 | Parameters 22 | ---------- 23 | host : str 24 | Databricks host 25 | databricks_token : str 26 | Personal token 27 | cluster_id : str 28 | Cluster's id 29 | org_id : str 30 | Organization id 31 | port : int, optional 32 | Port for connection, by default 15001 33 | """ 34 | self.host = host 35 | self.databricks_token = databricks_token 36 | self.cluster_id = cluster_id 37 | self.org_id = org_id 38 | self.port = port 39 | 40 | def initialize(self): 41 | """Initialize the configuration.""" 42 | _dbc.save_config( 43 | host=self.host, 44 | token=self.databricks_token, 45 | cluster=self.cluster_id, 46 | org_id=self.org_id, 47 | port=self.port 48 | ) 49 | _dbc.test() 50 | return True 51 | -------------------------------------------------------------------------------- /src/modules/dbkenv/requirements.txt: -------------------------------------------------------------------------------- 1 | databricks-cli==0.9.1 2 | databricks-connect==7.* -------------------------------------------------------------------------------- /src/modules/devmaint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/modules/devmaint/__init__.py -------------------------------------------------------------------------------- /src/modules/devmaint/command_line.py: -------------------------------------------------------------------------------- 1 | from .docgenerator import generate_documentation 2 | import argparse 3 | from argparse import Namespace 4 | 5 | 6 | 7 | 8 | 9 | def arguments() -> Namespace: 10 | parser = argparse.ArgumentParser( 11 | prog='docgen', 12 | description='Generates the documentation of the given package.' 13 | ) 14 | parser.add_argument('-p', type=str, help='package parent folder') 15 | parser.add_argument('-n', type=str, help='package name') 16 | parser.add_argument('-o', default='documentation.md', help="output filename") 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def main(): 22 | """ 23 | Main function that runs this script 24 | """ 25 | 26 | args = arguments() 27 | 28 | package_parent_path = args.p.strip() 29 | package_name = args.n.strip() 30 | output_name = args.o.strip() 31 | 32 | outpath = generate_documentation( 33 | package_parent_path=package_parent_path, 34 | package_name=package_name, 35 | output_name=output_name 36 | ) 37 | 38 | print(f'Documentation saved in "{outpath}"') 39 | -------------------------------------------------------------------------------- /src/modules/devmaint/docgenerator.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module is used to create the documentation of the library 3 | in a compatible way with Azure DevOps Wiki. 
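The typical entry points are the `docgen` argument parser in `command_line.py` and the `generate_documentation` function; a minimal programmatic sketch (paths are illustrative):

```python
from devmaint.docgenerator import generate_documentation

generate_documentation(
    package_parent_path="src/modules",  # folder that contains the package
    package_name="dbkcore",             # package to document
    output_name="documentation.md",     # written into package_parent_path
)
```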
4 | 5 | ## Prerequisites 6 | 7 | Make sure that you have installed pandoc in Ubuntu 8 | 9 | ```bash 10 | sudo apt-get install pandoc 11 | ``` 12 | """ 13 | 14 | import pdoc 15 | from pdoc import Module 16 | from pypandoc import convert_text 17 | import pypandoc as pp 18 | import re 19 | from typing import List 20 | import sys 21 | from pathlib import Path 22 | 23 | 24 | 25 | 26 | def recursive_mods(mod: Module) -> List[Module]: 27 | """ 28 | Gets all the submodules of the given module 29 | 30 | Parameters 31 | ---------- 32 | mod : pdoc.Module 33 | The father module 34 | 35 | Yields 36 | ------- 37 | [pdoc.Module] 38 | List of the module with the submodules 39 | """ 40 | yield mod 41 | for submod in mod.submodules(): 42 | yield from recursive_mods(submod) 43 | 44 | 45 | def normalize_md(text: str) -> str: 46 | """ 47 | Applies a series of normalization steps to a text 48 | 49 | Parameters 50 | ---------- 51 | text : str 52 | Text to manipulate 53 | 54 | Returns 55 | ------- 56 | str 57 | The new text 58 | """ 59 | # Replace \r\n with \n 60 | res = re.sub(r"(\r\n)", '\n', text) 61 | # Remove multiple lines 62 | res = re.sub(r"(\n{2,})", '\n', res) 63 | # Fix title = 64 | res = re.sub(r"(\n\n)=", '\n', res) 65 | # Fix title - 66 | res = re.sub(r"(\n\n)-", '\n', res) 67 | # Fix parameters 68 | res = re.sub(r"(^\*{2,})", '\n**', res) 69 | return res 70 | 71 | 72 | def normalize_pandoc(text: str) -> str: 73 | """ 74 | Series of normalization steps for pandoc generated text 75 | 76 | Parameters 77 | ---------- 78 | text : str 79 | Pandoc text 80 | 81 | Returns 82 | ------- 83 | str 84 | Normalized pandoc text 85 | """ 86 | text_norm = re.sub(r"(\r\n)", '\n', text) 87 | text_norm = re.sub(r"( {4})", '\t', text_norm) 88 | text_norm = re.sub(r'(-----\n\nGenerated by.*)', '----', text_norm) 89 | return text_norm 90 | 91 | 92 | def set_methods_ado_links(text: str) -> str: 93 | """ 94 | Add links to the markdown text compatible with Azure DevOps 95 | 96 | Parameters 97 | ---------- 98 | text : str 99 | [description] 100 | 101 | Returns 102 | ------- 103 | str 104 | [description] 105 | """ 106 | text_links = text 107 | links = set(re.findall(r"\(#(.*)\)", text)) 108 | for link in links: 109 | text_links = text_links.replace("(#{})".format(link), "(#module-`{}`)".format(link)) 110 | return text_links 111 | 112 | 113 | def convert_format(text: str, f='markdown+abbreviations', t='commonmark') -> str: 114 | """ 115 | Converts a text from a format to another 116 | 117 | Parameters 118 | ---------- 119 | text : str 120 | The text to convert 121 | f : str, optional 122 | The from format, by default 'markdown+abbreviations' 123 | t : str, optional 124 | [description], by default 'commonmark' 125 | 126 | Returns 127 | ------- 128 | str 129 | [description] 130 | """ 131 | return convert_text(text, format=f, to=t) 132 | 133 | 134 | def modules_documentation_pdf(modules: List[Module], f='markdown+abbreviations', t='commonmark') -> str: 135 | doc_string = pdoc._render_template(template_name='/pdf.mako', modules=modules) 136 | md_raw = convert_format(doc_string) 137 | md = normalize_pandoc(md_raw) 138 | md = set_methods_ado_links(md) 139 | return md 140 | 141 | 142 | # def modules_documentation_text(modules: [pdoc.Module], f='markdown+abbreviations', t='commonmark') -> str: 143 | # """ 144 | # An attempt to create the documentation starting from the text extracted from pdoc. 
145 | # !Not to use 146 | 147 | # Parameters 148 | # ---------- 149 | # modules : list(pdoc.Module) 150 | # List of modules from which extract the documentation 151 | # f : str, optional 152 | # From pandoc format, by default 'markdown+abbreviations' 153 | # t : str, optional 154 | # To pandoc format, by default 'commonmark' 155 | 156 | # Returns 157 | # ------- 158 | # str 159 | # [description] 160 | # """ 161 | # docs = extract_documentation_as_text(modules) 162 | # md_raw = convert_text(docs, format=f, to=t) 163 | # md = normalize_pandoc(md_raw) 164 | # md = set_methods_ado_links(md) 165 | # return md 166 | 167 | 168 | def extract_documentation_as_text(modules: List[Module]) -> str: 169 | """ 170 | Extracts the documentation as text from the given modules. 171 | Then these are merged with a separator. 172 | 173 | Parameters 174 | ---------- 175 | modules : list(pdoc.Modules) 176 | The modules for which to create the documentation 177 | 178 | Returns 179 | ------- 180 | str 181 | The documentation as plain text 182 | """ 183 | module_texts = "___".join([x.text() for x in modules]) 184 | return module_texts 185 | 186 | 187 | def extract_modules(modules_parent: str) -> List[Module]: 188 | """ 189 | Extracts recursively all the modules that are part of the given module 190 | 191 | Parameters 192 | ---------- 193 | modules_parent : str 194 | Name of the module from which to start the extraction 195 | 196 | Returns 197 | ------- 198 | [pdoc.Module] 199 | List of pdoc modules 200 | """ 201 | context = pdoc.Context() 202 | pdoc.link_inheritance(context) 203 | mod = pdoc.Module(modules_parent, context=context) 204 | modules_ls = [m for m in recursive_mods(mod)] 205 | return modules_ls 206 | 207 | 208 | def test_markdown_convertions(text: str, module_name: str): 209 | pandoc_from, pandoc_to = pp.get_pandoc_formats() 210 | res = [] 211 | for f in [x for x in pandoc_from if 'mark' in x]: 212 | for t in [x for x in pandoc_to if 'mark' in x]: 213 | text_new = normalize_pandoc(convert_format(text, f, t)) 214 | res.append((text_new, f, t)) 215 | with open('docs_test\\test_doc_{}_{}_{}.md'.format(module_name, f, t), 'w', encoding="utf-8") as out: 216 | out.write(text_new) 217 | 218 | return True 219 | 220 | 221 | def create_adow_documentation(package: str) -> str: 222 | """ 223 | Creates the documentation to use with Azure DevOps Wiki (adow) 224 | 225 | Parameters 226 | ---------- 227 | package : str 228 | The name of the package for which to create the documentation 229 | 230 | Returns 231 | ------- 232 | str 233 | The documentation 234 | """ 235 | modules = extract_modules(package) 236 | modules_documentation = [modules_documentation_pdf([module]) for module in modules] 237 | package_documentation = "".join(modules_documentation) 238 | return package_documentation 239 | 240 | 241 | def generate_documentation(package_parent_path: str, package_name: str, output_name: str) -> Path: 242 | """ 243 | Generates and saves the documentation 244 | 245 | Parameters 246 | ---------- 247 | package_parent_path : str 248 | Parent folder of the package 249 | package_name : str 250 | Name of the package 251 | output_name : str 252 | Output file name 253 | """ 254 | output_path = Path(package_parent_path).joinpath(output_name) 255 | sys.path.append(package_parent_path) 256 | doc = create_adow_documentation(package_name) 257 | with open(str(output_path), 'w+', encoding="utf-8") as out: 258 | out.write(doc) 259 | return output_path 260 | --------------------------------------------------------------------------------
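The `devmaint` package above is the documentation generator: `command_line.arguments` collects `-p` (package parent folder), `-n` (package name) and `-o` (output filename) and hands them to `docgenerator.generate_documentation`, which renders every submodule with pdoc, converts the result with pandoc, and writes an Azure DevOps Wiki-compatible Markdown file inside the parent folder. A minimal sketch of calling the generator directly from Python follows; the `src/modules` path and the `dbkcore` package name are taken from this repository's layout but are only illustrative, and pandoc must already be installed as the module docstring notes.

```python
# Minimal sketch (illustrative paths): generate the Markdown documentation
# for one package with devmaint. Assumes src/modules is importable (e.g. via
# PYTHONPATH, as the dev container setup suggests) and that pandoc is installed.
from devmaint.docgenerator import generate_documentation

output_file = generate_documentation(
    package_parent_path='src/modules',  # folder that contains the package
    package_name='dbkcore',             # package to document
    output_name='documentation.md'      # file written inside package_parent_path
)
print(f'Documentation saved in "{output_file}"')
```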
/src/modules/devmaint/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | pypandoc>=1.4 3 | pdoc3>=0.7.4 -------------------------------------------------------------------------------- /src/modules/tests/dbkcore/test_logger.py: -------------------------------------------------------------------------------- 1 | """Test script.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | sys.path.append(str(Path(__file__).parent.parent.parent)) 6 | 7 | import os 8 | from dotenv import load_dotenv 9 | import pytest 10 | load_dotenv(override=True) 11 | 12 | 13 | from dbkcore.core import trace 14 | from dbkcore.core import Log 15 | Log("Unit Tests", os.environ['APPI_IK']) 16 | 17 | 18 | @trace 19 | def func_to_test_div(a, b): 20 | res = a / b 21 | return res 22 | 23 | 24 | def test_log_info(): 25 | Log.get_instance().log_info("Test info") 26 | 27 | 28 | def test_log_debug(): 29 | Log.get_instance().log_debug("Test debug") 30 | 31 | 32 | def test_log_critical(): 33 | Log.get_instance().log_critical("Test critical") 34 | 35 | 36 | def test_log_warning(): 37 | Log.get_instance().log_warning("Test warning") 38 | 39 | 40 | def test_log_error(): 41 | Log.get_instance().log_error("Test error") 42 | 43 | 44 | def test_trace(): 45 | func_to_test_div(a=3, b=2) 46 | 47 | 48 | def test_trace_error(): 49 | with pytest.raises(ZeroDivisionError): 50 | try: 51 | func_to_test_div(a=1, b=0) 52 | except ZeroDivisionError as e: 53 | Log.get_instance().log_error(f"Function call failed: {e}") 54 | raise e 55 | -------------------------------------------------------------------------------- /src/modules/tests/dbkenv/content/py_file.py: -------------------------------------------------------------------------------- 1 | 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | from __future__ import print_function 19 | 20 | import sys 21 | from random import random 22 | from operator import add 23 | 24 | from pyspark.sql import SparkSession 25 | 26 | 27 | if __name__ == "__main__": 28 | """ 29 | Usage: pi [partitions] 30 | """ 31 | spark = SparkSession\ 32 | .builder\ 33 | .appName("PythonPi")\ 34 | .getOrCreate() 35 | 36 | partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2 37 | n = 100000 * partitions 38 | 39 | def f(_): 40 | x = random() * 2 - 1 41 | y = random() * 2 - 1 42 | return 1 if x ** 2 + y ** 2 <= 1 else 0 43 | 44 | count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add) 45 | print("Pi is roughly %f" % (4.0 * count / n)) -------------------------------------------------------------------------------- /src/modules/tests/dbkenv/content/unittest_notebook.py: -------------------------------------------------------------------------------- 1 | dbutils.notebook.exit("success") -------------------------------------------------------------------------------- /src/modules/tests/dbkenv/test_cluster.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | sys.path.append(str(Path(__file__).parent.parent.parent)) 5 | 6 | from dbkcore.core import Log 7 | from dbkenv.core import ResourceClient 8 | from dbkenv.core import Configuration 9 | from dbkenv.core import DatabricksResourceManager 10 | from dbkenv.local import DatabricksLocal 11 | import json 12 | import os 13 | import pytest 14 | 15 | # import time 16 | 17 | 18 | def clients(): 19 | configuration = Configuration(file_load=True) 20 | cluster_config_file = str(Path(__file__).parent.joinpath('unittest_cluster.json')) 21 | 22 | with open(cluster_config_file, 'r') as cl: 23 | cluster_configuration = json.load(cl) 24 | 25 | cluster_name = cluster_configuration['cluster_name'] 26 | # instantiate the logger 27 | Log( 28 | name='unittest', 29 | connection_string=configuration.APPINSIGHT_CONNECTIONSTRING 30 | ) 31 | client = ResourceClient( 32 | host=configuration.DATABRICKS_HOST, 33 | personal_token=configuration.DATABRICKS_TOKEN 34 | ) 35 | drm = DatabricksResourceManager( 36 | client=client, 37 | cluster_name=cluster_name, 38 | cluster_configuration=cluster_configuration 39 | ) 40 | 41 | return drm 42 | 43 | 44 | def test_cluster_create(): 45 | assert clients().cluster.create_cluster_and_wait(), "Cluster not created" 46 | 47 | 48 | # def test_cluster_start(): 49 | # assert clients().cluster.cluster_started(), "Failed to start cluster" 50 | 51 | 52 | def test_local_dev(): 53 | configuration = Configuration(file_load=True) 54 | dbc = DatabricksLocal( 55 | host=configuration.DATABRICKS_HOST, 56 | databricks_token=configuration.DATABRICKS_TOKEN, 57 | cluster_id=clients().cluster.cluster_id, 58 | org_id=configuration.DATABRICKS_ORDGID 59 | ) 60 | success = dbc.initialize() 61 | assert success, "Failed to configure locally" 62 | 63 | 64 | # Test content 65 | source_file_name = 'unittest_notebook.py' 66 | source_file_path = str(Path(__file__).parent.joinpath('content', source_file_name)) 67 | with open(source_file_path, 'r') as file: 68 | data = file.read() 69 | 70 | destination_dir = "/unittesting" 71 | destination_file_path = os.path.join(destination_dir, source_file_name) 72 | 73 | 74 | def test_file_upload(): 75 | clients().workspace.make_dir(destination_dir) 76 | clients().workspace.upload_content(destination_file_path, data) 77 | content = 
clients().workspace.list_content(destination_folder=destination_file_path) 78 | elements_in_folder = [os.path.basename(e["path"]) for e in content['objects']] 79 | assert source_file_name in elements_in_folder, "Failed to upload the file" 80 | 81 | 82 | def test_file_run(): 83 | output = clients().run_notebook_and_wait( 84 | destination_path=destination_file_path, 85 | delete_run=True 86 | ) 87 | assert output == "success", "Failed to upload and run notebook" 88 | 89 | 90 | def test_file_delete(): 91 | content = clients().workspace.list_content(destination_folder=destination_dir) 92 | if not content: 93 | pytest.skip("Folder is empty") 94 | elif source_file_name not in [os.path.basename(e["path"]) for e in content['objects']]: 95 | pytest.skip("File not in folder") 96 | 97 | clients().workspace.delete_content(destination_file_path) 98 | content = clients().workspace.list_content(destination_folder=destination_dir) 99 | elements_in_folder = [] 100 | if content: 101 | elements_in_folder = [os.path.basename(e["path"]) for e in content['objects']] 102 | assert source_file_name not in elements_in_folder, "Failed to delete the file" 103 | clients().workspace.delete_content(destination_dir) 104 | 105 | 106 | def test_run_code(): 107 | code = ''' 108 | a = 1 109 | b = 2 110 | c = a + b 111 | dbutils.notebook.exit(c) 112 | ''' 113 | output = clients().run_python_code_on_notebook(code) 114 | assert output == "3", "Failed to compute code" 115 | 116 | 117 | def test_cluster_delete(): 118 | assert clients().cluster.delete_cluster_and_wait(), "Failed to delete cluster" 119 | -------------------------------------------------------------------------------- /src/modules/tests/dbkenv/unittest_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "autoscale": { 3 | "min_workers": 1, 4 | "max_workers": 4 5 | }, 6 | "num_workers": 1, 7 | "cluster_name": "unittest_cluster", 8 | "spark_version": "7.3.x-cpu-ml-scala2.12", 9 | "spark_conf": { 10 | "spark.databricks.delta.preview.enabled": "true" 11 | }, 12 | "node_type_id": "Standard_DS3_v2", 13 | "driver_node_type_id": "Standard_DS3_v2", 14 | "ssh_public_keys": [], 15 | "custom_tags": {}, 16 | "spark_env_vars": { 17 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 18 | }, 19 | "autotermination_minutes": 60, 20 | "enable_elastic_disk": true 21 | } -------------------------------------------------------------------------------- /src/modules/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # content of pytest.ini 2 | [pytest] 3 | markers = 4 | cenmaint: test for cenmaint package 5 | cenlocal: test for cenlocal package 6 | cendatabricks: test for cendatabricks package -------------------------------------------------------------------------------- /src/pipelines/dbkframework/documentation.md: -------------------------------------------------------------------------------- 1 | # Module `dbkcore` 2 | 3 | ## Sub-modules 4 | 5 | - [dbkcore.core](#module-`dbkcore.core`) 6 | - [dbkcore.helpers](#module-`dbkcore.helpers`) 7 | 8 | ---- 9 | # Module `dbkcore.core` 10 | 11 | ## Functions 12 | 13 | ### Function `trace` 14 | 15 | > 16 | > 17 | > def trace( 18 | > original_function: Any = None, 19 | > *, 20 | > attrs_refact: List[str] = None 21 | > ) 22 | 23 | Log the function call.
24 | 25 | ###### Parameters 26 | 27 | - **`original_function`** : Any, optional 28 | Function to trace, by default None 29 | - **`attrs_refact`** : List\[str\], optional 30 | List of parameters to hide from logging, by default None 31 | 32 | ## Classes 33 | 34 | ### Class `BaseObject` 35 | 36 | > 37 | > 38 | > class BaseObject 39 | 40 | Base class to use with any object new object. It implements the method 41 | log which will be used for logging 42 | 43 | #### Static methods 44 | 45 | ##### `Method class_name` 46 | 47 | > 48 | > 49 | > def class_name( 50 | > cls 51 | > ) ‑> str 52 | 53 | #### Methods 54 | 55 | ##### Method `log` 56 | 57 | > 58 | > 59 | > def log( 60 | > self, 61 | > prefix='', 62 | > suffix='' 63 | > ) 64 | 65 | Specifices how to log the object 66 | 67 | ### Class `Log` 68 | 69 | > 70 | > 71 | > class Log( 72 | > name: str, 73 | > connection_string: str = None 74 | > ) 75 | 76 | Helper class for Application Insight Logger. 77 | 78 | Create a new Log object. 79 | 80 | #### Parameters 81 | 82 | - **`name`** : str 83 | Name used by the logger for tracing 84 | - **`connection_string`** : \[type\], optional 85 | Application Insight’s connection string 86 | 87 | #### Instance variables 88 | 89 | ##### Variable `logger` 90 | 91 | Type: `logging.Logger` 92 | 93 | Logger that will be used. 94 | 95 | ###### Returns 96 | 97 | - Logger 98 | This logger 99 | 100 | ##### Variable `tracer` 101 | 102 | Type: `opencensus.trace.tracer.Tracer` 103 | 104 | Tracer that will be used. 105 | 106 | ###### Returns 107 | 108 | - Tracer 109 | The tracer 110 | 111 | #### Static methods 112 | 113 | ##### `Method get_instance` 114 | 115 | > 116 | > 117 | > def get_instance() 118 | 119 | Current instance 120 | 121 | #### Methods 122 | 123 | ##### Method `log_critical` 124 | 125 | > 126 | > 127 | > def log_critical( 128 | > self, 129 | > message: str, 130 | > prefix='', 131 | > custom_dimension: dict = None 132 | > ) 133 | 134 | Log a message as critical. 135 | 136 | ###### Parameters 137 | 138 | - **`message`** : str 139 | The message 140 | 141 | ##### Method `log_debug` 142 | 143 | > 144 | > 145 | > def log_debug( 146 | > self, 147 | > message: str, 148 | > prefix='', 149 | > custom_dimension: dict = None 150 | > ) 151 | 152 | Log a message as debug. 153 | 154 | ###### Parameters 155 | 156 | - **`message`** : str 157 | The message 158 | 159 | ##### Method `log_error` 160 | 161 | > 162 | > 163 | > def log_error( 164 | > self, 165 | > message: str, 166 | > include_stack=True, 167 | > prefix='', 168 | > custom_dimension: dict = None 169 | > ) 170 | 171 | Log a message as error. 172 | 173 | ###### Parameters 174 | 175 | - **`message`** : str 176 | The message 177 | 178 | ##### Method `log_info` 179 | 180 | > 181 | > 182 | > def log_info( 183 | > self, 184 | > message: str, 185 | > prefix='', 186 | > custom_dimension: dict = None 187 | > ) 188 | 189 | Log a message as info. 190 | 191 | ###### Parameters 192 | 193 | - **`message`** : str 194 | The message 195 | 196 | ##### Method `log_warning` 197 | 198 | > 199 | > 200 | > def log_warning( 201 | > self, 202 | > message: str, 203 | > prefix='', 204 | > custom_dimension: dict = None 205 | > ) 206 | 207 | Log a message as warning. 
208 | 209 | ###### Parameters 210 | 211 | - **`message`** : str 212 | The message 213 | 214 | ##### Method `trace_function` 215 | 216 | > 217 | > 218 | > def trace_function( 219 | > self, 220 | > name: str, 221 | > kwargs: dict 222 | > ) ‑> Optional[opencensus.trace.span.Span] 223 | 224 | Traces a function 225 | 226 | ###### Parameters 227 | 228 | - **`name`** : str 229 | Name of the function used for tracing 230 | - **`name`** : kwargs 231 | The parameters of the function 232 | 233 | ###### Returns 234 | 235 | - Span 236 | A Span that can be used for customizing logging 237 | 238 | ### Class `Singleton` 239 | 240 | > 241 | > 242 | > class Singleton( 243 | > *args, 244 | > **kwargs 245 | > ) 246 | 247 | Create a singleton. 248 | 249 | #### Ancestors (in MRO) 250 | 251 | - [builtins.type](#module-`builtins.type`) 252 | 253 | ---- 254 | # Module `dbkcore.helpers` 255 | 256 | Various utilities for speed up development 257 | 258 | ## Functions 259 | 260 | ### Function `add_folder_in_current_directory` 261 | 262 | > 263 | > 264 | > def add_folder_in_current_directory( 265 | > folder_name: str 266 | > ) ‑> bool 267 | 268 | Add a folder in the current directory. 269 | 270 | ###### Parameters 271 | 272 | - **`folder_name`** : str 273 | New folder name 274 | 275 | ###### Returns 276 | 277 | - bool 278 | True if success 279 | 280 | ### Function `current_directory` 281 | 282 | > 283 | > 284 | > def current_directory() ‑> str 285 | 286 | Get current directory. 287 | 288 | ###### Returns 289 | 290 | - str 291 | The current directory path 292 | 293 | ### Function `is_json_serializable` 294 | 295 | > 296 | > 297 | > def is_json_serializable( 298 | > x: Any 299 | > ) ‑> bool 300 | 301 | Check if the object is serializable. 302 | 303 | ###### Parameters 304 | 305 | - **`x`** : Any 306 | Object to validate 307 | 308 | ###### Returns 309 | 310 | - bool 311 | True if success 312 | 313 | ---- 314 | 315 | 316 | # Module `dbkdev` 317 | 318 | ## Sub-modules 319 | 320 | - [dbkdev.core](#module-`dbkdev.core`) 321 | - [dbkdev.data\_steps](#module-`dbkdev.data_steps`) 322 | 323 | ---- 324 | # Module `dbkdev.core` 325 | 326 | ## Classes 327 | 328 | ### Class `DevelopmentClient` 329 | 330 | > 331 | > 332 | > class DevelopmentClient( 333 | > dbutils, 334 | > spark: pyspark.sql.session.SparkSession, 335 | > ide_environment: dbkdev.core.IdeEnvironment 336 | > ) 337 | 338 | Client to use for local Databricks’ local development 339 | 340 | Instantiates this object 341 | 342 | #### Parameters 343 | 344 | - **`dbutils`** : Dbutils 345 | The Dbutils instance to use 346 | - **`spark`** : SparkSession 347 | The SparkSession to use 348 | - **`ide_environment`** 349 | : [IdeEnvironment](#module-`dbkdev.core.IdeEnvironment "dbkdev.core.IdeEnvironment"`) 350 | The environment used 351 | - **`deployment_environment`** : DeploymentEnvironment 352 | The deployment environment 353 | 354 | #### Instance variables 355 | 356 | ##### Variable `dbutils` 357 | 358 | Type: `Any` 359 | 360 | ##### Variable `ide_environment` 361 | 362 | Type: `dbkdev.core.IdeEnvironment` 363 | 364 | ##### Variable `mount_name` 365 | 366 | Type: `str` 367 | 368 | Standard name of the root mount for the configured storage account and 369 | container 370 | 371 | ###### Returns 372 | 373 | - str 374 | \[description\] 375 | 376 | ##### Variable `mount_path` 377 | 378 | Type: `str` 379 | 380 | Standard mount path 381 | 382 | ###### Returns 383 | 384 | - str 385 | The path 386 | 387 | ##### Variable `spark` 388 | 389 | Type: `pyspark.sql.session.SparkSession` 390 | 391 
| #### Static methods 392 | 393 | ##### `Method get_instance` 394 | 395 | > 396 | > 397 | > def get_instance() 398 | 399 | #### Methods 400 | 401 | ##### Method `create_schema` 402 | 403 | > 404 | > 405 | > def create_schema( 406 | > self, 407 | > schema_databricks: str 408 | > ) 409 | 410 | Creates a schema in Databricks 411 | 412 | ###### Parameters 413 | 414 | - **`schema_databricks`** : str 415 | Name of the schema 416 | 417 | ##### Method `files` 418 | 419 | > 420 | > 421 | > def files( 422 | > self, 423 | > path: str 424 | > ) ‑> list 425 | 426 | ##### Method `list_databases` 427 | 428 | > 429 | > 430 | > def list_databases( 431 | > self 432 | > ) ‑> List[str] 433 | 434 | Gets the list of Databricks databases (a.k.a. schemas) 435 | 436 | ###### Returns 437 | 438 | - List\[str\] 439 | List of schemas 440 | 441 | ##### Method `list_mounts` 442 | 443 | > 444 | > 445 | > def list_mounts( 446 | > self 447 | > ) ‑> list 448 | 449 | ##### Method `list_tables` 450 | 451 | > 452 | > 453 | > def list_tables( 454 | > self, 455 | > schema: str 456 | > ) ‑> List[str] 457 | 458 | List the tables in the given schema 459 | 460 | ###### Parameters 461 | 462 | - **`schema`** : str 463 | The Databricks schema 464 | 465 | ###### Returns 466 | 467 | - List\[str\] 468 | List of tables 469 | 470 | ##### Method `load_temp_table` 471 | 472 | > 473 | > 474 | > def load_temp_table( 475 | > self, 476 | > table_name: str 477 | > ) ‑> pyspark.sql.dataframe.DataFrame 478 | 479 | ##### Method `mount_exists` 480 | 481 | > 482 | > 483 | > def mount_exists( 484 | > self, 485 | > mount_name: str 486 | > ) ‑> bool 487 | 488 | ##### Method `read_csv` 489 | 490 | > 491 | > 492 | > def read_csv( 493 | > self, 494 | > file_path: str 495 | > ) ‑> pyspark.sql.dataframe.DataFrame 496 | 497 | ##### Method `read_parquet` 498 | 499 | > 500 | > 501 | > def read_parquet( 502 | > self, 503 | > file_path: str 504 | > ) ‑> pyspark.sql.dataframe.DataFrame 505 | 506 | ##### Method `save_delta_table` 507 | 508 | > 509 | > 510 | > def save_delta_table( 511 | > self, 512 | > dataframe: pyspark.sql.dataframe.DataFrame, 513 | > schema: str, 514 | > table_name: str, 515 | > output_path: pathlib.Path, 516 | > partition_columns: List[str] = None, 517 | > mode: str = 'overwrite', 518 | > overwrite_schema: bool = False 519 | > ) 520 | 521 | Saves the dataframe as a delta table in an external location 522 | \#\#\#\#\#\# Parameters 523 | 524 | - **`dataframe`** : DataFrame 525 | The dataframe 526 | - **`schema`** : str 527 | Destination schema 528 | - **`table_name`** : str 529 | Destination schema 530 | - **`output_path`** : Path 531 | Folder where to save the dataframe 532 | - **`partition_columns`** : List\[str\] 533 | Columns to use for partitioning, default is None 534 | - **`mode`** : str 535 |   536 | 537 | e.g. 
append, overwrite, passed to dataframe.write.saveAsTable 538 | 539 | ##### Method `save_temp_table` 540 | 541 | > 542 | > 543 | > def save_temp_table( 544 | > self, 545 | > dataframe: pyspark.sql.dataframe.DataFrame, 546 | > table_name: str, 547 | > cache=True 548 | > ) 549 | 550 | ##### Method `set_dbkea` 551 | 552 | > 553 | > 554 | > def set_dbkea( 555 | > self, 556 | > dbkea_token: str 557 | > ) 558 | 559 | To use when the environment is LOCAL for using the dbutils secrets 560 | 561 | ###### Parameters 562 | 563 | - **`dbkea_token`** : str 564 | The token 565 | 566 | ##### Method `table_exists` 567 | 568 | > 569 | > 570 | > def table_exists( 571 | > self, 572 | > schema_name: str, 573 | > table_name: str 574 | > ) 575 | 576 | ### Class `DevelopmentEngine` 577 | 578 | > 579 | > 580 | > class DevelopmentEngine 581 | 582 | #### Static methods 583 | 584 | ##### `Method get_instance` 585 | 586 | > 587 | > 588 | > def get_instance() 589 | 590 | ### Class `IdeEnvironment` 591 | 592 | > 593 | > 594 | > class IdeEnvironment( 595 | > value, 596 | > names=None, 597 | > *, 598 | > module=None, 599 | > qualname=None, 600 | > type=None, 601 | > start=1 602 | > ) 603 | 604 | An enumeration. 605 | 606 | #### Ancestors (in MRO) 607 | 608 | - [builtins.str](#module-`builtins.str`) 609 | - [enum.Enum](#module-`enum.Enum`) 610 | 611 | #### Class variables 612 | 613 | ##### Variable `DATABRICKS` 614 | 615 | ##### Variable `LOCAL` 616 | 617 | ---- 618 | # Module `dbkdev.data_steps` 619 | 620 | ## Functions 621 | 622 | ### Function `apply_test` 623 | 624 | > 625 | > 626 | > def apply_test( 627 | > func 628 | > ) 629 | 630 | Execute test function after the initialize. 631 | 632 | ###### Notes 633 | 634 | [Example](https://stackoverflow.com/a/15196410) 635 | 636 | ### Function `log_output` 637 | 638 | > 639 | > 640 | > def log_output( 641 | > func 642 | > ) 643 | 644 | Decorator for executing test in sequence 645 | 646 | ###### Notes 647 | 648 | [Example](https://stackoverflow.com/a/15196410) 649 | 650 | ### Function `pre_apply_test` 651 | 652 | > 653 | > 654 | > def pre_apply_test( 655 | > func 656 | > ) 657 | 658 | Execute test function before the initialize. 659 | 660 | ###### Notes 661 | 662 | [Example](https://stackoverflow.com/a/15196410) 663 | 664 | ## Classes 665 | 666 | ### Class `DataDirection` 667 | 668 | > 669 | > 670 | > class DataDirection( 671 | > value, 672 | > names=None, 673 | > *, 674 | > module=None, 675 | > qualname=None, 676 | > type=None, 677 | > start=1 678 | > ) 679 | 680 | An enumeration. 
681 | 682 | #### Ancestors (in MRO) 683 | 684 | - [builtins.str](#module-`builtins.str`) 685 | - [enum.Enum](#module-`enum.Enum`) 686 | 687 | #### Class variables 688 | 689 | ##### Variable `IN` 690 | 691 | ##### Variable `OUT` 692 | 693 | ### Class `DataStep` 694 | 695 | > 696 | > 697 | > class DataStep( 698 | > spark: pyspark.sql.session.SparkSession, 699 | > run_id: str 700 | > ) 701 | 702 | Creates a datastep to be used in a pipeline 703 | 704 | #### Parameters 705 | 706 | - **`metaclass`** : \[type\], optional 707 | \[description\], by default abc.ABCMeta 708 | 709 | #### Raises 710 | 711 | - Exception 712 | \[description\] 713 | 714 | #### Instance variables 715 | 716 | ##### Variable `display_name` 717 | 718 | Type: `str` 719 | 720 | ##### Variable `output_data` 721 | 722 | Type: `dbkdev.data_steps.DataStepDataframe` 723 | 724 | #### Methods 725 | 726 | ##### Method `check_output` 727 | 728 | > 729 | > 730 | > def check_output( 731 | > self, 732 | > **kwargs 733 | > ) 734 | 735 | ##### Method `initialize` 736 | 737 | > 738 | > 739 | > def initialize( 740 | > self 741 | > ) 742 | 743 | Define the DataStep logic. 744 | 745 | ##### Method `pandas_read_csv` 746 | 747 | > 748 | > 749 | > def pandas_read_csv( 750 | > self, 751 | > path: pathlib.Path 752 | > ) ‑> dbkdev.data_steps.DataStepDataframe 753 | 754 | ##### Method `set_output_data` 755 | 756 | > 757 | > 758 | > def set_output_data( 759 | > self, 760 | > dataframe: Union[pyspark.sql.dataframe.DataFrame, pandas.core.frame.DataFrame], 761 | > name='', 762 | > cache: bool = False 763 | > ) 764 | 765 | ##### Method `spark_read_csv` 766 | 767 | > 768 | > 769 | > def spark_read_csv( 770 | > self, 771 | > path: pathlib.Path 772 | > ) ‑> dbkdev.data_steps.DataStepDataframe 773 | 774 | ##### Method `spark_read_parquet_path` 775 | 776 | > 777 | > 778 | > def spark_read_parquet_path( 779 | > self, 780 | > path: pathlib.Path, 781 | > cache=False 782 | > ) ‑> dbkdev.data_steps.DataStepDataframe 783 | 784 | ##### Method `spark_read_table` 785 | 786 | > 787 | > 788 | > def spark_read_table( 789 | > self, 790 | > name: str 791 | > ) ‑> dbkdev.data_steps.DataStepDataframe 792 | 793 | ##### Method `spark_read_temp_table` 794 | 795 | > 796 | > 797 | > def spark_read_temp_table( 798 | > self, 799 | > name: str 800 | > ) ‑> dbkdev.data_steps.DataStepDataframe 801 | 802 | ##### Method `test_is_dataframe_empty` 803 | 804 | > 805 | > 806 | > def test_is_dataframe_empty( 807 | > self, 808 | > df: pyspark.sql.dataframe.DataFrame 809 | > ) 810 | 811 | ##### Method `test_negative_values` 812 | 813 | > 814 | > 815 | > def test_negative_values( 816 | > self, 817 | > cols: List[str], 818 | > dt: dbkdev.data_steps.DataStepDataframe 819 | > ) 820 | 821 | ##### Method `test_null_values` 822 | 823 | > 824 | > 825 | > def test_null_values( 826 | > self, 827 | > cols: List[str], 828 | > dt: dbkdev.data_steps.DataStepDataframe 829 | > ) 830 | 831 | ##### Method `test_rows_diff` 832 | 833 | > 834 | > 835 | > def test_rows_diff( 836 | > self, 837 | > dt_1: dbkdev.data_steps.DataStepDataframe, 838 | > dt_2: dbkdev.data_steps.DataStepDataframe 839 | > ) 840 | 841 | ##### Method `test_rows_eq` 842 | 843 | > 844 | > 845 | > def test_rows_eq( 846 | > self, 847 | > dt_1: dbkdev.data_steps.DataStepDataframe, 848 | > dt_2: dbkdev.data_steps.DataStepDataframe 849 | > ) 850 | 851 | ##### Method `test_rows_geq` 852 | 853 | > 854 | > 855 | > def test_rows_geq( 856 | > self, 857 | > dt_1: dbkdev.data_steps.DataStepDataframe, 858 | > dt_2: dbkdev.data_steps.DataStepDataframe 859 | > ) 
860 | 861 | ##### Method `test_rows_leq` 862 | 863 | > 864 | > 865 | > def test_rows_leq( 866 | > self, 867 | > dt_1: dbkdev.data_steps.DataStepDataframe, 868 | > dt_2: dbkdev.data_steps.DataStepDataframe 869 | > ) 870 | 871 | ##### Method `tests` 872 | 873 | > 874 | > 875 | > def tests( 876 | > self 877 | > ) 878 | 879 | Define all the tests that this step must pass 880 | 881 | ### Class `DataStepDataframe` 882 | 883 | > 884 | > 885 | > class DataStepDataframe( 886 | > name: str, 887 | > dataframe: Union[pyspark.sql.dataframe.DataFrame, pandas.core.frame.DataFrame], 888 | > cache=False 889 | > ) 890 | 891 | Base class to use with any new object. It implements the method 892 | log which will be used for logging 893 | 894 | #### Ancestors (in MRO) 895 | 896 | - [dbkcore.core.BaseObject](#module-`dbkcore.core.BaseObject`) 897 | 898 | #### Instance variables 899 | 900 | ##### Variable `is_pandas` 901 | 902 | Type: `bool` 903 | 904 | ##### Variable `is_pyspark` 905 | 906 | Type: `bool` 907 | 908 | ##### Variable `rows` 909 | 910 | #### Methods 911 | 912 | ##### Method `columns_negative` 913 | 914 | > 915 | > 916 | > def columns_negative( 917 | > self 918 | > ) ‑> List[str] 919 | 920 | Identifies the columns with negative values 921 | 922 | ###### Returns 923 | 924 | - List\[str\] 925 | Column names 926 | 927 | ##### Method `columns_null` 928 | 929 | > 930 | > 931 | > def columns_null( 932 | > self 933 | > ) ‑> List[str] 934 | 935 | Identifies the columns with null values 936 | 937 | ###### Returns 938 | 939 | - List\[str\] 940 | Column names 941 | 942 | ##### Method `log` 943 | 944 | > 945 | > 946 | > def log( 947 | > self, 948 | > direction: dbkdev.data_steps.DataDirection 949 | > ) 950 | 951 | Specifies how to log the object 952 | 953 | ##### Method `log_in` 954 | 955 | > 956 | > 957 | > def log_in( 958 | > self 959 | > ) 960 | 961 | ##### Method `log_out` 962 | 963 | > 964 | > 965 | > def log_out( 966 | > self 967 | > ) 968 | 969 | ##### Method `to_pandas` 970 | 971 | > 972 | > 973 | > def to_pandas( 974 | > self 975 | > ) ‑> pandas.core.frame.DataFrame 976 | 977 | ---- 978 | 979 | 980 | # Module `acai_ml` 981 | 982 | ## Sub-modules 983 | 984 | - [acai\_ml.core](#module-`acai_ml.core`) 985 | 986 | ---- 987 | # Module `acai_ml.core` 988 | 989 | ## Classes 990 | 991 | ### Class `Engine` 992 | 993 | > 994 | > 995 | > class Engine 996 | 997 | This is the core of the framework. It configures the environment to interact with the remote Databricks cluster.
998 | 999 | Instantiate the current object 1000 | 1001 | #### Static methods 1002 | 1003 | ##### `Method get_instance` 1004 | 1005 | > 1006 | > 1007 | > def get_instance() 1008 | 1009 | Current singleton Engine 1010 | 1011 | ###### Returns 1012 | 1013 | - [Engine](#module-`acai_ml.core.Engine "acai_ml.core.Engine"`) 1014 | The Engine 1015 | 1016 | ##### `Method ide_environment` 1017 | 1018 | > 1019 | > 1020 | > def ide_environment() ‑> dbkdev.core.IdeEnvironment 1021 | 1022 | Current Ide Environment 1023 | 1024 | ###### Returns 1025 | 1026 | - IdeEnvironment 1027 | The Ide Environment 1028 | 1029 | ##### `Method is_ide_dataricks` 1030 | 1031 | > 1032 | > 1033 | > def is_ide_dataricks() ‑> bool 1034 | 1035 | Checks if the current environment is Databricks 1036 | 1037 | ###### Returns 1038 | 1039 | - bool 1040 | Check result 1041 | 1042 | ##### `Method is_ide_local` 1043 | 1044 | > 1045 | > 1046 | > def is_ide_local() ‑> bool 1047 | 1048 | Checks if the current environment is Local 1049 | 1050 | ###### Returns 1051 | 1052 | - bool 1053 | Check result 1054 | 1055 | #### Methods 1056 | 1057 | ##### Method `dbutils` 1058 | 1059 | > 1060 | > 1061 | > def dbutils( 1062 | > self 1063 | > ) 1064 | 1065 | Current dbutils 1066 | 1067 | ###### Returns 1068 | 1069 | - DBUtils 1070 | The DBUtils 1071 | 1072 | ##### Method `initialize_env` 1073 | 1074 | > 1075 | > 1076 | > def initialize_env( 1077 | > self 1078 | > ) 1079 | 1080 | Initializes the DevelopmentClient. That is, sets the dbutils and spark 1081 | context accordingly if the code is runt on cluster or locally. 1082 | 1083 | ##### Method `initialize_logger` 1084 | 1085 | > 1086 | > 1087 | > def initialize_logger( 1088 | > self, 1089 | > pipeline_name: str, 1090 | > appi_ik_scope: str = 'config', 1091 | > appi_ik_secret: str = 'APPI_IK' 1092 | > ) 1093 | 1094 | Initializes the logger 1095 | 1096 | ###### Parameters 1097 | 1098 | - **`pipeline_name`** : str 1099 | Name to use with the logger. 
It will be the base name used for all 1100 | the upcoming logs and tracing 1101 | - **`appi_ik_scope`** : str, optional 1102 | Databricks secret scope where the Application Insight key is stored, 1103 | by default “dds” 1104 | - **`appi_ik_secret`** : str, optional 1105 | Databricks secret name where the Application Insight key is stored, 1106 | by default “appiik” 1107 | 1108 | ###### Raises 1109 | 1110 | - ValueError 1111 | Unknown Ide Environment used 1112 | 1113 | ##### Method `run_notebook` 1114 | 1115 | > 1116 | > 1117 | > def run_notebook( 1118 | > self, 1119 | > notebook: str, 1120 | > args: Dict[~KT, ~VT], 1121 | > timeout=86400, 1122 | > error_raise=True 1123 | > ) 1124 | 1125 | ##### Method `run_notebook_with_retry` 1126 | 1127 | > 1128 | > 1129 | > def run_notebook_with_retry( 1130 | > self, 1131 | > notebook: str, 1132 | > args: Dict[~KT, ~VT], 1133 | > timeout=86400, 1134 | > max_retries=3 1135 | > ) 1136 | 1137 | Runs the specified notebook through dbutils 1138 | 1139 | ###### Parameters 1140 | 1141 | - **`notebook`** : str 1142 | Name or path of the notebook 1143 | - **`args`** : Dict 1144 | \[description\] 1145 | - **`timeout`** : int, optional 1146 | \[description\], by default 86400 1147 | - **`max_retries`** : int, optional 1148 | \[description\], by default 3 1149 | 1150 | ###### Returns 1151 | 1152 | \[type\] \[description\] 1153 | 1154 | ###### Raises 1155 | 1156 | - e 1157 | \[description\] 1158 | 1159 | ##### Method `spark` 1160 | 1161 | > 1162 | > 1163 | > def spark( 1164 | > self 1165 | > ) ‑> pyspark.sql.session.SparkSession 1166 | 1167 | Current spark context 1168 | 1169 | ###### Returns 1170 | 1171 | - SparkSession 1172 | Spark context 1173 | 1174 | ---- 1175 | -------------------------------------------------------------------------------- /src/pipelines/dbkframework/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | opencensus-ext-azure>=1.0.7 3 | typeguard>=2.12.0 4 | pandas>=1.2.4 5 | pyspark 6 | pydataset -------------------------------------------------------------------------------- /src/pipelines/dbkframework/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | current_file = Path(__file__).absolute() 6 | print(f'Current file path: {current_file}') 7 | # current_file_folder = Path(os.getcwd()) 8 | current_file_folder = Path(__file__).parent.absolute() 9 | print(f"Current folder: {current_file_folder}") 10 | 11 | path_readme = current_file_folder.joinpath('documentation.md') 12 | modules_root = current_file.parent.parent.parent.joinpath('modules') 13 | 14 | import sys 15 | sys.path.append(str(current_file.parent.parent.parent.joinpath('modules'))) 16 | from devmaint.docgenerator import create_adow_documentation 17 | 18 | package_name = current_file_folder.stem 19 | 20 | modules_to_use = ['dbkcore', 'dbkdev', 'acai_ml'] 21 | path_requirements = current_file_folder.joinpath('requirements.txt') 22 | package_dir = {} 23 | documentations = [] 24 | 25 | with open(path_requirements, "r") as fh: 26 | requirements = [l.strip() for l in fh.readlines()] 27 | 28 | requirements = [rq for rq in requirements if (rq) and (rq.startswith('#') is False)] 29 | 30 | packages = [] 31 | 32 | for module in modules_to_use: 33 | module_path = modules_root.joinpath(module) 34 | packages = packages + 
setuptools.find_namespace_packages(where=modules_root, include=[f'{module}*']) 35 | package_dir[module] = module_path 36 | doc = create_adow_documentation(str(module_path)) 37 | documentations.append(doc) 38 | 39 | documentation = '\n\n'.join(documentations) 40 | 41 | with open(str(path_readme), 'w', encoding="utf-8") as out: 42 | out.write(documentation) 43 | 44 | today = datetime.today() 45 | version = f'{today:%Y}{today:%m}{today:%d}_{today:%H}{today:%M}{today:%S}' 46 | 47 | setuptools.setup( 48 | name=package_name, 49 | version=version, 50 | author="Davide Fornelli", 51 | author_email="daforne@microsoft.com", 52 | description="Core library for logging and using proper base object", 53 | # long_description=documentation, 54 | long_description_content_type="text/markdown", 55 | packages=packages, 56 | package_dir=package_dir, 57 | install_requires=requirements, 58 | python_requires='~=3.7.6' 59 | ) 60 | -------------------------------------------------------------------------------- /src/setup/arm-templates/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "location": { 6 | "value": "northeurope" 7 | }, 8 | "resource_group": { 9 | "value": "rg-dbk-dev-001" 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /src/setup/arm-templates/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "location": { 6 | "type": "string" 7 | }, 8 | "resource_group": { 9 | "type": "string" 10 | }, 11 | "key_vault": { 12 | "defaultValue": "kv", 13 | "type": "String" 14 | }, 15 | "application_insights": { 16 | "defaultValue": "ai", 17 | "type": "String" 18 | }, 19 | "databricks_workspace": { 20 | "defaultValue": "dbkworkspace", 21 | "type": "String" 22 | }, 23 | "storage_account": { 24 | "defaultValue": "sa", 25 | "type": "String" 26 | }, 27 | "log_analytics_workspace": { 28 | "defaultValue": "law", 29 | "type": "String" 30 | } 31 | }, 32 | "variables": { 33 | 34 | }, 35 | "resources": [ 36 | { 37 | "type": "Microsoft.Resources/resourceGroups", 38 | "apiVersion": "2020-10-01", 39 | "location": "[parameters('location')]", 40 | "name": "[parameters('resource_group')]", 41 | "properties": {} 42 | }, 43 | { 44 | "name": "nestedDeployment1", 45 | "type": "Microsoft.Resources/deployments", 46 | "apiVersion": "2020-10-01", 47 | "resourceGroup": "[parameters('resource_group')]", 48 | "dependsOn": [ 49 | "[resourceId('Microsoft.Resources/resourceGroups/', parameters('resource_group'))]" 50 | ], 51 | "properties": { 52 | "expressionEvaluationOptions": { 53 | "scope": "inner" 54 | }, 55 | "mode": "Incremental", 56 | "parameters": { 57 | "resource_group": { 58 | "value": "[parameters('resource_group')]" 59 | }, 60 | "databricks_workspace": { 61 | "value": "[toLower(parameters('databricks_workspace'))]" 62 | }, 63 | "key_vault": { 64 | "value": "[toLower(parameters('key_vault'))]" 65 | }, 66 | "application_insights": { 67 | "value": "[tolower(parameters('application_insights'))]" 68 | }, 69 | "log_analytics_workspace": { 70 | "value": "[tolower(parameters('log_analytics_workspace'))]" 71 | }, 72 | "storage_account": { 73 | "value": 
"[tolower(parameters('storage_account'))]" 74 | } 75 | }, 76 | "template": { 77 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", 78 | "contentVersion": "1.0.0.0", 79 | "parameters": { 80 | "resource_group": { 81 | "type": "string" 82 | }, 83 | "key_vault": { 84 | "type": "String" 85 | }, 86 | "application_insights": { 87 | "type": "String" 88 | }, 89 | "databricks_workspace": { 90 | "type": "String" 91 | }, 92 | "storage_account": { 93 | "type": "String" 94 | }, 95 | "log_analytics_workspace": { 96 | "type": "String" 97 | } 98 | }, 99 | "variables": { 100 | "var_dbk_workspace_name": "[tolower(concat(resourceGroup().name, '-', parameters('databricks_workspace')))]", 101 | "var_dbk_managedResourceGroupName": "[tolower(concat(variables('var_dbk_workspace_name'), '-databricks-rg-', uniqueString(variables('var_dbk_workspace_name'), resourceGroup().id)))]", 102 | "var_dbk_managedResourceGroupId": "[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('var_dbk_managedResourceGroupName'))]", 103 | "var_storage": { 104 | "storageAccounts": { 105 | "name": "[tolower(concat(replace(parameters('resource_group'), '-', ''), parameters('storage_account')))]", 106 | // "name": "[replace(replace(tolower(parameters('storageAccountNamePrefix')), '-',''),'.','')]", 107 | "type": "Standard_LRS" 108 | } 109 | }, 110 | "var_application_insights": { 111 | "law_name": "[tolower(concat(parameters('resource_group'), '-', parameters('log_analytics_workspace')))]", 112 | "name": "[tolower(concat(parameters('resource_group'), '-', parameters('application_insights')))]" 113 | } 114 | }, 115 | "resources": [ 116 | { 117 | "type": "Microsoft.Databricks/workspaces", 118 | "apiVersion": "2018-04-01", 119 | "name": "[variables('var_dbk_workspace_name')]", 120 | "location": "[resourceGroup().location]", 121 | "sku": { 122 | "name": "standard" 123 | }, 124 | "properties": { 125 | "managedResourceGroupId": "[variables('var_dbk_managedResourceGroupId')]" 126 | // "authorizations": [ 127 | // { 128 | // "principalId": "9a74af6f-d153-4348-988a-e2672920bee9", 129 | // "roleDefinitionId": "8e3af657-a8ff-443c-a75c-2fe8c4bcb635" // Owner 130 | // } 131 | // ] 132 | // "createdBy": {}, 133 | // "updatedBy": {}, 134 | // "createdDateTime": "2021-04-09T09:29:22.5851863Z" 135 | } 136 | }, 137 | { 138 | "type": "Microsoft.KeyVault/vaults", 139 | "apiVersion": "2020-04-01-preview", 140 | "name": "[concat(resourceGroup().name, parameters('key_vault'))]", 141 | "location": "[resourceGroup().location]", 142 | "properties": { 143 | "sku": { 144 | "family": "A", 145 | "name": "Standard" 146 | }, 147 | "tenantId": "[subscription().tenantId]", 148 | "accessPolicies": [], 149 | "enabledForDeployment": false, 150 | "enabledForDiskEncryption": false, 151 | "enabledForTemplateDeployment": false, 152 | "enableSoftDelete": true, 153 | "softDeleteRetentionInDays": 90, 154 | "enableRbacAuthorization": false, 155 | "vaultUri": "[concat('https://', parameters('key_vault'), '.vault.azure.net/')]" 156 | } 157 | }, 158 | { 159 | "type": "microsoft.operationalinsights/workspaces", 160 | "apiVersion": "2020-10-01", 161 | "name": "[variables('var_application_insights').law_name]", 162 | "location": "[resourceGroup().location]", 163 | "properties": { 164 | "sku": { 165 | "name": "pergb2018" 166 | }, 167 | "retentionInDays": 30, 168 | "features": { 169 | "legacy": 0, 170 | "searchVersion": 1, 171 | "enableLogAccessUsingOnlyResourcePermissions": true 172 | }, 173 | "workspaceCapping": { 174 | 
"dailyQuotaGb": -1 175 | }, 176 | "publicNetworkAccessForIngestion": "Enabled", 177 | "publicNetworkAccessForQuery": "Enabled" 178 | } 179 | }, 180 | { 181 | "type": "Microsoft.Storage/storageAccounts", 182 | "apiVersion": "2021-01-01", 183 | "name": "[variables('var_storage').storageAccounts.name]", 184 | "location": "[resourceGroup().location]", 185 | "sku": { 186 | "name": "Standard_RAGRS", 187 | "tier": "Standard" 188 | }, 189 | "kind": "StorageV2", 190 | "properties": { 191 | "minimumTlsVersion": "TLS1_2", 192 | "allowBlobPublicAccess": true, 193 | "allowSharedKeyAccess": true, 194 | "isHnsEnabled": true, 195 | "networkAcls": { 196 | "bypass": "AzureServices", 197 | "virtualNetworkRules": [], 198 | "ipRules": [], 199 | "defaultAction": "Allow" 200 | }, 201 | "supportsHttpsTrafficOnly": true, 202 | "encryption": { 203 | "services": { 204 | "file": { 205 | "keyType": "Account", 206 | "enabled": true 207 | }, 208 | "blob": { 209 | "keyType": "Account", 210 | "enabled": true 211 | } 212 | }, 213 | "keySource": "Microsoft.Storage" 214 | }, 215 | "accessTier": "Hot" 216 | } 217 | }, 218 | { 219 | "type": "microsoft.insights/components", 220 | "apiVersion": "2020-02-02-preview", 221 | "name": "[variables('var_application_insights').name]", 222 | "location": "[resourceGroup().location]", 223 | "dependsOn": [ 224 | "[resourceId('microsoft.operationalinsights/workspaces', variables('var_application_insights').law_name)]" 225 | ], 226 | "kind": "web", 227 | "properties": { 228 | "Application_Type": "web", 229 | "Flow_Type": "Redfield", 230 | "Request_Source": "IbizaAIExtension", 231 | "WorkspaceResourceId": "[resourceId('microsoft.operationalinsights/workspaces', variables('var_application_insights').law_name)]", 232 | "IngestionMode": "LogAnalytics", 233 | "publicNetworkAccessForIngestion": "Enabled", 234 | "publicNetworkAccessForQuery": "Enabled" 235 | } 236 | }, 237 | { 238 | "type": "Microsoft.Storage/storageAccounts/blobServices", 239 | "apiVersion": "2021-01-01", 240 | "name": "[concat(variables('var_storage').storageAccounts.name, '/default')]", 241 | "dependsOn": [ 242 | "[resourceId('Microsoft.Storage/storageAccounts', variables('var_storage').storageAccounts.name)]" 243 | ], 244 | "sku": { 245 | "name": "Standard_RAGRS", 246 | "tier": "Standard" 247 | }, 248 | "properties": { 249 | "cors": { 250 | "corsRules": [] 251 | }, 252 | "deleteRetentionPolicy": { 253 | "enabled": false 254 | } 255 | } 256 | } 257 | // { 258 | // "type": "Microsoft.Storage/storageAccounts/fileServices", 259 | // "apiVersion": "2021-01-01", 260 | // "name": "[concat(variables('var_storage').storageAccounts.name, '/default')]", 261 | // "dependsOn": [ 262 | // "[resourceId('Microsoft.Storage/storageAccounts', variables('var_storage').storageAccounts.name)]" 263 | // ], 264 | // "sku": { 265 | // "name": "Standard_RAGRS", 266 | // "tier": "Standard" 267 | // }, 268 | // "properties": { 269 | // "protocolSettings": { 270 | // "smb": {} 271 | // }, 272 | // "cors": { 273 | // "corsRules": [] 274 | // }, 275 | // "shareDeleteRetentionPolicy": { 276 | // "enabled": true, 277 | // "days": 7 278 | // } 279 | // } 280 | // }, 281 | // { 282 | // "type": "Microsoft.Storage/storageAccounts/queueServices", 283 | // "apiVersion": "2021-01-01", 284 | // "name": "[concat(variables('var_storage').storageAccounts.name, '/default')]", 285 | // "dependsOn": [ 286 | // "[resourceId('Microsoft.Storage/storageAccounts', variables('var_storage').storageAccounts.name)]" 287 | // ], 288 | // "properties": { 289 | // "cors": { 290 | // 
"corsRules": [] 291 | // } 292 | // } 293 | // }, 294 | // { 295 | // "type": "Microsoft.Storage/storageAccounts/tableServices", 296 | // "apiVersion": "2021-01-01", 297 | // "name": "[concat(variables('var_storage').storageAccounts.name, '/default')]", 298 | // "dependsOn": [ 299 | // "[resourceId('Microsoft.Storage/storageAccounts', variables('var_storage').storageAccounts.name)]" 300 | // ], 301 | // "properties": { 302 | // "cors": { 303 | // "corsRules": [] 304 | // } 305 | // } 306 | // } 307 | ], 308 | "outputs": {} 309 | } 310 | } 311 | } 312 | ] 313 | } -------------------------------------------------------------------------------- /src/setup/config/setup_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "applicationID":"deeadfbxxxd39049b450", 4 | "tenantID":"72f988bxxxab-2dxxxx7cd011db47", 5 | "subscriptionID":"89c37xxxxxx98e0-1cfb98c0262e", 6 | "resourceGroupName":"acltrsapadbkmlops99", 7 | "resourceGroupLocation":"NorthEurope" 8 | } 9 | -------------------------------------------------------------------------------- /src/setup/configureResources.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Prereqisite : 3 | 4 | Service Principal must be granted subscription contributor permission 5 | 6 | 1. setup_config.json should be filled with the proper details. ( already done during the first script execution ) 7 | 2. appsecret.txt should be having the client secret of the service principal. ( already done during the first script execution ) 8 | 3..\vault\DBKtoken.txt file should be filled with the Databricks Personal Access token. 9 | 3. change the directory path in the command line to the project file path. 10 | cd C:\Users\......\MLOpsBasic-Databricks\src\setup 11 | 12 | 13 | Post Execution Step 14 | ================================= 15 | 1. create .env file in root with the details from the output of the execution 16 | 17 | PYTHONPATH=/workspaces/MLOpsBasic-Databricks/src/modules 18 | APPI_IK="7936xxxx8497696" 19 | DATABRICKS_HOST=https://adb-dapi398220xxxxxb066e49b7-2.XX.azuredatabricks.net/ 20 | DATABRICKS_TOKEN=793xxxx8497696 21 | DATABRICKS_ORDGID=53d000xxxxb-9634-ae6a9658c775 22 | 23 | 2. DATABRICKS_HOST=https://adb-dapi3982xxxx6e94657b066e49b7-2.XX.azuredatabricks.net/ ==> change the "XX" with the correct version from the databricks workspace URL. 24 | 25 | #> 26 | 27 | Write-Verbose "PSScriptRoot is: $PSScriptRoot" 28 | $rootPath = (Get-Item -Path $PSScriptRoot).FullName 29 | Write-Verbose "config file path: $rootPath\config\setup_config.json" 30 | $config = Get-Content -Raw -Path "$rootPath\config\setup_config.json" | ConvertFrom-Json 31 | 32 | 33 | # $rootPath = $config.rootDirectoryPath 34 | # cd $rootpath 35 | 36 | $applicationID=$config.applicationID 37 | $appsecret = Get-Content -Path "$rootPath\vault\appsecret.txt" 38 | $tenantId =$config.tenantID 39 | $subscriptionID = $config.subscriptionID 40 | $resourceGroupname = $config.resourceGroupName 41 | $resourceGroupLocation = $config.resourceGroupLocation 42 | 43 | 44 | $dbktoken = Get-Content -Path "$rootPath\vault\DBKtoken.txt" 45 | 46 | # DBK Cluster Creation 47 | cd $rootPath 48 | cd util 49 | $returnResult = .\Deploy-DBCluster.ps1 -ResourceGroupName $resourceGroupname -Location $resourceGroupLocation -BearerToken $dbktoken -Verbose 50 | $clusterID = $returnResult | select -Last 1 51 | 52 | cd.. 
53 | # Login to Databricks 54 | $resourceGroupLocation = $resourceGroupLocation.replace(' ','') 55 | 56 | 57 | $DBAPIRootUrl = "https://"+$resourceGroupLocation+".azuredatabricks.net" 58 | $DBAPIKey = $dbktoken 59 | 60 | [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12 61 | 62 | $ClustersAPIListUrl = $DBAPIRootUrl.Trim('/') + "/api/2.0/workspace/list" 63 | 64 | $headers = @{ 65 | Authorization = "Bearer $DBAPIKey" 66 | "Content-Type" = "application/json" 67 | } 68 | 69 | $Path= "/" 70 | $parameters = @{ 71 | path = $Path 72 | } 73 | 74 | $response = Invoke-WebRequest -Uri $ClustersAPIListUrl -Method GET -Headers $headers -Body $parameters 75 | $orgID = $response.Headers.'X-Databricks-Org-Id' 76 | 77 | $appInsightName = ((Get-AzApplicationInsights -ResourceGroupName $resourceGroupname) | Where-Object {$_.Name -eq $resourceGroupname+"-ai"}) 78 | $instrumentationKey = $appInsightName.InstrumentationKey 79 | 80 | 81 | $output = 'PYTHONPATH=/workspaces/dstoolkit-ml-ops-for-databricks/src/modules 82 | APPI_IK={0} 83 | DATABRICKS_HOST=https://adb-{1}.XX.azuredatabricks.net/ 84 | DATABRICKS_TOKEN={2} 85 | DATABRICKS_ORDGID={3}' -f $instrumentationKey, $orgID,$dbktoken,$orgID 86 | 87 | Write-Host $output 88 | -------------------------------------------------------------------------------- /src/setup/deployResources.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Prerequisite: 3 | Service Principal must be granted subscription contributor permission 4 | 5 | 1. setup_config.json should be filled with the proper details. 6 | 2. .\vault\appsecret.txt should contain the client secret of the service principal 7 | 3. Change the directory path in the command line to the project file path. 8 | cd C:\Users\......\MLOpsBasic-Databricks\src\setup 9 | #> 10 | 11 | Write-Verbose "PSScriptRoot is: $PSScriptRoot" 12 | $rootPath = (Get-Item -Path $PSScriptRoot).FullName 13 | Write-Verbose "config file path: $rootPath\config\setup_config.json" 14 | $config = Get-Content -Raw -Path "$rootPath\config\setup_config.json" | ConvertFrom-Json 15 | 16 | 17 | # $rootPath = $config.rootDirectoryPath 18 | # cd $rootpath 19 | 20 | $applicationID=$config.applicationID 21 | $appsecret = Get-Content -Path "$rootPath\vault\appsecret.txt" 22 | $tenantId =$config.tenantID 23 | $subscriptionID = $config.subscriptionID 24 | $resourceGroupname = $config.resourceGroupName 25 | $resourceGroupLocation = $config.resourceGroupLocation 26 | 27 | # Install modules.
28 | 29 | if (!(Get-Module -Name "Az.Accounts" -ListAvailable)){ 30 | Install-Module -Name "Az.Accounts" 31 | Import-Module -Name "Az.Accounts" 32 | 33 | } 34 | 35 | if (!(Get-Module -Name "Az.ApplicationInsights" -ListAvailable)){ 36 | Install-Module -Name "Az.ApplicationInsights" 37 | Import-Module -Name "Az.ApplicationInsights" 38 | 39 | } 40 | 41 | 42 | if (!(Get-Module -Name "Az.Databricks" -ListAvailable)){ 43 | Install-Module -Name "Az.Databricks" 44 | Import-Module -Name "Az.Databricks" 45 | 46 | } 47 | 48 | $PWord= ConvertTo-SecureString -String $appsecret -AsPlainText -Force 49 | $Credential1 = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $applicationID, $PWord 50 | $info= Connect-AzAccount -ServicePrincipal -Credential $Credential1 -TenantId $tenantid -Subscription $subscriptionID 51 | 52 | # Create the Resource Group 53 | if (!( Get-AzResourceGroup -Name $resourceGroupname -ErrorVariable notPresent -ErrorAction SilentlyContinue )){ 54 | New-AzResourceGroup -Location $resourceGroupLocation -Name $resourceGroupname 55 | } 56 | 57 | # Task 1: Deploy the Resource Group 58 | $templatefileLocation = $rootPath + "\arm-templates\template.json" 59 | 60 | # Task 2: Deploy the Resource 61 | $deploymentDetails = New-AzDeployment -Name "DBKadnResourceCreation" ` 62 | -Location $resourceGroupLocation -resource_group $resourceGroupname ` 63 | -TemplateFile $templatefileLocation ` 64 | -locationFromTemplate $resourceGroupLocation 65 | 66 | $deploymentDetails 67 | 68 | # if ($deploymentDetails.ProvisioningState -eq "Succeeded"){ 69 | # $dbkName = (Get-AzDatabricksWorkspace -ResourceGroupName "AccleratorDBKMLOps1").Name 70 | # $appInsightName = (Get-AzApplicationInsights -ResourceGroupName "AccleratorDBKMLOps1") 71 | # } 72 | 73 | -------------------------------------------------------------------------------- /src/setup/util/DBCluster-Configuration.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 1, 3 | "cluster_name": "devcluster", 4 | "spark_version": "7.3.x-cpu-ml-scala2.12", 5 | "spark_conf": { 6 | "spark.databricks.delta.preview.enabled": "true" 7 | }, 8 | "azure_attributes": { 9 | "first_on_demand": 1, 10 | "availability": "ON_DEMAND_AZURE", 11 | "spot_bid_max_price": -1 12 | }, 13 | "node_type_id": "Standard_DS3_v2", 14 | "driver_node_type_id": "Standard_DS3_v2", 15 | "ssh_public_keys": [], 16 | "custom_tags": {}, 17 | "spark_env_vars": {}, 18 | "autotermination_minutes": 20, 19 | "enable_elastic_disk": true, 20 | "cluster_source": "API", 21 | "init_scripts": [], 22 | "cluster_id": "0519-195053-tough408" 23 | } -------------------------------------------------------------------------------- /src/setup/util/Deploy-DBCluster.ps1: -------------------------------------------------------------------------------- 1 | 2 | param 3 | ( 4 | 5 | [Parameter(Position = 0, Mandatory = $True, HelpMessage = "Specify the ResourceGroupName.")] 6 | [String] $ResourceGroupName, 7 | [Parameter(Position = 1, Mandatory = $True, HelpMessage = "Specify the Location.")] 8 | [String] $Location, 9 | [Parameter(Position = 2, Mandatory = $True, HelpMessage = "Specify the BearerToken.")] 10 | [String] $BearerToken # TODO: This should come from DevOps task 11 | ) 12 | 13 | $VerbosePreference = 'Continue' 14 | 15 | Write-Verbose "PSScriptRoot is: $PSScriptRoot" 16 | $ScriptFolderPath = (Get-Item -Path $PSScriptRoot).FullName 17 | Write-Verbose "parameter file path: $ScriptFolderPath" 18 | 19 | $clusterFilePath = 
"$ScriptFolderPath\DBCluster-Configuration.json" 20 | 21 | $clusterId = $null 22 | $clusterName = (Get-Content -Path $clusterFilePath | ConvertFrom-Json).cluster_name 23 | $clusterDefintion = Get-Content -Path $clusterFilePath 24 | 25 | $resourceGroupLocation = $Location.replace(' ','') 26 | $DBAPIRootUrl = "https://"+$resourceGroupLocation+".azuredatabricks.net" 27 | $DBAPIKey = $BearerToken 28 | 29 | [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12 30 | 31 | $ClustersAPIListUrl = $DBAPIRootUrl.Trim('/') + "/api/2.0/clusters/list" 32 | 33 | $headers = @{ 34 | Authorization = "Bearer $DBAPIKey" 35 | "Content-Type" = "application/json" 36 | } 37 | 38 | $Path= "/" 39 | $parameters = @{ 40 | path = $Path 41 | } 42 | 43 | $response = Invoke-WebRequest -Uri $ClustersAPIListUrl -Method GET -Headers $headers -Body $parameters 44 | 45 | $responseObj = $response.Content | ConvertFrom-Json 46 | $clusterid = "" 47 | foreach ( $c in $responseObj.clusters){ 48 | if($c.cluster_name -eq $clusterName){ 49 | $clusterid = $c.cluster_id 50 | } 51 | } 52 | if($clusterid){ 53 | Write-Host "The cluster is already present" 54 | } 55 | else{ 56 | Write-Host "new cluster to be created" 57 | 58 | $ClustersAPIListUrl = $DBAPIRootUrl.Trim('/') + "/api/2.0/clusters/create" 59 | 60 | $headers = @{ 61 | Authorization = "Bearer $DBAPIKey" 62 | "Content-Type" = "application/json" 63 | } 64 | 65 | $Path= "/" 66 | $parameters = @{ 67 | path = $Path 68 | } 69 | 70 | $response = Invoke-WebRequest -Uri $ClustersAPIListUrl -Method POST -Headers $headers -Body $clusterDefintion 71 | $clusterid = ($response.Content|ConvertFrom-Json).cluster_id 72 | } 73 | return $clusterid -------------------------------------------------------------------------------- /src/setup/util/Deploy-DBCluster_using_CLI.ps1: -------------------------------------------------------------------------------- 1 | 2 | param 3 | ( 4 | 5 | [Parameter(Position = 0, Mandatory = $True, HelpMessage = "Specify the ResourceGroupName.")] 6 | [String] $ResourceGroupName, 7 | [Parameter(Position = 1, Mandatory = $True, HelpMessage = "Specify the Location.")] 8 | [String] $Location, 9 | [Parameter(Position = 2, Mandatory = $True, HelpMessage = "Specify the BearerToken.")] 10 | [String] $BearerToken # TODO: This should come from DevOps task 11 | ) 12 | 13 | #$Environment = "Dev" 14 | #$ResourceGroupName = "RS-DEV-WE-03" 15 | #$Location = "westeurope" 16 | #$BearerToken = "d" 17 | 18 | #$psISE.CurrentFile.FullPath 19 | 20 | # This switch needs to be enabled to print verbose messages 21 | $VerbosePreference = 'Continue' 22 | 23 | Write-Verbose "PSScriptRoot is: $PSScriptRoot" 24 | $ScriptFolderPath = (Get-Item -Path $PSScriptRoot).FullName 25 | Write-Verbose "parameter file path: $ScriptFolderPath" 26 | 27 | $clusterFilePath = "$ScriptFolderPath\DBCluster-Configuration.json" 28 | $clusterId = $null 29 | 30 | # Install Libraries 31 | python -m pip install --upgrade pip 32 | #python -m pip install wheel 33 | #python -m pip install setuptools 34 | python -m pip install databricks-cli 35 | 36 | #Removing the space from the Location is there is any 37 | 38 | $Location = $Location.replace(' ','') 39 | # Login to databricks 40 | @" 41 | https://$Location.azuredatabricks.net 42 | $BearerToken 43 | "@ | databricks configure --token 44 | 45 | # Create Interactive clusters 46 | # Check if the cluster exist 47 | $clusterName = (Get-Content -Path $clusterFilePath | ConvertFrom-Json).cluster_name 48 | $clusters = (databricks clusters list 
--output="JSON" | ConvertFrom-Json).clusters | Where-Object { $_.cluster_name -eq $clusterName } 49 | 50 | if ($null -ne $clusters) 51 | { 52 | $clusterId = $clusters.cluster_id 53 | } 54 | if($clusterid -ne $null){ 55 | Write-Verbose $clusterId 56 | } 57 | 58 | if ($clusterId) 59 | { 60 | Write-Verbose "Cluster already exist with ID $clusterId" 61 | if ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -ne "RUNNING") 62 | { 63 | Write-Verbose "Cluster state is terminated starting cluster: $clusterId" 64 | databricks clusters start --cluster-id $clusterId 65 | 66 | # Start the cluster and poll until its state changes to Running 67 | while ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -eq "PENDING") 68 | { 69 | Write-Verbose "Waiting for Databrick cluster id $($clusterId) to get started, sleep for 30 seconds" 70 | Start-Sleep -Seconds 30 71 | } 72 | 73 | if ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -eq "RUNNING") 74 | { 75 | Write-Verbose "Databrick cluster id $($clusterId) is now running" 76 | } 77 | else 78 | { 79 | Write-Verbose "Databrick cluster id $($clusterId) creation failed. exiting script" 80 | exit 81 | } 82 | } 83 | } 84 | else 85 | { 86 | #Create a fixed node cluster 87 | $clusterId = (databricks clusters create --json-file $clusterFilePath | ConvertFrom-Json).cluster_id 88 | if($clusterid -ne $null){ 89 | Write-Verbose "cluster id $clusterId" 90 | } 91 | 92 | while ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -eq "PENDING") 93 | { 94 | Write-Verbose "Waiting for Databrick cluster id $($clusterId) to created, sleep for 30 seconds" 95 | Start-Sleep -Seconds 30 96 | } 97 | 98 | if ((databricks clusters get --cluster-id $clusterId | ConvertFrom-Json).state -eq "RUNNING") 99 | { 100 | Write-Verbose "Databrick cluster id $($clusterId) is now running" 101 | } 102 | else 103 | { 104 | Write-Verbose "Databrick cluster id $($clusterId) creation failed. 
exiting script" 105 | exit 106 | } 107 | 108 | } 109 | return $clusterId 110 | -------------------------------------------------------------------------------- /src/tutorial/README.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | # Scripts 4 | 5 | ## Create cluster 6 | ```bash 7 | cd /workspaces/MLOpsBasic-Databricks/src/tutorial && \ 8 | python scripts/create_cluster.py -c cluster_config.json 9 | ``` 10 | 11 | ## Local configuration 12 | ```bash 13 | cd /workspaces/MLOpsBasic-Databricks/src/tutorial && \ 14 | python scripts/local_config.py -c cluster_config.json 15 | ``` 16 | 17 | ## Secrets configuration 18 | ```bash 19 | cd /workspaces/MLOpsBasic-Databricks/src/tutorial && \ 20 | python scripts/set_secrets.py -c cluster_config.json --scope 'test_scope' --secret_name 'test_scret_name' --secret_value 'test_secrete_value' 21 | ``` -------------------------------------------------------------------------------- /src/tutorial/cluster_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 2, 3 | "cluster_name": "devcluster", 4 | "spark_version": "7.3.x-cpu-ml-scala2.12", 5 | "spark_conf": { 6 | "spark.databricks.delta.preview.enabled": "true", 7 | "spark.sql.execution.arrow.enabled": "true" 8 | }, 9 | "node_type_id": "Standard_DS3_v2", 10 | "driver_node_type_id": "Standard_DS3_v2", 11 | "ssh_public_keys": [], 12 | "custom_tags": {}, 13 | "spark_env_vars": { 14 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 15 | }, 16 | "autotermination_minutes": 60, 17 | "enable_elastic_disk": true 18 | } -------------------------------------------------------------------------------- /src/tutorial/create_databricks_secrets.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dbkcore.core import Log 3 | from scripts import set_secrets 4 | 5 | appi_key_env = 'APPI_IK' 6 | set_secrets.main(scope='config', secret_name=appi_key_env, secret_value=os.environ[appi_key_env]) -------------------------------------------------------------------------------- /src/tutorial/deploy.py: -------------------------------------------------------------------------------- 1 | """Configure Databricks cluster.""" 2 | 3 | from pathlib import Path 4 | 5 | # import sys 6 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 7 | import os 8 | import json 9 | from dbkcore.core import Log 10 | from scripts import create_cluster 11 | from scripts import install_dbkframework 12 | from scripts import set_secrets 13 | from scripts import local_config 14 | import argparse 15 | 16 | Log(name=Path(__file__).stem) 17 | 18 | 19 | def command_exec(command, ignore=False): 20 | """ 21 | Execute shell command. 22 | 23 | Parameters 24 | ---------- 25 | command : str 26 | Command to execute 27 | ignore : bool, optional 28 | Ignore exception, by default False 29 | 30 | Raises 31 | ------ 32 | Exception 33 | Raises exception if command failes 34 | """ 35 | Log.get_instance().log_info(f'Running command -> {command}') 36 | if not ignore: 37 | if os.system(command) != 0: 38 | raise Exception(f'Failed to execute: {command}') 39 | 40 | 41 | def parse_args(args_list=None): 42 | """ 43 | Parse command line arguments. 
44 | 45 | Parameters 46 | ---------- 47 | args_list : [type], optional 48 | Argument list, by default None 49 | 50 | Returns 51 | ------- 52 | ArgumentParser 53 | Arguments parsed 54 | """ 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('-c', '--config_file', help="Full path of cluster's json configuration", type=str, required=True) 57 | args_parsed = parser.parse_args(args_list) 58 | return args_parsed 59 | 60 | 61 | def main(cluster_config_file): 62 | """ 63 | Execute the script. 64 | 65 | Parameters 66 | ---------- 67 | cluster_config_file : str 68 | Path of the configuration file 69 | 70 | Raises 71 | ------ 72 | Exception 73 | Raises when script failes 74 | """ 75 | appi_key_env = 'APPI_IK' 76 | 77 | create_cluster.main(cluster_config_file=cluster_config_file) 78 | local_config.main(cluster_config_file=cluster_config_file) 79 | set_secrets.main(scope='config', secret_name=appi_key_env, secret_value=os.environ[appi_key_env]) 80 | install_dbkframework.main(cluster_config_file=cluster_config_file) 81 | 82 | 83 | if __name__ == "__main__": 84 | args = parse_args() 85 | main(cluster_config_file=args.config_file) 86 | -------------------------------------------------------------------------------- /src/tutorial/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-ml-ops-for-databricks/506bb19578da05133ce3ddc4dd651d0ff0298172/src/tutorial/scripts/__init__.py -------------------------------------------------------------------------------- /src/tutorial/scripts/create_cluster.py: -------------------------------------------------------------------------------- 1 | """Create a cluster in Databricks.""" 2 | import sys 3 | from pathlib import Path 4 | 5 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 6 | import json 7 | from dbkenv.core import DatabricksResourceManager 8 | from dbkenv.core import Configuration 9 | from dbkenv.core import ResourceClient 10 | from dbkenv.core import Log 11 | import argparse 12 | 13 | 14 | 15 | 16 | Log(name=Path(__file__).stem) 17 | 18 | 19 | def parse_args(args_list=None): 20 | """ 21 | Parse command line arguments. 22 | 23 | Parameters 24 | ---------- 25 | args_list : [type], optional 26 | Argument list, by default None 27 | 28 | Returns 29 | ------- 30 | ArgumentParser 31 | Arguments parsed 32 | """ 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('-c', '--config_file', help="Full path of cluster's json configuration", type=str, required=True) 35 | args_parsed = parser.parse_args(args_list) 36 | return args_parsed 37 | 38 | 39 | def main(cluster_config_file): 40 | """ 41 | Execute the script. 
42 | 43 | Parameters 44 | ---------- 45 | cluster_config_file : str 46 | Path of the configuration file 47 | 48 | Raises 49 | ------ 50 | Exception 51 | Raises when script failes 52 | """ 53 | configuration = Configuration(file_load=True) 54 | # cluster_config_file = str(Path(__file__).parent.joinpath('unittest_cluster.json')) 55 | 56 | with open(cluster_config_file.strip(), 'r') as cl: 57 | cluster_configuration = json.load(cl) 58 | 59 | cluster_name = cluster_configuration['cluster_name'] 60 | # instantiate the logger 61 | 62 | client = ResourceClient( 63 | host=configuration.DATABRICKS_HOST, 64 | personal_token=configuration.DATABRICKS_TOKEN 65 | ) 66 | drm = DatabricksResourceManager( 67 | client=client, 68 | cluster_name=cluster_name, 69 | cluster_configuration=cluster_configuration 70 | ) 71 | 72 | drm.cluster.create_cluster_and_wait() 73 | 74 | 75 | if __name__ == "__main__": 76 | args = parse_args() 77 | main(cluster_config_file=args.config_file) 78 | -------------------------------------------------------------------------------- /src/tutorial/scripts/framework_testing/remote_analysis.py: -------------------------------------------------------------------------------- 1 | """Example of framework usage""" 2 | 3 | import random 4 | from acai_ml.core import Engine 5 | import pandas as pd 6 | from pydataset import data 7 | from pathlib import Path 8 | from dbkcore.core import trace 9 | from dbkcore.core import Log 10 | from dbkdev.data_steps import DataStep, DataStepDataframe 11 | from dbkdev.data_steps import apply_test 12 | from sklearn.model_selection import ParameterSampler 13 | from sklearn.utils.fixes import loguniform 14 | from pyspark.sql import functions as F 15 | import numpy as np 16 | from sklearn.model_selection import cross_val_score 17 | from sklearn import svm 18 | 19 | 20 | class Step_loadData(DataStep): 21 | """Load the defined dataset.""" 22 | 23 | def test(self): 24 | """Apply data tests.""" 25 | self.test_is_dataframe_empty(df=self.output_data.dataframe) 26 | self.test_null_values( 27 | cols=['Sepal.Length', 'Sepal.Width'], 28 | df=self.output_data.dataframe 29 | ) 30 | 31 | @apply_test 32 | @trace 33 | def initialize(self, name_dataset: str): 34 | """ 35 | Initialize the DataStep. 
36 | 37 | Parameters 38 | ---------- 39 | name_dataset : str 40 | Name of the dataset to load from pydataset package 41 | """ 42 | p_df = data(name_dataset) 43 | p_df.columns = [c.replace('.', '') for c in p_df.columns] 44 | dt = self.spark.createDataFrame(p_df) 45 | self.set_output_data(dt) 46 | 47 | 48 | class Step_crossValidate(DataStep): 49 | """Run multiple models in parallel.""" 50 | 51 | def test(self): 52 | pass 53 | 54 | @trace(attrs_refact=['appi_ik']) 55 | def initialize( 56 | self, 57 | dt: DataStepDataframe, 58 | pipeline_name: str, 59 | appi_ik: str, 60 | n_iter: int 61 | ): 62 | param_grid = { 63 | 'C': loguniform(1e0, 1e3), 64 | 'kernel': ['linear', 'rbf'], 65 | 'class_weight': ['balanced', None] 66 | } 67 | rng = np.random.RandomState(0) 68 | param_list = list( 69 | ParameterSampler( 70 | param_grid, 71 | n_iter=n_iter, 72 | random_state=rng 73 | ) 74 | ) 75 | # p_dt = Engine.get_instance().spark().createDataFrame(pd.DataFrame(param_list)).\ 76 | # withColumn('id', F.monotonically_increasing_id()) 77 | p_dt = self.spark.createDataFrame(pd.DataFrame(param_list)).\ 78 | withColumn('id', F.monotonically_increasing_id()) 79 | dt_train = dt.dataframe.crossJoin( 80 | p_dt 81 | ) 82 | 83 | udf_schema = dt_train.select( 84 | 'id', 85 | F.lit(0.0).alias('score') 86 | ).schema 87 | 88 | def pudf_train(dt_model): 89 | param_id = dt_model['id'].unique()[0] 90 | param_c = dt_model['C'].unique()[0] 91 | param_class_weight = dt_model['class_weight'].unique()[0] 92 | param_kernel = dt_model['kernel'].unique()[0] 93 | 94 | logging_custom_dimensions = { 95 | 'id': str(param_id), 96 | 'C': str(param_c), 97 | 'class_weight': param_class_weight, 98 | 'kernel': param_kernel 99 | } 100 | 101 | Log(pipeline_name, appi_ik) 102 | 103 | try: 104 | 105 | # Raising randomly exception 106 | if random.randint(0, 20) > 15: 107 | raise 'Random exception' 108 | 109 | dt_x = dt_model[ 110 | [ 111 | 'SepalLength', 112 | 'SepalWidth', 113 | 'PetalLength', 114 | 'PetalWidth' 115 | ] 116 | ] 117 | y = dt_model['Species'] 118 | clf = svm.SVC( 119 | kernel=param_kernel, 120 | C=param_c, 121 | class_weight=param_class_weight, 122 | random_state=42 123 | ) 124 | scores = cross_val_score(clf, dt_x, y, cv=5, scoring='f1_macro') 125 | score = scores.mean() 126 | dt_out = pd.DataFrame( 127 | { 128 | 'id': [param_id], 129 | 'score': [score] 130 | } 131 | ) 132 | Log.get_instance().log_info("Training:success", custom_dimension=logging_custom_dimensions) 133 | except Exception: 134 | Log.get_instance().log_error("Training:failed", custom_dimension=logging_custom_dimensions) 135 | dt_out = pd.DataFrame( 136 | { 137 | 'id': [param_id], 138 | 'score': [-1] 139 | } 140 | ) 141 | return dt_out 142 | 143 | ''' 144 | dt_model = dt_train.where(F.col('id') == 17179869184).toPandas() 145 | ''' 146 | dt_cross_evals = dt_train.\ 147 | groupBy(['id']).\ 148 | applyInPandas(pudf_train, schema=udf_schema).\ 149 | cache() 150 | dt_cross_evals.count() 151 | self.set_output_data(dt_cross_evals) 152 | 153 | 154 | Engine() 155 | Engine().get_instance().initialize_env() 156 | # pipeline_name = Path(__file__).stem 157 | pipeline_name = "Remote Testing" 158 | Engine().get_instance().initialize_logger(pipeline_name=pipeline_name) 159 | # Engine().get_instance().spark().conf.set("spark.sql.execution.arrow.enabled", "true") 160 | 161 | run_id = 'test_run_id' 162 | 163 | step_loadData = Step_loadData( 164 | spark=Engine.get_instance().spark(), 165 | run_id=run_id 166 | ) 167 | 168 | step_loadData.initialize( 169 | name_dataset='iris' 170 | ) 171 | 
172 | step_crossValidate = Step_crossValidate( 173 | spark=Engine.get_instance().spark(), 174 | run_id=run_id 175 | ) 176 | 177 | step_crossValidate.initialize( 178 | dt=step_loadData.output_data, 179 | pipeline_name=pipeline_name, 180 | appi_ik=Engine().get_instance().appi_ik, 181 | n_iter=1000 182 | ) 183 | 184 | step_crossValidate.output_data.dataframe.toPandas() 185 | -------------------------------------------------------------------------------- /src/tutorial/scripts/install_dbkframework.py: -------------------------------------------------------------------------------- 1 | """Build and installs the dbkframework.""" 2 | 3 | from pathlib import Path 4 | 5 | # import sys 6 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 7 | import os 8 | import json 9 | from dbkcore.core import Log 10 | from dbkenv.core import ResourceClient 11 | from dbkenv.core import Configuration 12 | from dbkenv.core import DatabricksResourceManager 13 | import argparse 14 | 15 | Log(name=Path(__file__).stem) 16 | 17 | 18 | def command_exec(command, ignore=False): 19 | """ 20 | Execute shell command. 21 | 22 | Parameters 23 | ---------- 24 | command : str 25 | Command to execute 26 | ignore : bool, optional 27 | Ignore exception, by default False 28 | 29 | Raises 30 | ------ 31 | Exception 32 | Raises exception if command failes 33 | """ 34 | Log.get_instance().log_info(f'Running command -> {command}') 35 | if not ignore: 36 | if os.system(command) != 0: 37 | raise Exception(f'Failed to execute: {command}') 38 | 39 | 40 | def parse_args(args_list=None): 41 | """ 42 | Parse command line arguments. 43 | 44 | Parameters 45 | ---------- 46 | args_list : [type], optional 47 | Argument list, by default None 48 | 49 | Returns 50 | ------- 51 | ArgumentParser 52 | Arguments parsed 53 | """ 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('-c', '--config_file', help="Full path of cluster's json configuration", type=str, required=True) 56 | args_parsed = parser.parse_args(args_list) 57 | return args_parsed 58 | 59 | 60 | def main(cluster_config_file): 61 | """ 62 | Execute the script. 
63 | 64 | Parameters 65 | ---------- 66 | cluster_config_file : str 67 | Path of the configuration file 68 | 69 | Raises 70 | ------ 71 | Exception 72 | Raises when script failes 73 | """ 74 | configuration = Configuration(file_load=True) 75 | with open(cluster_config_file.strip(), 'r') as cl: 76 | cluster_configuration = json.load(cl) 77 | 78 | cluster_name = cluster_configuration['cluster_name'] 79 | 80 | client = ResourceClient( 81 | host=configuration.DATABRICKS_HOST, 82 | personal_token=configuration.DATABRICKS_TOKEN 83 | ) 84 | drm = DatabricksResourceManager( 85 | client=client, 86 | cluster_name=cluster_name, 87 | cluster_configuration=cluster_configuration 88 | ) 89 | 90 | cluster_id = drm.cluster.cluster_id 91 | 92 | drm.cluster.start_cluster_and_wait() 93 | 94 | modules_to_deploy = [ 95 | 'dbkframework' 96 | ] 97 | 98 | pipelines_folder = Path(__file__).\ 99 | parent.\ 100 | parent.\ 101 | parent.\ 102 | absolute().\ 103 | joinpath('pipelines') 104 | 105 | for module in modules_to_deploy: 106 | 107 | package_folder = pipelines_folder.joinpath(module) 108 | dist_folder = package_folder.joinpath('dist') 109 | 110 | setup_file = package_folder.joinpath('setup.py') 111 | 112 | command_string = f"cd {str(package_folder)} && python {str(setup_file)} sdist bdist_wheel" 113 | res = os.system(command_string) 114 | 115 | if res != 0: 116 | raise Exception(f'Failed to build {module}') 117 | 118 | wheel = sorted([v for v in dist_folder.glob('*.whl')], key=lambda i: i.stat().st_ctime, reverse=True)[0] 119 | dbk_whl_name = wheel.name 120 | dbk_whl_root = 'dbfs:/FileStore/dev/artifacts/' 121 | dbk_whl_path = f'{dbk_whl_root}{dbk_whl_name}' 122 | 123 | command_exec(f'databricks fs rm {dbk_whl_root}', ignore=True) 124 | command_exec(f'databricks fs cp -r {wheel} {dbk_whl_path}') 125 | 126 | command_exec(f'databricks libraries uninstall --cluster-id {cluster_id} --whl {dbk_whl_path}') 127 | command_exec(f'databricks libraries install --cluster-id {cluster_id} --whl {dbk_whl_path}') 128 | 129 | command_exec(f'databricks clusters restart --cluster-id {cluster_id}') 130 | 131 | 132 | if __name__ == "__main__": 133 | args = parse_args() 134 | main(cluster_config_file=args.config_file) 135 | -------------------------------------------------------------------------------- /src/tutorial/scripts/local_config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 5 | import json 6 | from dbkcore.core import Log 7 | from dbkenv.core import ResourceClient 8 | from dbkenv.core import Configuration 9 | from dbkenv.core import DatabricksResourceManager 10 | from dbkenv.local import DatabricksLocal 11 | import argparse 12 | 13 | 14 | 15 | 16 | Log(name=Path(__file__).stem) 17 | 18 | 19 | def parse_args(args_list=None): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-c', '--config_file', help="Full path of cluster's json configuration", type=str, required=True) 22 | args_parsed = parser.parse_args(args_list) 23 | return args_parsed 24 | 25 | 26 | def main(cluster_config_file): 27 | 28 | configuration = Configuration(file_load=True) 29 | # cluster_config_file = str(Path(__file__).parent.joinpath('unittest_cluster.json')) 30 | 31 | with open(cluster_config_file.strip(), 'r') as cl: 32 | cluster_configuration = json.load(cl) 33 | 34 | cluster_name = cluster_configuration['cluster_name'] 35 | 36 | client = ResourceClient( 37 | 
host=configuration.DATABRICKS_HOST, 38 | personal_token=configuration.DATABRICKS_TOKEN 39 | ) 40 | drm = DatabricksResourceManager( 41 | client=client, 42 | cluster_name=cluster_name, 43 | cluster_configuration=cluster_configuration 44 | ) 45 | 46 | cluster_id = drm.cluster.cluster_id 47 | 48 | local_config = DatabricksLocal( 49 | host=configuration.DATABRICKS_HOST, 50 | databricks_token=configuration.DATABRICKS_TOKEN, 51 | cluster_id=cluster_id, 52 | org_id=configuration.DATABRICKS_ORDGID 53 | ) 54 | local_config.initialize() 55 | 56 | 57 | if __name__ == "__main__": 58 | args = parse_args() 59 | main(cluster_config_file=args.config_file) 60 | -------------------------------------------------------------------------------- /src/tutorial/scripts/set_secrets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add secret to Databricks. 3 | """ 4 | import sys 5 | from pathlib import Path 6 | 7 | # sys.path.append(str(Path(__file__).parent.parent.joinpath('modules'))) 8 | from dbkenv.core import Configuration 9 | from dbkenv.core import ResourceClient 10 | from dbkenv.core import Secret 11 | from dbkenv.core import Log 12 | import argparse 13 | 14 | 15 | Log(name=Path(__file__).stem) 16 | 17 | 18 | def parse_args(args_list=None): 19 | """ 20 | Parse command line arguments. 21 | 22 | Parameters 23 | ---------- 24 | args_list : [type], optional 25 | [description], by default None 26 | 27 | Returns 28 | ------- 29 | ArgumentParser 30 | Parsed arguments 31 | """ 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--scope', help="Scope to use", type=str, required=True) 34 | parser.add_argument('--secret_name', help="Name of the secret", type=str, required=True) 35 | parser.add_argument('--secret_value', help="Value of the secret", type=str, required=True) 36 | args_parsed = parser.parse_args(args_list) 37 | return args_parsed 38 | 39 | 40 | def main( 41 | scope: str, 42 | secret_name: str, 43 | secret_value: str 44 | ): 45 | """ 46 | Run main function. 47 | 48 | Parameters 49 | ---------- 50 | scope : str 51 | Scope to use 52 | secret_name : str 53 | Name of the secret 54 | secret_value : str 55 | Value of the secret 56 | """ 57 | configuration = Configuration(file_load=True) 58 | 59 | client = ResourceClient( 60 | host=configuration.DATABRICKS_HOST, 61 | personal_token=configuration.DATABRICKS_TOKEN 62 | ) 63 | secret_client = Secret( 64 | client=client 65 | ) 66 | 67 | scopes = secret_client.scopes() 68 | if scope not in scopes: 69 | secret_client.add_scope( 70 | scope=scope 71 | ) 72 | 73 | secret_client.add_secret( 74 | scope=scope, 75 | secret_name=secret_name, 76 | secret_value=secret_value 77 | ) 78 | 79 | 80 | if __name__ == "__main__": 81 | args = parse_args() 82 | main( 83 | scope=args.scope, 84 | secret_name=args.secret_name, 85 | secret_value=args.secret_value 86 | ) 87 | # main( 88 | # scope='test_scope', 89 | # secret_name='test_name', 90 | # secret_value='test_value' 91 | # ) 92 | -------------------------------------------------------------------------------- /workspace.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | } 6 | ], 7 | "settings": { 8 | "python.venvPath": "/usr/local/lib/python3.7/site-packages/pyspark/jars", 9 | "python.testing.pytestArgs": [ 10 | "src" 11 | ], 12 | "python.testing.unittestEnabled": false, 13 | "python.testing.nosetestsEnabled": false, 14 | "python.testing.pytestEnabled": true, 15 | "python.envFile": "${workspaceFolder}/.env", 16 | "python.analysis.extraPaths": [ 17 | "src/modules" 18 | ], 19 | "workbench.colorCustomizations": { 20 | "activityBar.activeBackground": "#93fcdc", 21 | "activityBar.activeBorder": "#fa45d4", 22 | "activityBar.background": "#93e6fc", 23 | "activityBar.foreground": "#15202b", 24 | "activityBar.inactiveForeground": "#15202b99", 25 | "activityBarBadge.background": "#fa45d4", 26 | "activityBarBadge.foreground": "#15202b", 27 | "statusBar.background": "#93fcdc", 28 | "statusBar.foreground": "#15202b", 29 | "statusBarItem.hoverBackground": "#2fd0fa", 30 | "titleBar.activeBackground": "#93fcdc", 31 | "titleBar.activeForeground": "#15202b", 32 | "titleBar.inactiveBackground": "#93fcdc99", 33 | "titleBar.inactiveForeground": "#15202b99" 34 | }, 35 | "peacock.remoteColor": "#93fcdc" 36 | }, 37 | "extensions": { 38 | "recommendations": [ 39 | "ms-python.python", 40 | "visualstudioexptteam.vscodeintellicode", 41 | "ms-python.vscode-pylance", 42 | "ms-azuretools.vscode-docker", 43 | "ms-vscode-remote.remote-containers" 44 | ] 45 | } 46 | } --------------------------------------------------------------------------------
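For reference, the tutorial assets above can also be driven end to end through `src/tutorial/deploy.py`, which chains `create_cluster`, `local_config`, `set_secrets` (storing the `APPI_IK` Application Insights key in the `config` scope), and `install_dbkframework`. The snippet below is only an illustrative sketch, not a file from the repository: it assumes the Databricks settings read by `Configuration(file_load=True)` (such as `DATABRICKS_HOST` and `DATABRICKS_TOKEN`) are already available in your environment, and the `APPI_IK` value shown is a placeholder.

```bash
# Illustrative sketch: run the tutorial deployment end to end.
# Assumes the DATABRICKS_HOST / DATABRICKS_TOKEN (and org id) values consumed by
# Configuration(file_load=True) are already configured; APPI_IK is a placeholder.
export APPI_IK="<application-insights-instrumentation-key>"

cd /workspaces/MLOpsBasic-Databricks/src/tutorial && \
python deploy.py -c cluster_config.json
```

The `-c` argument points at the same `cluster_config.json` used by the individual commands in the tutorial README.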