├── .azure_devops └── workflows │ ├── 1-master-pipelines │ ├── cicd-pipeline.yaml │ └── manual-pipeline.yaml │ ├── 2-jobs-pipelines │ └── job-deployment.yaml │ └── 3-steps-pipelines │ ├── step-deployment.yaml │ └── step-pr-tests.yaml ├── .dbx ├── lock.json └── project.json ├── .github └── workflows │ ├── onDeploy.yaml │ ├── onRelease.yaml │ └── taskDatabricks.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── New Text Document.txt ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── dataops └── src_nyc_taxi │ ├── data_quality.py │ └── transform.py ├── docs ├── README.md └── images │ ├── AppInsightConnectionString.jpg │ ├── AzureResources.JPG │ ├── Azure_Machine_Learning_GIF.gif │ ├── DatabricksNotebookExecution.JPG │ ├── DatabricksORGIDandHOSTID.JPG │ ├── DatabricksTokenGeneration.jpg │ ├── DevContainer.jpg │ ├── DockerImageLoad.jpg │ ├── InstallExtensions.jpg │ ├── MLOps_for_databricks_Solution_Acclerator_logo.JPG │ ├── OutputOfTheConfigurationStep.jpg │ ├── Overview.JPG │ ├── PipelineSteps.JPG │ ├── PowershellScreen.jpg │ ├── SecretsFileImage.jpg │ ├── SuccessfulClusterCreation.JPG │ ├── Verify_Python_Interpreter.jpg │ ├── YoutubeThumbNail.png │ ├── cluster-upload-wheel.jpg │ ├── databricks-connect-pass.jpg │ ├── dstoolitgif.gif │ ├── final.jpg │ ├── map01.png │ ├── map02.png │ ├── map03.png │ ├── map04.png │ ├── map05.png │ ├── map06.png │ ├── map07.png │ ├── pythonversion.jpg │ └── workspaceselection.jpg ├── experiments ├── notebooks │ └── ciaran_experiments │ │ └── nyc_taxi │ │ └── nyc_taxi_lgbm_1.py └── pipelines │ └── ciaran_experiments │ ├── workflow.yaml │ └── workflow_configs │ ├── featurization.yaml │ ├── training.yaml │ └── workflow_params.yaml ├── infrastructure ├── bicep │ ├── az_templates │ │ ├── az_app_insights │ │ │ └── az_app_insights.bicep │ │ ├── az_data_lake │ │ │ └── az_data_lake.bicep │ │ ├── az_databricks │ │ │ └── az_databricks.bicep │ │ ├── az_key_vault │ │ │ └── az_key_vault.bicep │ │ └── az_machine_learning │ │ │ └── az_machine_learning.bicep │ ├── main.bicep │ └── params │ │ ├── development │ │ └── bicep.parameters.json │ │ ├── production │ │ └── bicep.parameters.json │ │ ├── sandbox │ │ └── bicep.parameters.json │ │ └── uat │ │ └── bicep.parameters.json └── databricks │ └── databricks_configs │ ├── development │ ├── clusters.json │ ├── rbac.json │ └── repos.json │ ├── production │ ├── clusters.json │ ├── rbac.json │ └── repos.json │ ├── sandbox │ ├── clusters.json │ ├── rbac.json │ └── repos.json │ └── uat │ ├── clusters.json │ ├── rbac.json │ └── repos.json ├── mlops └── nyc_taxi │ ├── aml_pipelines │ ├── v1 │ │ └── nyc_pipeline.py │ └── v2 │ │ └── dontdelete │ │ ├── databricks │ │ └── listclusters.py │ │ ├── dependencies │ │ └── conda.yaml │ │ └── pipelines │ │ └── databricks.ipynb │ ├── databricks_workflows │ ├── nyc_taxi.yaml │ └── unit_tests.yaml │ └── monitoring │ ├── data_drift_monitor.py │ ├── mflow_experiment_dashboard_pbi.py │ └── model_serving_monitor.py ├── pyproject.toml ├── score.py ├── setup.ps1 ├── src └── pkg │ ├── dbx_utils │ ├── __init__.py │ ├── common.py │ ├── utils_azure_login.py │ ├── utils_azure_login.sh │ ├── utils_create_aad_tokens.py │ ├── utils_create_aad_tokens.sh │ ├── utils_create_azure_resources.py │ ├── utils_create_azure_resources.sh │ ├── utils_create_cluster.py │ ├── utils_create_databricks_token.sh │ ├── utils_create_key_vault_secrets.sh │ ├── utils_create_repo_folder.py │ ├── utils_create_repo_folder.sh │ ├── utils_create_role_based_access.sh │ ├── utils_create_secret_scopes.py │ ├── utils_create_secret_scopes.sh │ ├── 
utils_git_configuration.py │ ├── utils_repo_pull.py │ ├── utils_repo_pull.sh │ └── utils_set_env_vars.sh │ ├── nyc_taxi │ ├── build │ │ └── lib │ │ │ ├── common │ │ │ └── __init__.py │ │ │ ├── evaluation │ │ │ └── __init__.py │ │ │ ├── featurization │ │ │ └── __init__.py │ │ │ ├── prediction │ │ │ └── __init__.py │ │ │ ├── registration │ │ │ └── __init__.py │ │ │ └── training │ │ │ └── __init__.py │ ├── common │ │ └── __init__.py │ ├── dist │ │ ├── src_nyc_taxi-0.0.1-py3-none-any.whl │ │ └── src_nyc_taxi-0.0.1.tar.gz │ ├── entrypoint.py │ ├── evaluation │ │ └── __init__.py │ ├── featurization │ │ └── __init__.py │ ├── prediction │ │ └── __init__.py │ ├── pyproject.toml │ ├── registration │ │ └── __init__.py │ ├── setup.cfg │ ├── setup.py │ ├── src_nyc_taxi.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── requires.txt │ │ └── top_level.txt │ └── training │ │ └── __init__.py │ └── wine_quality │ ├── combined_wine_data.csv │ ├── wine_quality.py │ └── winedata.csv └── test ├── entrypoint.py └── test_dbx_utils_pkg ├── test_utils_azure_login.py ├── test_utils_create_azure_resources.py ├── test_utils_create_cluster.py ├── test_utils_create_repo_folder.py └── test_utils_repo_pull.py /.azure_devops/workflows/1-master-pipelines/cicd-pipeline.yaml: -------------------------------------------------------------------------------- 1 | # UPDATES PENDING - MAY NOT WORK 2 | 3 | 4 | name: Databricks Deployment 5 | 6 | trigger: 7 | branches: 8 | include: 9 | - main 10 | - dev 11 | #- features/* 12 | - releases/* 13 | paths: 14 | exclude: 15 | - README.md 16 | 17 | #pr: none 18 | 19 | pool: 20 | vmImage: 'ubuntu-latest' 21 | 22 | 23 | #Secrets 24 | variables: 25 | - group: ADO-Secrets 26 | - name: isPR 27 | value: $[startsWith(variables['Build.SourceBranch'], 'refs/pull/')] 28 | 29 | - name: isMain 30 | value: $[eq(variables['Build.SourceBranch'], 'refs/heads/main')] 31 | 32 | - name: isPRFeatureOnMain 33 | value: $[eq(variables['System.PullRequest.SourceBranch'], 'features/*')] 34 | 35 | 36 | #resources: 37 | # repositories: 38 | # - repository: self 39 | # ref: 'refs/heads/$(branchName)' 40 | 41 | # PROTECT THE MAIN BRANCH SO YOU CANT PUSH DIRECTLY TO IT 42 | stages: 43 | - stage: developmentDeploy 44 | condition: and(eq(variables['Build.SourceBranchName'], 'dev'), eq(variables['Build.Reason'], 'IndividualCI')) 45 | displayName: developmentDeploy 46 | jobs: 47 | - template: ..\2-jobs-pipelines\job-deployment.yaml 48 | parameters: 49 | environment: development 50 | azureSubscription: DBX_ADO_DSTOOLKIT 51 | branchName: $(Build.SourceBranchName) 52 | 53 | # This will deploy code on the source branch for the PR. If PR from Feature to Dev, then this will deploy Feature. 
54 | - stage: pullRequestChecks 55 | condition: and(startsWith(variables['system.pullRequest.sourceBranch'], 'features/'), eq(variables['system.pullRequest.targetBranch'], 'dev')) 56 | displayName: pullRequestChecks 57 | jobs: 58 | - template: ..\3-steps-pipelines\step-pr-tests.yaml 59 | parameters: 60 | environment: development 61 | azureSubscription: DBX_ADO_DSTOOLKIT 62 | branchName: $(Build.SourceBranchName) 63 | 64 | 65 | # Change To Main Branch --> Deploy To Test Environment 66 | - stage: uatDeploy 67 | displayName: uatDeploy 68 | condition: and(eq(variables['Build.SourceBranchName'], 'main'), eq(variables['Build.Reason'], 'IndividualCI')) 69 | jobs: 70 | - template: ..\2-jobs-pipelines\job-deployment.yaml 71 | parameters: 72 | environment: uat 73 | azureSubscription: DBX_ADO_DSTOOLKIT 74 | branchName: $(Build.SourceBranchName) 75 | 76 | 77 | - stage: pullRequestChecks_dev_to_main 78 | condition: and(eq(variables['system.pullRequest.sourceBranch'], 'dev'), eq(variables['system.pullRequest.targetBranch'], 'main')) 79 | displayName: pullRequestChecks 80 | jobs: 81 | - template: ..\3-steps-pipelines\step-pr-tests.yaml 82 | parameters: 83 | environment: uat 84 | azureSubscription: DBX_ADO_DSTOOLKIT 85 | branchName: $(Build.SourceBranchName) 86 | 87 | 88 | # Tag Release Branch --> Deploy To Production Environment 89 | # - stage: ProductionDeploy 90 | # displayName: ProductionDeploy 91 | # condition: and(startsWith(variables['Build.SourceBranch'], 'refs/tags/v'), eq(variables['Build.Reason'], 'IndividualCI')) 92 | # jobs: 93 | # - template: ..\2-Jobs\Job-Databricks.yaml 94 | # parameters: 95 | # Environment: Production 96 | # azureSubscription: DBX_ADO_DSTOOLKIT 97 | # enableRepoPull: true 98 | # branchName: $(Build.SourceBranchName) 99 | # updateFolder: DevelopmentFolder 100 | 101 | 102 | # Implement a condition to ignore a azure resource deployment if Infra folder is unchanged. 
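# A minimal sketch (an assumption, not part of the original pipeline) of how the
# condition above could be implemented. job-deployment.yaml already checks out with
# fetchDepth: 2, so a script step can diff the last two commits, flag whether anything
# under infrastructure/ changed, and expose that as an output variable for later steps
# or stages to gate on. The names infra_check and INFRA_CHANGED are illustrative only.
#
# - script: |
#     if git diff HEAD~1 HEAD --name-only | grep -q '^infrastructure/'; then
#       echo "##vso[task.setvariable variable=INFRA_CHANGED;isOutput=true]true"
#     else
#       echo "##vso[task.setvariable variable=INFRA_CHANGED;isOutput=true]false"
#     fi
#   name: infra_check
#   displayName: Detect infrastructure changes
#
# A later step in the same job could then be gated with:
#   condition: eq(variables['infra_check.INFRA_CHANGED'], 'true')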
103 | # https://pumpingco.de/blog/run-an-azure-pipelines-job-only-if-source-code-has-changed/ 104 | -------------------------------------------------------------------------------- /.azure_devops/workflows/1-master-pipelines/manual-pipeline.yaml: -------------------------------------------------------------------------------- 1 | name: Service Principal MLOps Databricks Deployment 2 | 3 | trigger: none 4 | pr: none 5 | 6 | pool: 7 | vmImage: ubuntu-latest 8 | 9 | variables: 10 | - group: ADO-Secrets 11 | 12 | parameters: 13 | - name: environment 14 | displayName: Choose Environment 15 | type: string 16 | default: sandbox 17 | values: 18 | - sandbox 19 | - development 20 | - uat 21 | - production 22 | - all 23 | 24 | - name: azureSubscription 25 | displayName: Enter Service Connection Name 26 | default: DBX_ADO_DSTOOLKIT 27 | type: string 28 | 29 | stages: 30 | - stage: sandboxDeployment 31 | condition: or( eq('${{ parameters.environment }}', 'sandbox'), eq('${{ parameters.environment }}', 'all')) 32 | displayName: sandboxDeployment 33 | jobs: 34 | - template: ..\2-jobs-pipelines\job-deployment.yaml 35 | parameters: 36 | environment: sandbox 37 | azureSubscription: ${{ parameters.azureSubscription }} 38 | branchName: main 39 | 40 | - stage: developmentDeployment 41 | condition: or( eq('${{ parameters.environment }}', 'development'), eq('${{ parameters.environment }}', 'all')) 42 | displayName: developmentDeployment 43 | dependsOn: [] 44 | jobs: 45 | - template: ..\2-jobs-pipelines\job-deployment.yaml 46 | parameters: 47 | environment: development 48 | azureSubscription: ${{ parameters.azureSubscription }} 49 | branchName: main 50 | 51 | - stage: uatDeployment 52 | condition: or( eq('${{ parameters.ENVIRONMENT }}', 'uat'), eq('${{ parameters.ENVIRONMENT }}', 'all')) 53 | displayName: uatDeployment 54 | dependsOn: [] 55 | jobs: 56 | - template: ..\2-jobs-pipelines\job-deployment.yaml 57 | parameters: 58 | environment: uat 59 | azureSubscription: ${{ parameters.azureSubscription }} 60 | releaseBranch: 'release/1' 61 | 62 | 63 | -------------------------------------------------------------------------------- /.azure_devops/workflows/2-jobs-pipelines/job-deployment.yaml: -------------------------------------------------------------------------------- 1 | parameters: 2 | environment: String 3 | azureSubscription: String 4 | branchName: String 5 | 6 | jobs: 7 | - deployment: databricks_mlops_${{ parameters.environment }} 8 | displayName: databricks_mlops_${{ parameters.environment }} 9 | variables: 10 | - name: PYSPARK_PYTHON 11 | value: python3.9 12 | environment: ${{ parameters.environment }} 13 | strategy: 14 | runOnce: 15 | deploy: 16 | steps: 17 | - checkout: self 18 | fetchDepth: 2 19 | #ref: ${{ parameters.branchName }} 20 | # Paramount for fetchDepth to 2 for Git File Changes Check 21 | - template: ../3-steps-pipelines/step-deployment.yaml 22 | parameters: 23 | azureSubscription: ${{ parameters.azureSubscription }} 24 | environment: ${{ parameters.environment }} 25 | branchName: ${{ parameters.branchName }} 26 | -------------------------------------------------------------------------------- /.azure_devops/workflows/3-steps-pipelines/step-pr-tests.yaml: -------------------------------------------------------------------------------- 1 | parameters: 2 | azureSubscription: String 3 | environment: String 4 | branchName: String 5 | 6 | steps: 7 | 8 | - task: UsePythonVersion@0 9 | inputs: 10 | versionSpec: '3.8' 11 | architecture: 'x64' 12 | 13 | - script: | 14 | sudo apt update && sudo apt 
install jq -y 15 | python -m pip install requests python-dotenv poetry databricks-cli setuptools 16 | python -m pip install azure-cli==2.49.0 azure-mgmt-storage==21.0.0 17 | az extension add -n azure-cli-ml 18 | #python -m pip install azureml azureml-core azureml-pipeline 19 | displayName: Install Packages 20 | 21 | - script: | 22 | az config set extension.use_dynamic_install=yes_without_prompt 23 | az extension add --name databricks 24 | displayName: Configure Azure CLI 25 | 26 | - script: | 27 | python -m poetry install 28 | displayName: 'Install Testing Requirements locally' 29 | 30 | - bash: | 31 | mkdir -p tests 32 | displayName: 'Create Unit Test Directory' 33 | 34 | #- script: | 35 | # python -m poetry run pylint --output-format=pylint_junit.JUnitReporter:tests/testresults.xml src/pkg/dbx_utils 36 | # displayName: 'Pylinting dbx_utils' 37 | 38 | - script: | 39 | python -m poetry run bandit -rv src/pkg/dbx_utils/ 40 | displayName: 'Security Checks Bandit' 41 | continueOnError: true 42 | 43 | 44 | - task: AzureCLI@2 45 | displayName: Generate AAD Tokens 46 | name: aad_tokens 47 | continueOnError: true 48 | inputs: 49 | azureSubscription: ${{ parameters.azureSubscription }} 50 | scriptType: bash 51 | scriptLocation: scriptPath 52 | scriptPath: $(Build.SourcesDirectory)/src/pkg/dbx_utils/utils_create_aad_tokens.sh 53 | env: 54 | DBX_RESOURCE_ID: 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d 55 | 56 | - script: | 57 | python -m poetry run python $(Build.SourcesDirectory)/src/pkg/dbx_utils/utils_azure_login.py 58 | displayName: Azure Login 59 | continueOnError: true 60 | env: 61 | ARM_CLIENT_ID : $(ARM_CLIENT_ID) 62 | ARM_TENANT_ID: $(ARM_TENANT_ID) 63 | ARM_CLIENT_SECRET: $(ARM_CLIENT_SECRET) 64 | 65 | 66 | - task: AzureCLI@2 67 | displayName: Set Environment Variables 68 | name: "env_variables" 69 | inputs: 70 | scriptType: bash 71 | scriptLocation: scriptPath 72 | azureSubscription: ${{ parameters.azureSubscription }} 73 | scriptPath: $(Build.SourcesDirectory)/src/pkg/dbx_utils/utils_set_env_vars.sh 74 | env: 75 | ENVIRONMENT: ${{ parameters.environment }} 76 | DevOps_Agent: "Azure DevOps Agent" 77 | 78 | 79 | - script: | 80 | set -e 81 | python -m poetry run dbx configure 82 | 83 | python -m poetry run dbx execute DatabricksUtilsTesting \ 84 | --deployment-file mlops/nyc_taxi/databricks_workflows/unit_tests.yaml \ 85 | --cluster-name=ml_cluster 86 | 87 | databricks fs cp dbfs:/FileStore/databricks_utils_unit_testresults.xml $(Build.ArtifactStagingDirectory)/databricks_utils_unit_testresults.xml 88 | 89 | databricks fs cp dbfs:/FileStore/databricks_utils_cov_report.xml $(Build.ArtifactStagingDirectory)/databricks_utils_cov_report.xml 90 | 91 | displayName: Unit Testing - DBX Cluster 92 | env: 93 | DATABRICKS_TOKEN: $(aad_tokens.DATABRICKS_AAD_TOKEN) 94 | DATABRICKS_HOST: $(env_variables.DATABRICKS_HOST) 95 | 96 | - task: PublishTestResults@2 97 | inputs: 98 | testResultsFormat: 'JUnit' 99 | testResultsFiles: '$(Build.ArtifactStagingDirectory)/*_testresults.xml' 100 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version) - Unit Test results' 101 | condition: succeededOrFailed() 102 | displayName: 'Publish Unit Test Results' 103 | 104 | - task: PublishTestResults@2 105 | inputs: 106 | testResultsFormat: 'JUnit' 107 | testResultsFiles: '**/*_testresults.xml' 108 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version) - Linting Test results' 109 | condition: succeededOrFailed() 110 | displayName: 'Publish Linting Test 
Results' 111 | 112 | 113 | - task: PublishCodeCoverageResults@1 114 | inputs: 115 | codeCoverageTool: Cobertura 116 | summaryFileLocation: '$(Build.ArtifactStagingDirectory)/*_cov_report.xml' 117 | displayName: 'Publish Coverage Results' 118 | condition: succeededOrFailed() 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /.dbx/lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "context_id": "8789636436477646098" 3 | } -------------------------------------------------------------------------------- /.dbx/project.json: -------------------------------------------------------------------------------- 1 | { 2 | "environments": { 3 | "default": { 4 | "profile": "DEFAULT", 5 | "storage_type": "mlflow", 6 | "properties": { 7 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 8 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 9 | } 10 | }, 11 | "sandbox": { 12 | "profile": "sandbox", 13 | "storage_type": "mlflow", 14 | "properties": { 15 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 16 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 17 | } 18 | }, 19 | "development": { 20 | "profile": "development", 21 | "storage_type": "mlflow", 22 | "properties": { 23 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 24 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 25 | } 26 | }, 27 | "uat": { 28 | "profile": "uat", 29 | "storage_type": "mlflow", 30 | "properties": { 31 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 32 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 33 | } 34 | }, 35 | "production": { 36 | "profile": "production", 37 | "storage_type": "mlflow", 38 | "properties": { 39 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 40 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 41 | } 42 | }, 43 | "ciaran_sandbox": { 44 | "profile": "ciaran_sandbox", 45 | "workspace_dir": "/Shared/ciaran_sandbox", 46 | "artifact_location": "dbfs:/Shared/cicd_workflows/ciaran_sandbox" 47 | } 48 | }, 49 | "inplace_jinja_support": true, 50 | "failsafe_cluster_reuse_with_assets": false, 51 | "context_based_upload_for_execute": false 52 | } -------------------------------------------------------------------------------- /.github/workflows/onDeploy.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: onDeploy Databricks 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | script_language: 7 | type: choice 8 | description: Python or Bash (Databricks API) 9 | options: 10 | - python 11 | - bash 12 | default: python 13 | 14 | jobs: 15 | ReuseableMatrixJobForDeployment: 16 | name: Master Deployment 17 | strategy: 18 | matrix: 19 | targetEnvironment: [ sandbox ] # development, uat, production 20 | uses: ./.github/workflows/taskDatabricks.yaml 21 | with: 22 | ENVIRONMENT: ${{ matrix.targetEnvironment }} 23 | DBX_REPO_BRANCH: 'main' 24 | SCRIPT_LANGUAGE: ${{ github.event.inputs.script_language }} 25 | DevOps_Agent: GitHub 26 | secrets: 27 | ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} 28 | ARM_CLIENT_SECRET: ${{ secrets.ARM_CLIENT_SECRET }} 29 | ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} 30 | PAT_GITHUB: ${{ secrets.PAT_GITHUB }} 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- 
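onDeploy.yaml above is triggered manually (workflow_dispatch) with a single script_language input. Assuming the GitHub CLI is installed and authenticated against this repository, a run could be dispatched with something along these lines (the target branch here is an assumption):

    gh workflow run onDeploy.yaml --ref main -f script_language=python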
/.github/workflows/onRelease.yaml: -------------------------------------------------------------------------------- 1 | 2 | 3 | name: onRelease Databricks 4 | on: 5 | pull_request: 6 | branches: 7 | - main 8 | - feature/** 9 | - release/** 10 | tags: 11 | - 'v**' 12 | types: 13 | - opened 14 | - closed 15 | 16 | jobs: 17 | pr_CI_Development: 18 | 19 | if: github.event_name == 'pull_request' && github.event.action == 'opened' && github.base_ref == 'main' 20 | name: Checks 21 | runs-on: ubuntu-latest 22 | steps: 23 | - run: | 24 | echo "Insert Continuous Integration Tests" 25 | 26 | # IMPORTANT: The testing framework is not yet implemented, and therefore still under development. 27 | 28 | #cd mlOps/devOps/utils 29 | 30 | #python -m pytest -v 31 | 32 | prApproved_CD_Development: 33 | if: github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged == true && contains(github.head_ref, 'feature') && github.base_ref == 'main' 34 | uses: ./.github/workflows/taskDatabricks.yaml 35 | with: 36 | ENVIRONMENT: development 37 | DBX_REPO_BRANCH: main 38 | SCRIPT_LANGUAGE: python 39 | DevOps_Agent: GitHub 40 | secrets: 41 | ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} 42 | ARM_CLIENT_SECRET: ${{ secrets.ARM_CLIENT_SECRET }} 43 | ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} 44 | PAT_GITHUB: ${{ secrets.PAT_GITHUB }} 45 | 46 | pr_CI_UAT: 47 | 48 | if: github.event_name == 'pull_request' && github.event.action == 'opened' && contains(github.base_ref, 'release') 49 | name: Checks 50 | runs-on: ubuntu-latest 51 | steps: 52 | - run: | 53 | echo "Insert Continuous Integration Tests" 54 | - run: | 55 | echo "${{ github.head_ref }}" 56 | echo "${{ github.base_ref }}" 57 | 58 | prApproved_CD_UAT: 59 | if: github.event_name == 'pull_request' && github.event.action == 'closed' && github.head_ref == 'main' && contains(github.base_ref, 'release') 60 | uses: ./.github/workflows/taskDatabricks.yaml 61 | with: 62 | ENVIRONMENT: uat 63 | DBX_REPO_BRANCH: 'release/1' 64 | SCRIPT_LANGUAGE: python 65 | DevOps_Agent: GitHub 66 | secrets: 67 | ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} 68 | ARM_CLIENT_SECRET: ${{ secrets.ARM_CLIENT_SECRET }} 69 | ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} 70 | PAT_GITHUB: ${{ secrets.PAT_GITHUB }} 71 | 72 | 73 | # git tag -a v1.0.1 -m "my version 1.0.1" 74 | # git tag -l 75 | # git push origin v1.0.1 76 | # git tag -d v1.0.1 77 | # ( Create Security Rule That Only Allow Release Branch to be Tagged). 
78 | 79 | prApproved_CD_Production: 80 | if: ${{ startsWith(github.ref, 'refs/tags/v') }} 81 | uses: ./.github/workflows/taskDatabricks.yaml 82 | with: 83 | ENVIRONMENT: production 84 | DBX_REPO_BRANCH: 'release/1' 85 | SCRIPT_LANGUAGE: python 86 | DevOps_Agent: GitHub 87 | secrets: 88 | ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} 89 | ARM_CLIENT_SECRET: ${{ secrets.ARM_CLIENT_SECRET }} 90 | ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} 91 | PAT_GITHUB: ${{ secrets.PAT_GITHUB }} 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/.vscode/ 2 | .venv 3 | .databricks 4 | deprecated 5 | mlOps/modelOps/data_science/deprecated 6 | mlOps/devOps/params/deprecated 7 | .dbx/sync 8 | localdev 9 | azureDevOps 10 | .env 11 | .venv_dbx_con13 12 | dev.env 13 | data_science/src_nyc_taxi/build 14 | data_science/src_nyc_taxi/dist 15 | data_science/src_nyc_taxi/*/__pycache__ 16 | data_science/src_nyc_taxi/src_nyc_taxi.egg-info 17 | data_science/src_nyc_taxi/src_nyc_taxi-0.0.1 18 | poetry.lock 19 | .linux_venv 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /New Text Document.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/New Text Document.txt -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /dataops/src_nyc_taxi/data_quality.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/dataops/src_nyc_taxi/data_quality.py -------------------------------------------------------------------------------- /dataops/src_nyc_taxi/transform.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md ## Define data preprocessing helper functions 7 | 8 | # COMMAND ---------- 9 | 10 | from uszipcode import SearchEngine 11 | import sqlite3 12 | import pandas as pd 13 | from pyspark.sql.functions import udf, col 14 | from pyspark.sql.types import IntegerType 15 | import math 16 | from urllib import request 17 | import os 18 | 19 | BAD_ZIPCODE_VALUE = 'bad_zipcode' 20 | file_location = "dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/" 21 | file_type = "csv" 22 | target_year = 2016 23 | 24 | def push_zipcode_data_to_executors(): 25 | # Download directly from github since the default download location can be flaky 26 | target_dir = '/tmp/db/' 27 | print(target_dir) 28 | target_file = os.path.join(target_dir, 'simple_db.sqlite') 29 | print(target_file) 30 | remote_url = 'https://github.com/MacHu-GWU/uszipcode-project/files/5183256/simple_db.log' 31 | os.makedirs(target_dir, exist_ok=True) 32 | print(os.makedirs(target_dir, exist_ok=True)) 33 | request.urlretrieve(remote_url, target_file) 34 | print(request.urlretrieve(remote_url, target_file)) 35 | # Query the zipcode database into a pandas dataframe 36 | #search = SearchEngine(db_file_dir=target_dir) 37 | conn = sqlite3.connect(target_file) 38 | pdf = pd.read_sql_query('''select zipcode, lat, lng, radius_in_miles, 39 | bounds_west, bounds_east, bounds_north, 
bounds_south from 40 | simple_zipcode''',conn) 41 | return sc.broadcast(pdf) 42 | 43 | # Define UDF to lookup ZIP code based on latitude and longitude 44 | @udf('string') 45 | def get_zipcode(lat, lng): 46 | if lat is None or lng is None: 47 | return BAD_ZIPCODE_VALUE 48 | dist_btwn_lat_deg = 69.172 49 | dist_btwn_lon_deg = math.cos(lat) * 69.172 50 | radius = 5 51 | lat_degr_rad = abs(radius / dist_btwn_lat_deg) 52 | lon_degr_rad = abs(radius / dist_btwn_lon_deg) 53 | lat_lower = lat - lat_degr_rad 54 | lat_upper = lat + lat_degr_rad 55 | lng_lower = lng - lon_degr_rad 56 | lng_upper = lng + lon_degr_rad 57 | pdf = zipcodes_broadcast_df.value 58 | try: 59 | out = pdf[(pdf['lat'].between(lat_lower, lat_upper)) & (pdf['lng'].between(lng_lower, lng_upper))] 60 | dist = [None]*len(out) 61 | for i in range(len(out)): 62 | dist[i] = (out['lat'].iloc[i]-lat)**2 + (out['lng'].iloc[i]-lng)**2 63 | zip = out['zipcode'].iloc[dist.index(min(dist))] 64 | except: 65 | zip = BAD_ZIPCODE_VALUE 66 | return zip 67 | 68 | def get_data_files(yyyy, months): 69 | data_files = [] 70 | for mm in months: 71 | mm = str(mm) if mm >= 10 else f"0{mm}" 72 | month_data_files = list(filter(lambda file_name: f"{yyyy}-{mm}" in file_name, 73 | [f.path for f in dbutils.fs.ls(file_location)])) 74 | data_files += month_data_files 75 | return data_files 76 | 77 | def load_data(data_files, sample=1.0): 78 | df = (spark.read.format("csv") 79 | .option("inferSchema", "true") 80 | .option("header", "true") 81 | .option("ignoreLeadingWhiteSpace", "true") 82 | .option("ignoreTrailingWhiteSpace", "true") 83 | .option("sep", ",") 84 | .load(data_files) 85 | ).sample(False, sample, 123) 86 | 87 | # Rename, cast types, and filter columns 88 | column_allow_list = { 89 | "pickup_datetime": ["tpep_pickup_datetime", "timestamp"], 90 | "tpep_pickup_datetime": ["tpep_pickup_datetime", "timestamp"], 91 | 92 | # type conversion 93 | "dropoff_datetime": ["tpep_dropoff_datetime", "timestamp"], 94 | "tpep_dropoff_datetime": ["tpep_dropoff_datetime", "timestamp"], 95 | 96 | "pickup_zip": ["pickup_zip", "integer"], 97 | "dropoff_zip": ["dropoff_zip", "integer"], 98 | "trip_distance": ["trip_distance", "double"], 99 | "fare_amount": ["fare_amount", "double"], 100 | "pickup_latitude": ["pickup_latitude", "double"], 101 | "pickup_longitude": ["pickup_longitude", "double"], 102 | "dropoff_latitude": ["dropoff_latitude", "double"], 103 | "dropoff_longitude": ["dropoff_longitude", "double"], 104 | } 105 | columns = [] 106 | for orig in df.columns: 107 | orig_lower = orig.lower() 108 | if orig_lower in column_allow_list: 109 | new_name, data_type = column_allow_list[orig_lower] 110 | columns.append(col(orig).cast(data_type).alias(new_name.lower())) 111 | 112 | return df.select(columns) 113 | 114 | def annotate_zipcodes(df): 115 | to_zip = lambda lat, lng: get_zipcode(col(lat).astype("double"), col(lng).astype("double")) 116 | # Add ZIP code columns, drop intermediate columns 117 | df = (df 118 | .withColumn('pickup_zip', to_zip("pickup_latitude", "pickup_longitude")) 119 | .withColumn('dropoff_zip', to_zip("dropoff_latitude", "dropoff_longitude")) 120 | .drop('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude') 121 | ) 122 | # Filter out rows with bad data 123 | df = df.filter(df.pickup_zip != BAD_ZIPCODE_VALUE) 124 | df = df.filter(df.dropoff_zip != BAD_ZIPCODE_VALUE) 125 | 126 | # Cast ZIP code to int 127 | df = df.withColumn("pickup_zip", df["pickup_zip"].cast(IntegerType())) 128 | df = df.withColumn("dropoff_zip", 
df["dropoff_zip"].cast(IntegerType())) 129 | return df 130 | 131 | def write_to_table(df, database, table): 132 | (df.write 133 | .format("delta") 134 | .mode("overwrite") 135 | .option("overwriteSchema", "true") 136 | .saveAsTable(f"{database}.{table}")) 137 | 138 | 139 | # COMMAND ---------- 140 | 141 | spark.sql("CREATE DATABASE IF NOT EXISTS feature_store_taxi_example;") 142 | 143 | # COMMAND ---------- 144 | 145 | # MAGIC %md ## Generate DataFrame and write to table 146 | 147 | # COMMAND ---------- 148 | 149 | # Read ZIP code data and push a broadcast dataframe to executors to speed up the UDF 150 | zipcodes_broadcast_df = push_zipcode_data_to_executors() 151 | 152 | # Generate data file names for the first 2 months of data in 2016 153 | data_files = get_data_files(target_year,months=[1,2]) 154 | 155 | # Load in a small subsample of data to speed things up for this example 156 | df = load_data(data_files, sample=.001) 157 | 158 | # Repartition -- by default this dataset only has a single partition. 159 | # Use a small parition count since the dataset is already small. 160 | df = df.repartition(6) 161 | 162 | # Enhance the DataFrame by converting latitude and longitude coordinates into ZIP codes 163 | df_with_zip = annotate_zipcodes(df) 164 | 165 | # Write the DataFrame to a Delta table 166 | write_to_table(df_with_zip, database="feature_store_taxi_example", table="nyc_yellow_taxi_with_zips") 167 | 168 | # COMMAND ---------- 169 | 170 | raw_data = spark.read.table("feature_store_taxi_example.nyc_yellow_taxi_with_zips") 171 | 172 | # COMMAND ---------- 173 | 174 | display(raw_data) 175 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /docs/images/AppInsightConnectionString.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/AppInsightConnectionString.jpg -------------------------------------------------------------------------------- /docs/images/AzureResources.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/AzureResources.JPG -------------------------------------------------------------------------------- /docs/images/Azure_Machine_Learning_GIF.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/Azure_Machine_Learning_GIF.gif -------------------------------------------------------------------------------- /docs/images/DatabricksNotebookExecution.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DatabricksNotebookExecution.JPG -------------------------------------------------------------------------------- /docs/images/DatabricksORGIDandHOSTID.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DatabricksORGIDandHOSTID.JPG -------------------------------------------------------------------------------- /docs/images/DatabricksTokenGeneration.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DatabricksTokenGeneration.jpg -------------------------------------------------------------------------------- /docs/images/DevContainer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DevContainer.jpg -------------------------------------------------------------------------------- /docs/images/DockerImageLoad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DockerImageLoad.jpg -------------------------------------------------------------------------------- /docs/images/InstallExtensions.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/InstallExtensions.jpg -------------------------------------------------------------------------------- /docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG -------------------------------------------------------------------------------- /docs/images/OutputOfTheConfigurationStep.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/OutputOfTheConfigurationStep.jpg -------------------------------------------------------------------------------- /docs/images/Overview.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/Overview.JPG -------------------------------------------------------------------------------- /docs/images/PipelineSteps.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/PipelineSteps.JPG -------------------------------------------------------------------------------- /docs/images/PowershellScreen.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/PowershellScreen.jpg -------------------------------------------------------------------------------- /docs/images/SecretsFileImage.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/SecretsFileImage.jpg -------------------------------------------------------------------------------- /docs/images/SuccessfulClusterCreation.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/SuccessfulClusterCreation.JPG -------------------------------------------------------------------------------- /docs/images/Verify_Python_Interpreter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/Verify_Python_Interpreter.jpg -------------------------------------------------------------------------------- /docs/images/YoutubeThumbNail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/YoutubeThumbNail.png -------------------------------------------------------------------------------- /docs/images/cluster-upload-wheel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/cluster-upload-wheel.jpg -------------------------------------------------------------------------------- /docs/images/databricks-connect-pass.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/databricks-connect-pass.jpg -------------------------------------------------------------------------------- /docs/images/dstoolitgif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/dstoolitgif.gif -------------------------------------------------------------------------------- /docs/images/final.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/final.jpg -------------------------------------------------------------------------------- /docs/images/map01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map01.png -------------------------------------------------------------------------------- /docs/images/map02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map02.png -------------------------------------------------------------------------------- /docs/images/map03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map03.png 
-------------------------------------------------------------------------------- /docs/images/map04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map04.png -------------------------------------------------------------------------------- /docs/images/map05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map05.png -------------------------------------------------------------------------------- /docs/images/map06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map06.png -------------------------------------------------------------------------------- /docs/images/map07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map07.png -------------------------------------------------------------------------------- /docs/images/pythonversion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/pythonversion.jpg -------------------------------------------------------------------------------- /docs/images/workspaceselection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/workspaceselection.jpg -------------------------------------------------------------------------------- /experiments/notebooks/ciaran_experiments/nyc_taxi/nyc_taxi_lgbm_1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from featurization import run_feature_store_refresh 3 | run_feature_store_refresh() 4 | 5 | # COMMAND ---------- 6 | 7 | from training import run_training 8 | 9 | num_rounds_arr = [20,40,60,80,100,120,170] 10 | 11 | for num_rounds in num_rounds_arr: 12 | run_training( 13 | experiment_name = "ciaran_experiment_nyc_taxi", 14 | model_name = "taxi_example_fare_packaged", 15 | model_params = { 16 | "objective": "regression", 17 | "metric": "rmse", 18 | "num_leaves": 25, 19 | "learning_rate": 0.2, 20 | "bagging_fraction": 0.9, 21 | "feature_fraction": 0.9, 22 | "bagging_seed": 42, 23 | "verbosity": -1, 24 | "seed": 42, 25 | "num_rounds": num_rounds 26 | } 27 | ) 28 | from registration import run_registration 29 | run_registration( 30 | model_name = "taxi_example_fare_packaged" 31 | ) 32 | -------------------------------------------------------------------------------- /experiments/pipelines/ciaran_experiments/workflow.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | 3 | # Cluster configs for each environment 4 | default-cluster-spec: &default-cluster-spec 5 | spark_version: '11.3.x-cpu-ml-scala2.12' 6 | node_type_id: 'Standard_DS3_v2' 7 | driver_node_type_id: 'Standard_DS3_v2' 8 | num_workers: 1 9 | # To reduce start up time for each job, it is advisable 
to use a cluster pool. To do so involves supplying the following 10 | # two fields with a pool_id to acquire both the driver and instances from. 11 | # If driver_instance_pool_id and instance_pool_id are set, both node_type_id and driver_node_type_id CANNOT be supplied. 12 | # As such, if providing a pool_id for driver and worker instances, please ensure that node_type_id and driver_node_type_id are not present 13 | # driver_instance_pool_id: '0617-151415-bells2-pool-hh7h6tjm' 14 | # instance_pool_id: '0617-151415-bells2-pool-hh7h6tjm' 15 | 16 | dev-cluster-config: &dev-cluster-config 17 | new_cluster: 18 | <<: *default-cluster-spec 19 | 20 | staging-cluster-config: &staging-cluster-config 21 | new_cluster: 22 | <<: *default-cluster-spec 23 | 24 | prod-cluster-config: &prod-cluster-config 25 | new_cluster: 26 | <<: *default-cluster-spec 27 | 28 | #build: 29 | # no_build: true 30 | build: 31 | python: poetry 32 | #python: "poetry build -f wheel" 33 | 34 | environments: 35 | default: 36 | workflows: 37 | - name: NEW_FUNCTION 38 | tasks: 39 | - task_key: "NEW_FUNCTION" 40 | existing_cluster_id: "0524-153828-e2rk9h52" 41 | spark_python_task: 42 | python_file: "{{var['parameters']['file_path']}}" 43 | 44 | - name: FEATURE_TABLE_REFRESH 45 | tasks: 46 | - task_key: "FEATURE_TABLE_REFRESH" 47 | existing_cluster_id: "0524-153828-e2rk9h52" 48 | spark_python_task: 49 | python_file: "{{var['parameters']['file_path']}}" 50 | 51 | - name: MODEL_TRAINING 52 | tasks: 53 | - task_key: "MODEL_TRAINING" 54 | existing_cluster_id: "0524-153828-e2rk9h52" 55 | spark_python_task: 56 | python_file: "{{var['parameters']['file_path']}}" -------------------------------------------------------------------------------- /experiments/pipelines/ciaran_experiments/workflow_configs/featurization.yaml: -------------------------------------------------------------------------------- 1 | parameters: 2 | file_path: 'file://data_science/src_nyc_taxi/featurization/__init__.py' -------------------------------------------------------------------------------- /experiments/pipelines/ciaran_experiments/workflow_configs/training.yaml: -------------------------------------------------------------------------------- 1 | parameters: 2 | file_path: 'file://data_science/src_nyc_taxi/training/__init__.py' 3 | -------------------------------------------------------------------------------- /experiments/pipelines/ciaran_experiments/workflow_configs/workflow_params.yaml: -------------------------------------------------------------------------------- 1 | ML_PIPELINE_FILES: 2 | 3 | DATA_INGEST_PREP: 4 | FILE_PATH: '/Repos/ciaranh@microsoft.com/experiments/mlOps/dataOps/nyc_taxi/data_prep' 5 | WHL_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/pyWheels/Helper_Functions/dist/helperfunctions-0.0.1-py3-none-any.whl' 6 | 7 | FEATURE_ENGINEERING: 8 | FILE_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/feature_eng.py' 9 | WHL_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/pyWheels/Helper_Functions/dist/helperfunctions-0.0.1-py3-none-any.whl' 10 | PARAMETERS: 11 | ENV: '--env' 12 | FILE: 'file:fuse://mlOps/modelOps/ml_pipelines/az_databricks/cicd/workflow_params.yaml' 13 | EXPERIMENT_NAME: 'dbx_workflow_fe' 14 | TRACK_IN_AZURE_ML: True 15 | 16 | TRAIN_REGISTER: 17 | FILE_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi//train_register.py' 18 | WHL_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/pyWheels/Helper_Functions/dist/helperfunctions-0.0.1-py3-none-any.whl' 19 | PARAMETERS: 20 | ENV: '--env' 21 | FILE: 
'file:fuse://mlOps/modelOps/ml_pipelines/az_databricks/cicd/workflow_params.yaml' 22 | EXPERIMENT_NAME: 'dbx_workflow_train' 23 | TRACK_IN_AZURE_ML: True 24 | 25 | 26 | MODEL_INFERENCE: 27 | FILE_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/score.py' 28 | WHL_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/pyWheels/Helper_Functions/dist/helperfunctions-0.0.1-py3-none-any.whl' 29 | PARAMETERS: 30 | ENV: '--env' 31 | FILE: 'file:fuse://mlOps/modelOps/ml_pipelines/az_databricks/cicd/workflow_params.yaml' 32 | EXPERIMENT_NAME: 'dbx_workflow_inference' 33 | TRACK_IN_AZURE_ML: True 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_app_insights/az_app_insights.bicep: -------------------------------------------------------------------------------- 1 | param location string = 'uksouth' 2 | 3 | param logwsname string 4 | var varlogwsname = '${logwsname}${substring(uniqueString(resourceGroup().id), 0, 4)}' 5 | 6 | param appinsightname string 7 | var varappinsightname = '${appinsightname}${substring(uniqueString(resourceGroup().id), 0, 4)}' 8 | 9 | 10 | 11 | resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2021-12-01-preview' = { 12 | name: varlogwsname 13 | location: location 14 | properties: { 15 | sku: { 16 | name: 'PerGB2018' 17 | } 18 | retentionInDays: 30 19 | features: { 20 | legacy: 0 21 | searchVersion: 1 22 | enableLogAccessUsingOnlyResourcePermissions: true 23 | } 24 | workspaceCapping: { 25 | dailyQuotaGb: -1 26 | } 27 | publicNetworkAccessForIngestion: 'Enabled' 28 | publicNetworkAccessForQuery: 'Enabled' 29 | } 30 | } 31 | 32 | resource appInsight 'Microsoft.Insights/components@2020-02-02' = { 33 | name: varappinsightname 34 | location: location 35 | kind: 'web' 36 | properties: { 37 | Application_Type: 'web' 38 | Flow_Type: 'Redfield' 39 | Request_Source: 'IbizaAIExtension' 40 | WorkspaceResourceId: logAnalytics.id 41 | IngestionMode: 'LogAnalytics' 42 | publicNetworkAccessForIngestion: 'Enabled' 43 | publicNetworkAccessForQuery: 'Enabled' 44 | 45 | } 46 | } 47 | 48 | output azAppInsightsID string = appInsight.id 49 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_data_lake/az_data_lake.bicep: -------------------------------------------------------------------------------- 1 | // ################################################################################################################################################################// 2 | // Define Parameters 3 | // ################################################################################################################################################################// 4 | param storageConfig object 5 | param location string 6 | param containerNames array 7 | param ShouldCreateContainers bool = true 8 | param storageAccountName string 9 | param workspaceName string 10 | param resourceGroupName string 11 | param azKeyVaultName string 12 | 13 | 14 | // ################################################################################################################################################################// 15 | // Define Variables 16 | // ################################################################################################################################################################// 17 | var varstorageAccountName = '${storageAccountName}${substring(uniqueString(resourceGroup().id), 0, 4)}' 18 | 19 | 20 | 21 | // 
################################################################################################################################################################// 22 | // Deploy Storage Account Per Environment 23 | // ################################################################################################################################################################// 24 | 25 | resource azStorage 'Microsoft.Storage/storageAccounts@2021-08-01' = { 26 | name: varstorageAccountName 27 | location: location 28 | kind: storageConfig.kind 29 | sku: { 30 | name: storageConfig.sku_name 31 | } 32 | properties: { 33 | allowBlobPublicAccess: storageConfig.allowBlobPublicAccess 34 | isHnsEnabled: storageConfig.isHnsEnabled 35 | accessTier: storageConfig.accessTier 36 | } 37 | 38 | // Nested Resource Deployment - Containers within Storage Account 39 | resource blobServices 'blobServices' = { 40 | name: 'default' 41 | resource containersCreate 'containers' = [for ContainerName in containerNames: if (ShouldCreateContainers) { 42 | name: ContainerName 43 | properties: { 44 | publicAccess: 'None' 45 | } 46 | }] 47 | } 48 | } 49 | 50 | 51 | 52 | // ################################################################################################################################################################// 53 | // Outputs 54 | // ################################################################################################################################################################// 55 | // output storagekey string = listKeys(resourceId('Microsoft.Storage/storageAccounts', name), '2021-08-01').keys[0].value 56 | output varstorageAccountName string = azStorage.name 57 | output varstorageAccountID string = azStorage.id 58 | output workspaceName string = workspaceName 59 | output resourceGroupName string = resourceGroupName 60 | output azKeyVaultName string = azKeyVaultName 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_databricks/az_databricks.bicep: -------------------------------------------------------------------------------- 1 | // ################################################################################################################################################################// 2 | // Define Parameters 3 | // ################################################################################################################################################################// 4 | 5 | param azMachineLearningWSId string 6 | param location string 7 | param workspaceName string 8 | var varworkspaceName = '${workspaceName}-${substring(uniqueString(resourceGroup().id), 0, 4)}' 9 | var managedResourceGroupName = '${workspaceName}-mrg-${substring(uniqueString(resourceGroup().id), 0, 4)}' 10 | 11 | @allowed([ 12 | 'standard' 13 | 'premium' 14 | ]) 15 | param pricingTier string = 'premium' 16 | 17 | 18 | // ################################################################################################################################################################// 19 | // Define Variables 20 | // ################################################################################################################################################################// 21 | var roleDefinitionUser = guid('${resourceGroup().id}/8e3af657-a8ff-443c-a75c-2fe8c4bcb635/') 22 | 23 | 24 | 25 | // 
################################################################################################################################################################// 26 | // Deploy AzDatabricks Workspace 27 | // ################################################################################################################################################################// 28 | resource azDatabricksWS 'Microsoft.Databricks/workspaces@2023-02-01' = { 29 | name: varworkspaceName 30 | 31 | location: location 32 | properties: { 33 | managedResourceGroupId: '${subscription().id}/resourceGroups/${managedResourceGroupName}' 34 | publicNetworkAccess: 'Enabled' 35 | //parameters: { 36 | // amlWorkspaceId: { 37 | // value: azMachineLearningWSId 38 | // } 39 | //} 40 | authorizations: [ 41 | { 42 | principalId: '0e3c30b0-dd4e-4937-96ca-3fe88bd8f259' 43 | roleDefinitionId: roleDefinitionUser 44 | } 45 | ] 46 | 47 | } 48 | sku: { 49 | name: pricingTier 50 | } 51 | 52 | 53 | 54 | } 55 | 56 | //resource spRoleAssignment 'Microsoft.Authorization/roleAssignments@2020-10-01-preview' = { 57 | // name: guid(azDatabricksWS.id, roleDefinitionAzureEventHubsDataOwner) 58 | // dependsOn: [ 59 | // azDatabricksWS 60 | // ] 61 | // properties: { 62 | // principalId: 'ab926dd1-657d-4bb2-9987-c7857046d0dd' 63 | // roleDefinitionId: roleDefinitionUser 64 | // principalType: 'ServicePrincipal' 65 | // } 66 | //} 67 | 68 | 69 | output azDatabricksWorkspaceID string = azDatabricksWS.id 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_key_vault/az_key_vault.bicep: -------------------------------------------------------------------------------- 1 | 2 | param environment string 3 | param location string 4 | var keyVaultName = 'keyvault-${environment}-${substring(uniqueString(resourceGroup().id), 0, 4)}' 5 | 6 | 7 | resource azKeyVault 'Microsoft.KeyVault/vaults@2021-10-01' = { 8 | name: keyVaultName 9 | location: location 10 | properties: { 11 | sku: { 12 | family: 'A' 13 | name: 'premium' 14 | } 15 | tenantId: subscription().tenantId 16 | networkAcls: { 17 | defaultAction: 'Allow' 18 | bypass: 'AzureServices' 19 | ipRules: [] 20 | virtualNetworkRules: [] 21 | } 22 | enableRbacAuthorization: true // if this is false then you cannot use RBAC assignments, on acl (below). 
If true acl (below) is ignored 23 | enableSoftDelete: true 24 | enabledForTemplateDeployment: true 25 | accessPolicies: [ 26 | ] 27 | } 28 | 29 | } 30 | 31 | output azKeyVaultName string = azKeyVault.name 32 | output azKeyVaultID string = azKeyVault.id 33 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_machine_learning/az_machine_learning.bicep: -------------------------------------------------------------------------------- 1 | param location string 2 | param azAppInsightsID string 3 | param azKeyVaultID string 4 | param amlwsname string 5 | param amlblobname string 6 | 7 | 8 | var varamlblobname = '${amlblobname}${substring(uniqueString(resourceGroup().id), 0, 4)}' 9 | var varamlwsname = '${amlwsname}-${substring(uniqueString(resourceGroup().id), 0, 4)}' 10 | 11 | 12 | resource amlBlob 'Microsoft.Storage/storageAccounts@2021-08-01' = { 13 | name: varamlblobname 14 | location: location 15 | kind: 'StorageV2' 16 | sku: { 17 | name: 'Standard_LRS' 18 | } 19 | properties: { 20 | allowBlobPublicAccess: true 21 | isHnsEnabled: false 22 | accessTier: 'Hot' 23 | } 24 | } 25 | 26 | 27 | resource AzMachineLearning 'Microsoft.MachineLearningServices/workspaces@2023-04-01' = { 28 | name: varamlwsname 29 | location: location 30 | 31 | identity: { 32 | type: 'SystemAssigned' 33 | } 34 | properties: { 35 | publicNetworkAccess: 'Enabled' 36 | applicationInsights: azAppInsightsID 37 | storageAccount: amlBlob.id 38 | keyVault: azKeyVaultID 39 | } 40 | 41 | sku: { 42 | name: 'Enterprise' 43 | } 44 | 45 | } 46 | 47 | output azMachineLearningWSId string = AzMachineLearning.id 48 | -------------------------------------------------------------------------------- /infrastructure/bicep/main.bicep: -------------------------------------------------------------------------------- 1 | targetScope = 'subscription' 2 | 3 | param location string 4 | param environment string 5 | param storageConfig object 6 | param containerNames array 7 | param resourceGroupName string 8 | param workspaceName string 9 | param pricingTier string 10 | param ShouldCreateContainers bool = true 11 | param loganalyticswsname string 12 | param appInsightswsname string 13 | param storageAccountName string 14 | param TemplateParamFilePath string 15 | param TemplateFilePath string 16 | param AZURE_DATABRICKS_APP_ID string 17 | param MANAGEMENT_RESOURCE_ENDPOINT string 18 | param amlblobname string 19 | param amlwsname string 20 | 21 | // ################################################################################################################################################################// 22 | // Create Resource Group 23 | // ################################################################################################################################################################// 24 | resource azResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' = { 25 | dependsOn: [] 26 | name: resourceGroupName 27 | // Location of the Resource Group Does Not Have To Match That of The Resouces Within. 
Metadata for all resources within groups can reside in 'uksouth' below 28 | location: location 29 | } 30 | 31 | 32 | // ################################################################################################################################################################// 33 | // KEY VAULT - SELECT KV // 34 | // ################################################################################################################################################################// 35 | 36 | module azKeyVault 'az_templates/az_key_vault/az_key_vault.bicep' = { 37 | dependsOn: [ 38 | azResourceGroup 39 | 40 | ] 41 | scope: azResourceGroup 42 | name: 'azKeyVault' 43 | params: { 44 | environment: environment 45 | location: location 46 | } 47 | } 48 | 49 | // ################################################################################################################################################################// 50 | // Module for Create Azure Data Lake Storage 51 | // RBAC is assigned -> azDatabricks given access to Storage 52 | // ################################################################################################################################################################// 53 | module azDataLake 'az_templates/az_data_lake/az_data_lake.bicep' = { 54 | dependsOn: [ 55 | azResourceGroup 56 | ] 57 | scope: resourceGroup(resourceGroupName) 58 | name: 'azDataLake' 59 | params: { 60 | storageAccountName: storageAccountName 61 | storageConfig: storageConfig 62 | location: location 63 | containerNames: containerNames 64 | ShouldCreateContainers: ShouldCreateContainers 65 | workspaceName: workspaceName 66 | resourceGroupName: resourceGroupName 67 | azKeyVaultName: azKeyVault.outputs.azKeyVaultName 68 | 69 | 70 | } 71 | } 72 | 73 | 74 | module logAnalytics 'az_templates/az_app_insights/az_app_insights.bicep' = { 75 | dependsOn: [ 76 | azResourceGroup 77 | ] 78 | scope: resourceGroup(resourceGroupName) 79 | name: 'logAnalytics' 80 | params: { 81 | location: location 82 | logwsname: loganalyticswsname 83 | appinsightname: appInsightswsname 84 | } 85 | } 86 | 87 | 88 | // ################################################################################################################################################################// 89 | // Module for Creating Azure Machine Learning Workspace 90 | // Outputs AzDatabricks Workspace ID, which is used when Assigning RBACs. 
91 | // ################################################################################################################################################################// 92 | module azMachineLearning 'az_templates/az_machine_learning/az_machine_learning.bicep' = { 93 | dependsOn: [ 94 | logAnalytics 95 | azDataLake 96 | azKeyVault 97 | 98 | ] 99 | scope: resourceGroup(resourceGroupName) 100 | name: 'amlws' 101 | params: { 102 | location: location 103 | azAppInsightsID: logAnalytics.outputs.azAppInsightsID 104 | azKeyVaultID: azKeyVault.outputs.azKeyVaultID 105 | amlwsname: amlwsname 106 | amlblobname: amlblobname 107 | 108 | 109 | 110 | } 111 | } 112 | 113 | // ################################################################################################################################################################// 114 | // Module for Creating Azure Databricks Workspace 115 | // Outputs AzDatabricks Workspace ID, which is used when Assigning RBACs 116 | // ################################################################################################################################################################// 117 | 118 | module azDatabricks 'br/public:avm/res/databricks/workspace:0.1.0' = { 119 | dependsOn: [ 120 | azMachineLearning 121 | ] 122 | scope: resourceGroup(resourceGroupName) 123 | name: 'azDatabricks-dbrws' 124 | params: { 125 | name: workspaceName 126 | location: location 127 | skuName: pricingTier 128 | } 129 | } 130 | 131 | 132 | output azDatabricksWorkspaceID string = azDatabricks.outputs.resourceId 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /infrastructure/bicep/params/development/bicep.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "TemplateParamFilePath": { 6 | "value": "infrastructure/bicep/params/development/bicep.parameters.json" 7 | }, 8 | "TemplateFilePath": { 9 | "value": "infrastructure/bicep/main.bicep" 10 | }, 11 | "AZURE_DATABRICKS_APP_ID": { 12 | "value": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d" 13 | }, 14 | "MANAGEMENT_RESOURCE_ENDPOINT": { 15 | "value": "https://management.core.windows.net/" 16 | }, 17 | "location": { 18 | "value": "eastus" 19 | }, 20 | "environment": { 21 | "value": "dev" 22 | }, 23 | 24 | "containerNames": { 25 | "value": [ 26 | "bronze", 27 | "silver", 28 | "gold" 29 | ] 30 | }, 31 | "storageConfig": { 32 | "value": { 33 | "kind": "StorageV2", 34 | "sku_name": "Standard_LRS", 35 | "allowBlobPublicAccess": true, 36 | "isHnsEnabled": true, 37 | "accessTier": "Hot" 38 | } 39 | }, 40 | "resourceGroupName" :{ 41 | "value": "databricks-dev-rg" 42 | }, 43 | "workspaceName" : { 44 | "value": "dbxwsdev" 45 | }, 46 | "pricingTier": { 47 | "value": "premium" 48 | }, 49 | "ShouldCreateContainers": { 50 | "value": true 51 | }, 52 | "loganalyticswsname": { 53 | "value": "loganalyticsdev" 54 | }, 55 | "appInsightswsname": { 56 | "value": "appinsightsdev" 57 | }, 58 | "storageAccountName": { 59 | "value": "adlsdev" 60 | }, 61 | "amlwsname": { 62 | "value": "amldev" 63 | }, 64 | "amlblobname": { 65 | "value": "amlblobdev" 66 | } 67 | 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /infrastructure/bicep/params/production/bicep.parameters.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "TemplateParamFilePath": { 6 | "value": "infrastructure/bicep/params/production/bicep.parameters.json" 7 | }, 8 | "TemplateFilePath": { 9 | "value": "infrastructure/bicep/main.bicep" 10 | }, 11 | "AZURE_DATABRICKS_APP_ID": { 12 | "value": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d" 13 | }, 14 | "MANAGEMENT_RESOURCE_ENDPOINT": { 15 | "value": "https://management.core.windows.net/" 16 | }, 17 | "location": { 18 | "value": "eastus" 19 | }, 20 | "environment": { 21 | "value": "prod" 22 | }, 23 | "containerNames": { 24 | "value": [ 25 | "bronze", 26 | "silver", 27 | "gold" 28 | ] 29 | }, 30 | "storageConfig": { 31 | "value": { 32 | "kind": "StorageV2", 33 | "sku_name": "Standard_LRS", 34 | "allowBlobPublicAccess": true, 35 | "isHnsEnabled": true, 36 | "accessTier": "Hot" 37 | } 38 | }, 39 | "resourceGroupName" :{ 40 | "value": "databricks-prod-rg" 41 | }, 42 | "workspaceName" : { 43 | "value": "dbxwsprod" 44 | }, 45 | "pricingTier": { 46 | "value": "premium" 47 | }, 48 | "ShouldCreateContainers": { 49 | "value": true 50 | }, 51 | "loganalyticswsname": { 52 | "value": "loganalyticsprod" 53 | }, 54 | "appInsightswsname": { 55 | "value": "appinsightsprod" 56 | }, 57 | "storageAccountName": { 58 | "value": "adlsprod" 59 | }, 60 | "amlwsname": { 61 | "value": "amlprod" 62 | }, 63 | "amlblobname": { 64 | "value": "amlblobprod" 65 | } 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /infrastructure/bicep/params/sandbox/bicep.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "TemplateParamFilePath": { 6 | "value": "infrastructure/bicep/params/sandbox/bicep.parameters.json" 7 | }, 8 | "TemplateFilePath": { 9 | "value": "infrastructure/bicep/main.bicep" 10 | }, 11 | "AZURE_DATABRICKS_APP_ID": { 12 | "value": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d" 13 | }, 14 | "MANAGEMENT_RESOURCE_ENDPOINT": { 15 | "value": "https://management.core.windows.net/" 16 | }, 17 | "location": { 18 | "value": "eastus" 19 | }, 20 | "environment": { 21 | "value": "sandbox" 22 | }, 23 | 24 | "containerNames": { 25 | "value": [ 26 | "bronze", 27 | "silver", 28 | "gold" 29 | ] 30 | }, 31 | "storageConfig": { 32 | "value": { 33 | "kind": "StorageV2", 34 | "sku_name": "Standard_LRS", 35 | "allowBlobPublicAccess": true, 36 | "isHnsEnabled": true, 37 | "accessTier": "Hot" 38 | } 39 | }, 40 | "resourceGroupName" :{ 41 | "value": "databricks-sandbox-rg" 42 | }, 43 | "workspaceName" : { 44 | "value": "dbxwssandbox" 45 | }, 46 | "pricingTier": { 47 | "value": "premium" 48 | }, 49 | "ShouldCreateContainers": { 50 | "value": true 51 | }, 52 | "loganalyticswsname": { 53 | "value": "loganalyticssandbox" 54 | }, 55 | "appInsightswsname": { 56 | "value": "appinsightssandbox" 57 | }, 58 | "storageAccountName": { 59 | "value": "adlssandbox" 60 | }, 61 | "amlwsname": { 62 | "value": "amlsandbox" 63 | }, 64 | "amlblobname": { 65 | "value": "amlblobsandbox" 66 | } 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /infrastructure/bicep/params/uat/bicep.parameters.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "TemplateParamFilePath": { 6 | "value": "infrastructure/bicep/params/uat/bicep.parameters.json" 7 | }, 8 | "TemplateFilePath": { 9 | "value": "infrastructure/bicep/main.bicep" 10 | }, 11 | "AZURE_DATABRICKS_APP_ID": { 12 | "value": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d" 13 | }, 14 | "MANAGEMENT_RESOURCE_ENDPOINT": { 15 | "value": "https://management.core.windows.net/" 16 | }, 17 | "location": { 18 | "value": "eastus" 19 | }, 20 | "environment": { 21 | "value": "uat" 22 | }, 23 | "containerNames": { 24 | "value": [ 25 | "bronze", 26 | "silver", 27 | "gold" 28 | ] 29 | }, 30 | "storageConfig": { 31 | "value": { 32 | "kind": "StorageV2", 33 | "sku_name": "Standard_LRS", 34 | "allowBlobPublicAccess": true, 35 | "isHnsEnabled": true, 36 | "accessTier": "Hot" 37 | } 38 | }, 39 | "resourceGroupName" :{ 40 | "value": "databricks-uat-rg" 41 | }, 42 | "workspaceName" : { 43 | "value": "dbxwsuat" 44 | }, 45 | "pricingTier": { 46 | "value": "premium" 47 | }, 48 | "ShouldCreateContainers": { 49 | "value": true 50 | }, 51 | "loganalyticswsname": { 52 | "value": "loganalyticsuat" 53 | }, 54 | "appInsightswsname": { 55 | "value": "appinsightsuat" 56 | }, 57 | "storageAccountName": { 58 | "value": "adlsuat" 59 | }, 60 | "amlwsname": { 61 | "value": "amluat" 62 | }, 63 | "amlblobname": { 64 | "value": "amlblobuat" 65 | } 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/development/clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Clusters": [ 3 | 4 | { 5 | "cluster_name": "ml_cluster", 6 | "spark_version": "13.3.x-cpu-ml-scala2.12", 7 | "node_type_id": "Standard_DS3_v2", 8 | "spark_conf": { 9 | "spark.databricks.delta.preview.enabled": "true" 10 | }, 11 | "autotermination_minutes": 30, 12 | "runtime_engine": "STANDARD", 13 | "autoscale": { 14 | "min_workers": 2, 15 | "max_workers": 3 16 | } 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/development/rbac.json: -------------------------------------------------------------------------------- 1 | { 2 | "RBAC_Assignments": [ 3 | { 4 | "roles": [ 5 | "Key Vault Administrator" 6 | ], 7 | "roleBeneficiaryObjID": "d30dd2e8-25d0-49cb-b99a-80ae061aac2c", 8 | "Description": "Your Object ID", 9 | "principalType": "User" 10 | }, 11 | { 12 | "roles": [ 13 | "Contributor", 14 | "Key Vault Administrator", 15 | "Storage Blob Data Contributor" 16 | ], 17 | "roleBeneficiaryObjID": "eb578d1b-72d9-4aa7-97be-97ace3a8954e", 18 | "Description": "Databricks SPN", 19 | "principalType": "ServicePrincipal" 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/development/repos.json: -------------------------------------------------------------------------------- 1 | { 2 | "Git_Configuration": [ 3 | { 4 | "git_username": "clintgrove", 5 | "git_provider": "gitHub" 6 | } 7 | ], 8 | "Repo_Configuration": [ 9 | { 10 | "url": "https://github.com/clintgrove/dstoolkit-mlops-databricks", 11 | "provider": "gitHub", 12 | "path": "DevelopmentFolder", 13 | "branch": "main" 14 | } 15 | ] 16 | } 
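The per-environment folders above each carry the same three files: clusters.json (cluster definitions), rbac.json (role assignments), and repos.json (Git/repo wiring). Below is a minimal, hypothetical Python sketch of how such files could be loaded and sanity-checked before a deployment run; the directory layout is taken from the repository tree, while the function name and the assertions are illustrative only and are not part of the repository's own dbx_utils package.

import json
from pathlib import Path

REQUIRED_FILES = ("clusters.json", "rbac.json", "repos.json")

def load_environment_configs(environment: str) -> dict:
    """Load the Databricks config files for one environment and check their expected top-level keys."""
    base = Path("infrastructure/databricks/databricks_configs") / environment
    configs = {}
    for name in REQUIRED_FILES:
        with open(base / name, "r", encoding="utf-8") as file:
            configs[name] = json.load(file)
    # Top-level keys as they appear in the files shown above
    assert "Clusters" in configs["clusters.json"]
    assert "RBAC_Assignments" in configs["rbac.json"]
    assert "Repo_Configuration" in configs["repos.json"]
    return configs

# Example (hypothetical) usage:
# configs = load_environment_configs("development")
# print([c["cluster_name"] for c in configs["clusters.json"]["Clusters"]])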
-------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/production/clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Clusters": [ 3 | { 4 | "cluster_name": "ml_cluster", 5 | "spark_version": "13.0.x-cpu-ml-scala2.12", 6 | "node_type_id": "Standard_DS3_v2", 7 | "spark_conf": { 8 | }, 9 | "autotermination_minutes": 30, 10 | "runtime_engine": "STANDARD", 11 | "autoscale": { 12 | "min_workers": 2, 13 | "max_workers": 3 14 | } 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/production/rbac.json: -------------------------------------------------------------------------------- 1 | { 2 | "RBAC_Assignments": [ 3 | { 4 | "roles": [ 5 | "Key Vault Administrator" 6 | ], 7 | "roleBeneficiaryObjID": "d30dd2e8-25d0-49cb-b99a-80ae061aac2c", 8 | "Description": "Your Object ID", 9 | "principalType": "User" 10 | }, 11 | { 12 | "roles": [ 13 | "Contributor", 14 | "Key Vault Administrator", 15 | "Storage Blob Data Contributor" 16 | ], 17 | "roleBeneficiaryObjID": "eb578d1b-72d9-4aa7-97be-97ace3a8954e", 18 | "Description": "Databricks SPN", 19 | "principalType": "ServicePrincipal" 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/production/repos.json: -------------------------------------------------------------------------------- 1 | { 2 | "Git_Configuration": [ 3 | { 4 | "git_username": "clintgrove", 5 | "git_provider": "gitHub" 6 | } 7 | ], 8 | "Repo_Configuration": [ 9 | { 10 | "url": "https://github.com/clintgrove/dstoolkit-mlops-databricks", 11 | "provider": "gitHub", 12 | "path": "ProductionFolder", 13 | "branch": "main" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/sandbox/clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Clusters": [ 3 | { 4 | "cluster_name": "ml_cluster", 5 | "spark_version": "13.0.x-cpu-ml-scala2.12", 6 | "node_type_id": "Standard_DS3_v2", 7 | "spark_conf": { 8 | }, 9 | "autotermination_minutes": 30, 10 | "runtime_engine": "STANDARD", 11 | "autoscale": { 12 | "min_workers": 2, 13 | "max_workers": 3 14 | } 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/sandbox/rbac.json: -------------------------------------------------------------------------------- 1 | { 2 | "RBAC_Assignments": [ 3 | { 4 | "roles": [ 5 | "Key Vault Administrator" 6 | ], 7 | "roleBeneficiaryObjID": "d30dd2e8-25d0-49cb-b99a-80ae061aac2c", 8 | "Description": "Your Object ID", 9 | "principalType": "User" 10 | }, 11 | { 12 | "roles": [ 13 | "Contributor", 14 | "Key Vault Administrator", 15 | "Storage Blob Data Contributor", 16 | "AzureML Data Scientist" 17 | ], 18 | "roleBeneficiaryObjID": "eb578d1b-72d9-4aa7-97be-97ace3a8954e", 19 | "Description": "Databricks SPN", 20 | "principalType": "ServicePrincipal" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/sandbox/repos.json: -------------------------------------------------------------------------------- 1 | { 2 | "Git_Configuration": [ 3 | { 4 | "git_username": "clintgrove", 5 | "git_provider": 
"gitHub" 6 | } 7 | ], 8 | "Repo_Configuration": [ 9 | { 10 | "url": "https://github.com/clintgrove/dstoolkit-mlops-databricks", 11 | "provider": "gitHub", 12 | "path": "Sandbox", 13 | "branch": "main" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/uat/clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Clusters": [ 3 | 4 | { 5 | "cluster_name": "ml_cluster", 6 | "spark_version": "13.0.x-cpu-ml-scala2.12", 7 | "node_type_id": "Standard_DS3_v2", 8 | "spark_conf": { 9 | "spark.databricks.delta.preview.enabled": "true" 10 | }, 11 | "autotermination_minutes": 30, 12 | "runtime_engine": "STANDARD", 13 | "autoscale": { 14 | "min_workers": 2, 15 | "max_workers": 3 16 | } 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/uat/rbac.json: -------------------------------------------------------------------------------- 1 | { 2 | "RBAC_Assignments": [ 3 | { 4 | "roles": [ 5 | "Key Vault Administrator" 6 | ], 7 | "roleBeneficiaryObjID": "d30dd2e8-25d0-49cb-b99a-80ae061aac2c", 8 | "Description": "Your Object ID", 9 | "principalType": "User" 10 | }, 11 | { 12 | "roles": [ 13 | "Contributor", 14 | "Key Vault Administrator", 15 | "Storage Blob Data Contributor" 16 | ], 17 | "roleBeneficiaryObjID": "eb578d1b-72d9-4aa7-97be-97ace3a8954e", 18 | "Description": "Databricks SPN", 19 | "principalType": "ServicePrincipal" 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/uat/repos.json: -------------------------------------------------------------------------------- 1 | { 2 | "Git_Configuration": [ 3 | { 4 | "git_username": "clintgrove", 5 | "git_provider": "gitHub" 6 | } 7 | ], 8 | "Repo_Configuration": [ 9 | { 10 | "url": "https://github.com/clintgrove/dstoolkit-mlops-databricks", 11 | "provider": "gitHub", 12 | "path": "UATFolder", 13 | "branch": "main" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /mlops/nyc_taxi/aml_pipelines/v1/nyc_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from azureml.core import Workspace, Experiment 4 | from azureml.core.compute import ComputeTarget, AmlCompute 5 | from azureml.pipeline.steps import PythonScriptStep, DatabricksStep 6 | from azureml.pipeline.core import Pipeline, PipelineData, StepSequence 7 | from azureml.core.compute import ComputeTarget, DatabricksCompute 8 | from azureml.exceptions import ComputeTargetException 9 | from azureml.core.authentication import ServicePrincipalAuthentication 10 | 11 | 12 | DATABRICKS_COMPUTE_NAME = os.environ['DATABRICKS_COMPUTE_NAME'] 13 | DATABRICKS_CLUSTER_NAME = os.environ['DATABRICKS_CLUSTER_NAME'] 14 | RESOURCE_GROUP_NAME = os.environ['RESOURCE_GROUP_NAME'] 15 | DATABRICKS_AAD_TOKEN = os.environ['DATABRICKS_AAD_TOKEN'] 16 | DATABRICKS_INSTANCE = os.environ['DATABRICKS_INSTANCE'] 17 | DATABRICKS_WS_NAME = os.environ['DATABRICKS_WS_NAME'] 18 | WORKSPACE_ID = os.environ['WORKSPACE_ID'] 19 | SUBSCRIPTION_ID = os.environ['SUBSCRIPTION_ID'] 20 | ARM_CLIENT_SECRET = os.environ['ARM_CLIENT_SECRET'] 21 | ARM_TENANT_ID = os.environ['ARM_TENANT_ID'] 22 | ARM_CLIENT_ID = os.environ['ARM_CLIENT_ID'] 23 | DATABRICKS_MANAGEMENT_TOKEN = 
os.environ['DATABRICKS_MANAGEMENT_TOKEN'] 24 | ENVIRONMENT = os.environ['ENVIRONMENT'] 25 | AML_WS_NAME = os.environ['AML_WS_NAME'] 26 | 27 | DBRKS_REQ_HEADERS = { 28 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 29 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 30 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 31 | 'Content-Type': 'application/json' 32 | } 33 | 34 | #print(DATABRICKS_COMPUTE_NAME) 35 | #print(WORKSPACE_ID) 36 | #print(RESOURCE_GROUP_NAME) 37 | #print(DATABRICKS_AAD_TOKEN) 38 | #print(DATABRICKS_MANAGEMENT_TOKEN) 39 | #print(DATABRICKS_INSTANCE) 40 | #print(SUBSCRIPTION_ID) 41 | #print(ARM_CLIENT_SECRET) 42 | #print(ARM_TENANT_ID) 43 | #print(ARM_CLIENT_ID) 44 | #print(AML_WS_NAME) 45 | 46 | class GetClusterID(): 47 | def __init__(self, cluster_name): 48 | self.clusters_obj = self.list_clusters()['clusters'] 49 | self.cluster_name = cluster_name 50 | def get_cluster_id(self): 51 | """ 52 | Returns Cluster ID for a given cluster name. 53 | """ 54 | for cluster in self.clusters_obj: 55 | if cluster['cluster_name'] == self.cluster_name: 56 | print("ml_cluster exists") 57 | cluster_id = cluster['cluster_id'] 58 | print(cluster_id) 59 | return cluster_id 60 | def list_clusters(self): 61 | """ 62 | Returns a Json object containing a list of existing Databricks Clusters. 63 | """ 64 | response = requests.get('https://' + DATABRICKS_INSTANCE + '/api/2.0/clusters/list', headers=DBRKS_REQ_HEADERS) 65 | if response.status_code != 200: 66 | raise Exception(response.content) 67 | else: 68 | return response.json() 69 | 70 | def create_pipeline_structure(databricks_compute, ws, cluster_id): 71 | print('Creating the pipeline structure') 72 | 73 | nyc_taxi_e2e_mlops = DatabricksStep( 74 | name="nyc_taxi_e2e_mlops", 75 | notebook_path="/Repos/"+ ARM_CLIENT_ID + "/Sandbox/data_science/src_nyc_taxi/src.py", 76 | #notebook_params={'myparam': 'testparam', 77 | # 'myparam2': pipeline_param}, 78 | run_name='nyc_taxi_e2e_mlops', 79 | compute_target=databricks_compute, 80 | existing_cluster_id=cluster_id, 81 | allow_reuse=True, 82 | num_workers=3 83 | ) 84 | 85 | 86 | step_sequence = StepSequence(steps=[nyc_taxi_e2e_mlops]) 87 | pipeline = Pipeline(workspace=ws, steps=step_sequence) 88 | pipeline.validate() 89 | 90 | return pipeline 91 | 92 | 93 | if __name__ == "__main__": 94 | svc_pr = ServicePrincipalAuthentication( 95 | tenant_id = ARM_TENANT_ID, 96 | service_principal_id = ARM_CLIENT_ID, 97 | service_principal_password = ARM_CLIENT_SECRET 98 | ) 99 | ws = Workspace( 100 | subscription_id=SUBSCRIPTION_ID, 101 | resource_group=RESOURCE_GROUP_NAME, 102 | workspace_name=AML_WS_NAME, 103 | auth=svc_pr 104 | ) 105 | 106 | #print(f" AML Workspace Properties: {ws} ") 107 | 108 | try: 109 | databricks_compute = DatabricksCompute(workspace=ws, name=DATABRICKS_COMPUTE_NAME) 110 | print('Compute target {} already exists'.format(DATABRICKS_COMPUTE_NAME)) 111 | except ComputeTargetException: 112 | #print('Compute not found, will use below parameters to attach new one') 113 | #print('db_compute_name {}'.format(DATABRICKS_COMPUTE_NAME)) 114 | #print('db_resource_group {}'.format(RESOURCE_GROUP_NAME)) 115 | #print('db_workspace_name {}'.format(DATABRICKS_WS_NAME)) 116 | #print('db_access_token {}'.format(DATABRICKS_AAD_TOKEN)) 117 | 118 | config = DatabricksCompute.attach_configuration( 119 | resource_group = RESOURCE_GROUP_NAME, 120 | workspace_name = DATABRICKS_WS_NAME, 121 | access_token= DATABRICKS_AAD_TOKEN) 122 | 
databricks_compute=ComputeTarget.attach(ws, DATABRICKS_COMPUTE_NAME, config) 123 | databricks_compute.wait_for_completion(True) 124 | 125 | 126 | cluster_obj = GetClusterID("ml_cluster") 127 | cluster_id = cluster_obj.get_cluster_id() 128 | 129 | 130 | 131 | #existingClusters = listClusters()['clusters'] 132 | #for cluster in existingClusters: 133 | # if cluster['cluster_name'] == "ml_cluster": 134 | # print("ml_cluster exists") 135 | # cluster_id = cluster['cluster_id'] 136 | # print(cluster_id) 137 | # else: 138 | # print("ml_cluster does not exist: cannot continue") 139 | #notebook_path=os.getenv("DATABRICKS_NOTEBOOK_PATH", "/Data_Scientist/featureEngineering.py") 140 | #notebook_path=os.getenv("DATABRICKS_NOTEBOOK_PATH", "databricks.ipynb") 141 | 142 | 143 | pipeline = create_pipeline_structure(databricks_compute=databricks_compute, ws=ws, cluster_id=cluster_id) 144 | published_pipeline = pipeline.publish("databricks_pipeline", version="1.0.0", description="Databricks Pipeline") 145 | 146 | 147 | -------------------------------------------------------------------------------- /mlops/nyc_taxi/aml_pipelines/v2/dontdelete/databricks/listclusters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | import logging 6 | import mlflow 7 | import requests 8 | import os 9 | 10 | #parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25) 11 | 12 | def main(): 13 | """Main function of the script.""" 14 | 15 | # input and output arguments 16 | parser = argparse.ArgumentParser() 17 | 18 | parser.add_argument("--data", type=str, help="path to input data") 19 | parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25) 20 | parser.add_argument("--train_data", type=str, help="path to train data") 21 | parser.add_argument("--test_data", type=str, help="path to test data") 22 | 23 | args = parser.parse_args() 24 | # Start Logging 25 | mlflow.start_run() 26 | 27 | print(" ".join(f"{k}={v}" for k, v in vars(args).items())) 28 | 29 | print("input data:", args.data) 30 | 31 | credit_df = pd.read_excel(args.data, header=1, index_col=0) 32 | 33 | mlflow.log_metric("num_samples", credit_df.shape[0]) 34 | mlflow.log_metric("num_features", credit_df.shape[1] - 1) 35 | 36 | credit_train_df, credit_test_df = train_test_split( 37 | credit_df, 38 | test_size=args.test_train_ratio, 39 | ) 40 | 41 | # output paths are mounted as folder, therefore, we are adding a filename to the path 42 | credit_train_df.to_csv(os.path.join(args.train_data, "data.csv"), index=False) 43 | 44 | credit_test_df.to_csv(os.path.join(args.test_data, "data.csv"), index=False) 45 | 46 | # Stop Logging 47 | mlflow.end_run() 48 | 49 | 50 | 51 | # Retrieve Tokens 52 | 53 | 54 | def createManagementToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL): 55 | """ 56 | Uses Our Service Principal Credentials To Generate Azure Active Directory Tokens 57 | """ 58 | 59 | tokenRequestBody['resource'] = 'https://management.core.windows.net/' 60 | 61 | response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody) 62 | 63 | if response.status_code == 200: 64 | print(response.status_code) 65 | 66 | else: 67 | raise Exception(response.text) 68 | 69 | return response.json()['access_token'] 70 | 71 | def createBearerToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL): 72 | """ 73 | Uses Our Service Principal Credentials To 
Generate Azure Active Directory Tokens 74 | """ 75 | 76 | tokenRequestBody['resource'] = '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d' 77 | 78 | response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody) 79 | 80 | if response.status_code == 200: 81 | print(response.status_code) 82 | 83 | else: 84 | raise Exception(response.text) 85 | 86 | return response.json()['access_token'] 87 | 88 | 89 | 90 | def listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE): 91 | """ 92 | Returns a Json object containing a list of existing Databricks Clusters. 93 | """ 94 | 95 | response = requests.get('https://' + DATABRICKS_INSTANCE + '/api/2.0/clusters/list', headers=DBRKS_REQ_HEADERS) 96 | 97 | if response.status_code != 200: 98 | raise Exception(response.content) 99 | 100 | else: 101 | return response.json() 102 | 103 | 104 | 105 | if __name__ == "__main__": 106 | 107 | # The sp credentials need to come in from key vault 108 | 109 | tokenRequestBody = { 110 | 'grant_type': 'client_credentials', 111 | 'client_id': ' ', 112 | 'client_secret': ' ' 113 | } 114 | tokenRequestHeaders = {'Content-Type': 'application/x-www-form-urlencoded'} 115 | tokenBaseURL = 'https://login.microsoftonline.com/' + ' ' + '/oauth2/token' 116 | 117 | DBRKS_BEARER_TOKEN = createBearerToken(tokenRequestBody=tokenRequestBody, 118 | tokenRequestHeaders=tokenRequestHeaders, 119 | tokenBaseURL=tokenBaseURL 120 | ) 121 | 122 | DBRKS_MANAGEMENT_TOKEN = createManagementToken(tokenRequestBody=tokenRequestBody, 123 | tokenRequestHeaders=tokenRequestHeaders, 124 | tokenBaseURL=tokenBaseURL 125 | ) 126 | 127 | 128 | DBRKS_REQ_HEADERS = { 129 | 'Authorization': f'Bearer {DBRKS_BEARER_TOKEN}', 130 | 'X-Databricks-Azure-SP-Management-Token': f'{DBRKS_MANAGEMENT_TOKEN}', 131 | 'X-Databricks-Azure-Workspace-Resource-Id': '/subscriptions/<>/resourceGroups/databricks-sandbox-rg/providers/Microsoft.Databricks/workspaces/dbxwssandbox-eco3', 132 | 'Content-Type': 'application/json' 133 | } 134 | DATABRICKS_INSTANCE = "adb-204110209##.#.azuredatabricks.net" 135 | 136 | existingClusters = listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE) 137 | 138 | print(existingClusters) 139 | -------------------------------------------------------------------------------- /mlops/nyc_taxi/aml_pipelines/v2/dontdelete/dependencies/conda.yaml: -------------------------------------------------------------------------------- 1 | name: model-env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.8 6 | - numpy=1.21.2 7 | - pip=21.2.4 8 | - scikit-learn=0.24.2 9 | - scipy=1.7.1 10 | - pandas>=1.1,<1.2 11 | - pip: 12 | - inference-schema[numpy-support]==1.3.0 13 | - xlrd==2.0.1 14 | - mlflow== 1.26.1 15 | - azureml-mlflow==1.42.0 16 | - pandas 17 | - requests -------------------------------------------------------------------------------- /mlops/nyc_taxi/databricks_workflows/nyc_taxi.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | basic-cluster-props: &basic-cluster-props 3 | spark_version: "13.0.x-cpu-ml-scala2.12" 4 | node_type_id: "Standard_DS3_v2" 5 | spark_env_vars: 6 | PYSPARK_PYTHON: "/databricks/python3/bin/python3" 7 | enable_elastic_disk: true 8 | runtime_engine: STANDARD 9 | autoscale: 10 | min_workers: 2 11 | max_workers: 8 12 | 13 | 14 | nyc_taxi_vars: &nyc_taxi_vars 15 | job_clusters: 16 | - job_cluster_key: training_job_cluster 17 | new_cluster: 18 | <<: *basic-cluster-props 19 | tasks: 20 | - task_key: "nyc_taxi_pipeline_data_engineering" 21 | job_cluster_key: 
"training_job_cluster" 22 | spark_python_task: 23 | python_file: "file://dataops/src_nyc_taxi/transform.py" 24 | libraries: [ 25 | whl: "file://src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1-py3-none-any.whl" 26 | ] 27 | 28 | - task_key: "nyc_taxi_pipeline_machine_learning" 29 | job_cluster_key: "training_job_cluster" 30 | spark_python_task: 31 | python_file: "file://src/pkg/nyc_taxi/entrypoint.py" 32 | libraries: [ 33 | whl: "file://src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1-py3-none-any.whl" 34 | ] 35 | depends_on: 36 | - task_key: "nyc_taxi_pipeline_data_engineering" 37 | 38 | build: 39 | no_build: true 40 | 41 | environments: 42 | default: 43 | workflows: 44 | - name: "DatabricksArtifacts" 45 | <<: *nyc_taxi_vars -------------------------------------------------------------------------------- /mlops/nyc_taxi/databricks_workflows/unit_tests.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | 3 | basic-cluster-props: &basic-cluster-props 4 | spark_version: "13.0.x-cpu-ml-scala2.12" 5 | node_type_id: "Standard_DS3_v2" 6 | spark_env_vars: 7 | PYSPARK_PYTHON: "/databricks/python3/bin/python3" 8 | enable_elastic_disk: true 9 | runtime_engine: STANDARD 10 | autoscale: 11 | min_workers: 2 12 | max_workers: 8 13 | 14 | databricks_utils_testing_vars: &databricks_utils_testing_vars 15 | job_clusters: 16 | - job_cluster_key: training_job_cluster 17 | new_cluster: 18 | <<: *basic-cluster-props 19 | tasks: 20 | - task_key: "unittests" 21 | job_cluster_key: "training_job_cluster" 22 | spark_python_task: 23 | python_file: "file://test/entrypoint.py" 24 | # this call supports all standard pytest arguments 25 | parameters: [ 26 | "file:fuse://test/test_dbx_utils_pkg/test_utils_create_cluster.py", 27 | "-o", 28 | "cache_dir=/dbfs/FileStore/", 29 | "--cov=dbx_utils", 30 | "--cov-append", 31 | "--cov-report=xml:/dbfs/FileStore/databricks_utils_cov_report.xml", 32 | "--cov-report=html:/dbfs/FileStore/htmlcov", 33 | "--junitxml=/dbfs/FileStore/databricks_utils_unit_testresults.xml" 34 | ] 35 | 36 | build: 37 | python: poetry 38 | 39 | environments: 40 | default: 41 | workflows: 42 | - name: "DatabricksUtilsTesting" 43 | <<: *databricks_utils_testing_vars 44 | -------------------------------------------------------------------------------- /mlops/nyc_taxi/monitoring/data_drift_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/mlops/nyc_taxi/monitoring/data_drift_monitor.py -------------------------------------------------------------------------------- /mlops/nyc_taxi/monitoring/mflow_experiment_dashboard_pbi.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | 4 | import mlflow 5 | from mlflow.tracking import MlflowClient 6 | 7 | mlflow_client = MlflowClient() 8 | experiment = mlflow_client.get_experiment_by_name("/Shared/ciaran_experiment_nyc_taxi") 9 | experiment_id = experiment.experiment_id 10 | 11 | 12 | df = mlflow.search_runs( 13 | experiment_ids=experiment_id 14 | ) 15 | 16 | display(df) 17 | 18 | df = df.rename(columns={"metrics.r2": "r2"}) 19 | display(df) 20 | df = df[df.end_time.notnull()] 21 | df = df[df.r2.notnull()] 22 | 23 | display(df) 24 | 25 | df2 = df.drop(df[df['status'] == "FINISHED"].index, inplace = True) 26 | 27 | display(df2) 28 | -------------------------------------------------------------------------------- 
/mlops/nyc_taxi/monitoring/model_serving_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/mlops/nyc_taxi/monitoring/model_serving_monitor.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "mlops_for_databricks" 3 | version = "0.1.0" 4 | description = "Full CICD deployment of mlops for Databricks" 5 | 6 | license = "MIT" 7 | 8 | authors = [ 9 | "Ciaran Hamill Diamond" 10 | ] 11 | 12 | repository = "https://github.com/python-poetry/poetry" 13 | homepage = "https://python-poetry.org" 14 | 15 | # README file(s) are used as the package description 16 | readme = ["README.md", "LICENSE"] 17 | 18 | # Keywords (translated to tags on the package index) 19 | keywords = ["packaging", "poetry"] 20 | 21 | packages = [ 22 | {include = "dbx_utils", from = "src/pkg"}, 23 | {include = "nyc_taxi", from = "src/pkg"}, 24 | {include = "ciaran_experiments", from = "experiments/notebooks"} 25 | ] 26 | 27 | 28 | [tool.poetry.dependencies] 29 | # All Packages Destined For Databricks Cluster 30 | # Only Install Pacakages Here That Are Not Already Installed On Databricks Cluster 31 | 32 | python = ">=3.8, <3.11" 33 | numpy = "^1.24.3" 34 | dbx = "^0.8.18" 35 | pytest = "^7.3.2" 36 | pytest-cov = "^4.1.0" 37 | bandit = "1.7.4" 38 | pylint = "2.15.0" 39 | pylint_junit = "0.3.2" 40 | python-dotenv = "1.0.0" 41 | pyspark = "3.2.1" 42 | delta-spark = "1.1.0" 43 | packaging = "21.*" 44 | mlflow-databricks-artifacts = "2.0.0" 45 | databricks-cli = "0.17.7" 46 | scikit-learn = "^1.2.2" 47 | lightgbm = "^3.3.5" 48 | pyyaml = "^6.0" 49 | pathlib = "^1.0.1" 50 | argparse = "^1.4.0" 51 | 52 | 53 | [tool.poetry.extras] 54 | [tool.poetry.scripts] -------------------------------------------------------------------------------- /score.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import json 4 | import numpy 5 | import joblib 6 | 7 | 8 | def init(): 9 | """ 10 | This function is called when the container is initialized/started, typically after create/update of the deployment. 11 | You can write the logic here to perform init operations like caching the model in memory 12 | """ 13 | global model 14 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 15 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 16 | # Please provide your model's folder name if there is one 17 | model_path = os.path.join( 18 | os.getenv("AZUREML_MODEL_DIR"), "model/sklearn_regression_model.pkl" 19 | ) 20 | # deserialize the model file back into a sklearn model 21 | model = joblib.load(model_path) 22 | logging.info("Init complete") 23 | 24 | 25 | def run(raw_data): 26 | """ 27 | This function is called for every invocation of the endpoint to perform the actual scoring/prediction. 
28 | In the example we extract the data from the json input and call the scikit-learn model's predict() 29 | method and return the result back 30 | """ 31 | logging.info("model 1: request received") 32 | data = json.loads(raw_data)["data"] 33 | data = numpy.array(data) 34 | result = model.predict(data) 35 | logging.info("Request processed") 36 | return result.tolist() -------------------------------------------------------------------------------- /setup.ps1: -------------------------------------------------------------------------------- 1 | # Create The Service Principal 2 | # WARNING: DO NOT DELETE OUTPUT 3 | 4 | $SubscriptionId=( az account show --query id -o tsv ) 5 | 6 | $main_sp_name="main_sp_"+$(Get-Random -Minimum 1000 -Maximum 9999) 7 | 8 | # use --sdk-auth flag if using GitHub Action Azure Authenticator 9 | $DBX_CREDENTIALS=( az ad sp create-for-rbac -n $main_sp_name --role Owner --scopes /subscriptions/$SubscriptionId --query "{ARM_TENANT_ID:tenant, ARM_CLIENT_ID:appId, ARM_CLIENT_SECRET:password}") 10 | 11 | 12 | # Service Principal Credentials 13 | $DBX_CREDENTIALS=( $DBX_CREDENTIALS | convertfrom-json ) 14 | #echo $DBX_CREDENTIALS 15 | $Client_ID=( $DBX_CREDENTIALS.ARM_CLIENT_ID ) 16 | 17 | 18 | # Retrieve Object IDs 19 | $main_sp_name_obj_id=( az ad sp show --id $Client_ID --query "{roleBeneficiaryObjID:id}" -o tsv ) 20 | 21 | echo "Back Stop Command For Older Azure CLI Command" 22 | 23 | if ($main_sp_name_obj_id -eq "None" ) { $main_sp_name_obj_id=( az ad sp show --id $Client_ID --query "{roleBeneficiaryObjID:objectId}" -o tsv ) } 24 | 25 | 26 | 27 | $User_ObjID=( az ad signed-in-user show --query "{roleBeneficiaryObjID:id}" -o tsv ) 28 | 29 | echo "Back Stop Command For Older Azure CLI Command" 30 | 31 | if ($User_ObjID -eq "None" ) { $User_ObjID=( az ad signed-in-user show --query "{roleBeneficiaryObjID: objectId}" -o tsv ) } 32 | 33 | 34 | 35 | 36 | echo "Update The Variable Files" 37 | $environments = @('sandbox', 'development', 'uat', 'production') 38 | foreach ($environment in $environments) 39 | { 40 | $JsonData = Get-Content infrastructure/databricks/databricks_configs/$environment/repos.json -raw | ConvertFrom-Json 41 | foreach ($Obj in $JsonData.Git_Configuration) 42 | { 43 | ($Obj.git_username = $Git_Configuration ) 44 | } 45 | foreach ($Obj in $JsonData.Repo_Configuration) 46 | { 47 | ($Obj.url = $Repo_ConfigurationURL ) 48 | } 49 | $JsonData | ConvertTo-Json -Depth 4 | set-content infrastructure/databricks/databricks_configs/$environment/repos.json -NoNewline 50 | } 51 | 52 | foreach ($environment in $environments) 53 | { 54 | $JsonData = Get-Content infrastructure/databricks/databricks_configs/$environment/rbac.json -raw | ConvertFrom-Json 55 | $JsonData.RBAC_Assignments | % {if($_.Description -eq 'Your Object ID'){$_.roleBeneficiaryObjID=$User_ObjID}} 56 | $JsonData.RBAC_Assignments | % {if($_.Description -eq 'Databricks SPN'){$_.roleBeneficiaryObjID=$main_sp_name_obj_id}} 57 | $JsonData | ConvertTo-Json -Depth 4 | set-content infrastructure/databricks/databricks_configs/$environment/rbac.json -NoNewline 58 | } 59 | 60 | git add . 61 | git commit . -m 'Demo Deployment Commit' 62 | 63 | git config core.autocrlf false 64 | git rm --cached -r . 
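# The hard reset below re-checks out the working tree, so the core.autocrlf change made above
# takes effect on tracked files before the subsequent pull and push.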
65 | git reset --hard 66 | git pull 67 | git push 68 | 69 | # Secret Configuration 70 | 71 | echo "Credentials Used In Later Step - Do Not Delete" 72 | echo $DBX_CREDENTIALS -------------------------------------------------------------------------------- /src/pkg/dbx_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/src/pkg/dbx_utils/__init__.py -------------------------------------------------------------------------------- /src/pkg/dbx_utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | 4 | def get_databricks_request_headers() -> Dict[str, str]: 5 | """ 6 | Gets the Databricks headers required for API calls using 7 | the Databricks AAD token, Databricks Management token and 8 | Databricks Workspace ID from the environment variables. 9 | 10 | :return: databricks_req_headers 11 | :type: dict 12 | """ 13 | workspace_id = os.environ.get("WORKSPACE_ID") 14 | databricks_aad_token = os.environ.get("DATABRICKS_AAD_TOKEN") 15 | databricks_mgmt_token = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 16 | 17 | databricks_req_headers = { 18 | 'Authorization': f'Bearer {databricks_aad_token}', 19 | 'X-Databricks-Azure-SP-Management-Token': f'{databricks_mgmt_token}', 20 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{workspace_id}', 21 | 'Content-Type': 'application/json' 22 | } 23 | return databricks_req_headers -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_azure_login.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is used to login to Azure using a service principal 3 | """ 4 | import os 5 | import subprocess 6 | 7 | 8 | ARM_CLIENT_ID = os.environ['ARM_CLIENT_ID'] 9 | ARM_CLIENT_SECRET = os.environ['ARM_CLIENT_SECRET'] 10 | ARM_TENANT_ID = os.environ['ARM_TENANT_ID'] 11 | 12 | 13 | def run_cmd(cmd): 14 | """ 15 | Run a command and return the output as a list of lines 16 | shell=false for devops pipelines 17 | """ 18 | process = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, shell=False) 19 | output = process.stdout.decode().split('\n') 20 | #print(output) 21 | output = [ 22 | line.strip('\n').strip('\r').strip('"') for line in output 23 | if line.strip('\n').strip('\r') 24 | ] 25 | #import pdb; pdb.set_trace() 26 | #print(f"Return Code: {process.returncode}"). 
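# Note: subprocess.run() is called with check=True above, so a non-zero exit code already raises
# CalledProcessError before reaching this point; the explicit check below is a defensive fallback.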
27 | if process.returncode != 0: 28 | raise RuntimeError('\n'.join(output)) 29 | return output, process.returncode 30 | 31 | 32 | def start_azure_login(): 33 | """ 34 | Login to Azure using the service principal 35 | """ 36 | az_login_cmd = ["az", "login", "--service-principal", 37 | "-u", ARM_CLIENT_ID, 38 | "-p", ARM_CLIENT_SECRET, 39 | "--tenant", ARM_TENANT_ID 40 | ] 41 | print("Logging In To Azure") 42 | #_, returncode = run_cmd(az_login_cmd) 43 | output, returncode = run_cmd(az_login_cmd) 44 | return returncode 45 | 46 | 47 | if __name__ == '__main__': 48 | returncode = start_azure_login() 49 | print(f"Return Code: {returncode}") 50 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_azure_login.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | az upgrade 4 | echo $ARM_CLIENT_ID 5 | echo $ARM_CLIENT_SECRET 6 | echo $ARM_TENANT_ID 7 | echo $AuthenticationType 8 | 9 | az config set extension.use_dynamic_install=yes_without_prompt 10 | 11 | 12 | echo "Service Principal Authentication" 13 | az login --service-principal -u $ARM_CLIENT_ID -p $ARM_CLIENT_SECRET --tenant $ARM_TENANT_ID 14 | az account list -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_aad_tokens.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | 4 | def createManagementToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL): 5 | """ 6 | Uses Our Service Principal Credentials To Generate Azure Active Directory Tokens 7 | """ 8 | 9 | tokenRequestBody['resource'] = 'https://management.core.windows.net/' 10 | 11 | response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody) 12 | 13 | if response.status_code == 200: 14 | print(response.status_code) 15 | 16 | else: 17 | raise Exception(response.text) 18 | 19 | return response.json()['access_token'] 20 | 21 | def createBearerToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL): 22 | """ 23 | Uses Our Service Principal Credentials To Generate Azure Active Directory Tokens 24 | """ 25 | 26 | tokenRequestBody['resource'] = '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d' 27 | 28 | response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody) 29 | 30 | if response.status_code == 200: 31 | print(response.status_code) 32 | 33 | else: 34 | raise Exception(response.text) 35 | 36 | return response.json()['access_token'] 37 | 38 | 39 | if __name__ == "__main__": 40 | tokenRequestBody = { 41 | 'grant_type': 'client_credentials', 42 | 'client_id': os.environ['ARM_CLIENT_ID'], 43 | 'client_secret': os.environ['ARM_CLIENT_SECRET'] 44 | } 45 | tokenRequestHeaders = {'Content-Type': 'application/x-www-form-urlencoded'} 46 | tokenBaseURL = 'https://login.microsoftonline.com/' + os.environ['ARM_TENANT_ID'] + '/oauth2/token' 47 | 48 | bearerToken = createBearerToken(tokenRequestBody=tokenRequestBody, 49 | tokenRequestHeaders=tokenRequestHeaders, 50 | tokenBaseURL=tokenBaseURL 51 | ) 52 | 53 | managementToken = createManagementToken(tokenRequestBody=tokenRequestBody, 54 | tokenRequestHeaders=tokenRequestHeaders, 55 | tokenBaseURL=tokenBaseURL 56 | ) 57 | 58 | os.environ['DATABRICKS_AAD_TOKEN'] = bearerToken 59 | os.environ['DATABRICKS_MANAGEMENT_TOKEN'] = managementToken 60 | 61 | print("DATABRICKS_AAD_TOKEN",os.environ['DATABRICKS_AAD_TOKEN']) 62 | 
print("DATABRICKS_MANAGEMENT_TOKEN",os.environ['DATABRICKS_MANAGEMENT_TOKEN']) 63 | 64 | with open(os.getenv('GITHUB_ENV'), 'a') as env: 65 | print(f'DATABRICKS_AAD_TOKEN={bearerToken}', file=env) 66 | print(f'DATABRICKS_MANAGEMENT_TOKEN={managementToken}', file=env) 67 | 68 | 69 | 70 | #print("##vso[task.setvariable variable=DATABRICKS_AAD_TOKEN;isOutput=true;]{b}".format(b=bearerToken)) 71 | #print("##vso[task.setvariable variable=DATABRICKS_MANAGEMENT_TOKEN;isOutput=true;]{b}".format(b=managementToken)) -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_aad_tokens.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | DATABRICKS_AAD_TOKEN=$( \ 4 | az account get-access-token \ 5 | --resource $DBX_RESOURCE_ID \ 6 | --query "accessToken" \ 7 | --output tsv \ 8 | ) 9 | 10 | DATABRICKS_MANAGEMENT_TOKEN=$( \ 11 | az account get-access-token \ 12 | --resource "https://management.core.windows.net/" \ 13 | --query "accessToken" \ 14 | --output tsv \ 15 | ) 16 | 17 | echo $DATABRICKS_AAD_TOKEN 18 | echo $DATABRICKS_MANAGEMENT_TOKEN 19 | 20 | echo "##vso[task.setvariable variable="DATABRICKS_MANAGEMENT_TOKEN";isOutput=true;]$DATABRICKS_MANAGEMENT_TOKEN" 21 | echo "##vso[task.setvariable variable="DATABRICKS_AAD_TOKEN";isOutput=true;]$DATABRICKS_AAD_TOKEN" -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_azure_resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import json 4 | 5 | __here__ = os.path.dirname(__file__) 6 | 7 | ENVIRONMENT = os.environ['ENVIRONMENT'] 8 | 9 | 10 | class LoadJson(): 11 | def __init__(self): 12 | self.json_file = 'infrastructure/bicep/params/' + ENVIRONMENT + '/bicep.parameters.json' 13 | 14 | def load_json(self): 15 | with open(self.json_file, 'r') as f: 16 | repos_config = json.load(f) 17 | return repos_config 18 | 19 | def get_param_file_path(self): 20 | return self.load_json()['parameters']['TemplateParamFilePath']['value'] 21 | 22 | def get_template_file_path(self): 23 | return self.load_json()['parameters']['TemplateFilePath']['value'] 24 | 25 | def get_location(self): 26 | return self.load_json()['parameters']['location']['value'] 27 | 28 | 29 | def run_cmd(cmd): 30 | #May Need To Rmove shell=True 31 | process = subprocess.run(cmd, stdout=subprocess.PIPE) 32 | #print(process) 33 | output = process.stdout.decode().split('\n') 34 | #print(output) 35 | output = [line.strip('\n').strip('\r') for line in output] 36 | #print(output) 37 | if process.returncode != 0: 38 | raise RuntimeError('\n'.join(output)) 39 | return output 40 | 41 | 42 | def deploy_azure_resources(): 43 | json_obj = LoadJson() 44 | template_param_file_path = json_obj.get_param_file_path() 45 | template_file_path = json_obj.get_template_file_path() 46 | location = json_obj.get_location() 47 | 48 | az_deploy_cmd = ["az", "deployment", "sub", "create", 49 | "--location", location, 50 | "--template-file", template_file_path, 51 | "--parameters", template_param_file_path, 52 | "--name", ENVIRONMENT, 53 | "--only-show-errors" ] 54 | 55 | 56 | print("Deploying Azure Resources... 
This Make Take A Few Minutes") 57 | run_cmd(az_deploy_cmd) 58 | 59 | 60 | if __name__ == "__main__": 61 | deploy_azure_resources() 62 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_azure_resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $ENVIRONMENT 4 | echo "Ingest JSON File" 5 | JSON=$( jq '.' infrastructure/bicep/params/$ENVIRONMENT/bicep.parameters.json) 6 | 7 | TemplateParamFilePath=$( jq -r '.parameters.TemplateParamFilePath.value' <<< "$JSON") 8 | echo "Parm File Path: $TemplateParamFilePath" 9 | 10 | 11 | TemplateFilePath=$( jq -r '.parameters.TemplateFilePath.value' <<< "$JSON") 12 | echo "File Path: $TemplateFilePath" 13 | 14 | location=$( jq -r '.parameters.location.value' <<< "$JSON") 15 | echo "Location: $location" 16 | 17 | 18 | echo "environment variable: $TemplateParamFilePath" 19 | echo "environment variable: $location" 20 | echo "environment variable: $TemplateFilePath" 21 | # Important to define unique deployment names as conflicts will occur 22 | echo "Create Azure DBX Resource Environments...." 23 | 24 | az deployment sub create \ 25 | --location $location \ 26 | --template-file $TemplateFilePath \ 27 | --parameters $TemplateParamFilePath \ 28 | --name "$ENVIRONMENT" \ 29 | --only-show-errors -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_cluster.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from urllib.error import HTTPError 5 | 6 | import requests 7 | 8 | from dbx_utils.common import get_databricks_request_headers 9 | 10 | 11 | def ingest_cluster_param_file(filename): 12 | 13 | """ 14 | loads the json parameter file containing 15 | databricks cluster configs to build 16 | 17 | :param: filename: param file location 18 | :type: str 19 | 20 | :return: cluster_param_file: dbx cluster definitions 21 | :type: str 22 | """ 23 | with open(filename, 'r', encoding="utf-8") as file: 24 | cluster_param_file = json.load(file) 25 | cluster_param_file = cluster_param_file['Clusters'] 26 | 27 | return cluster_param_file 28 | 29 | 30 | def create_clusters(): 31 | """ 32 | Main script which calls sub functions to create 33 | databricks clusters degined in params file 34 | """ 35 | 36 | environment = os.environ.get("ENVIRONMENT") 37 | 38 | cluster_param_file = ingest_cluster_param_file( 39 | 'infrastructure/databricks/databricks_configs/' 40 | + environment + 41 | '/clusters.json' 42 | ) 43 | 44 | existing_clusters, _ = list_existing_clusters() 45 | 46 | existing_clusters_name_arr = get_cluster_names(existing_clusters) 47 | 48 | for cluster in cluster_param_file: 49 | # Cluster Does Not Exist - Deploy 50 | if cluster['cluster_name'] not in existing_clusters_name_arr: 51 | cluster_status, cluster_id = create_cluster(cluster) 52 | print(f"Cluster Status: {cluster_status}") 53 | manage_cluster_state(cluster_id) 54 | else: 55 | print( 56 | f"Cluster {cluster['cluster_name']} already exists - Skipping." 57 | ) 58 | 59 | 60 | def list_existing_clusters(): 61 | """ 62 | Returns a Json object containing a list 63 | of existing Databricks Clusters. 
64 | 65 | :return: response_content: dbx api response with clusters 66 | :type: str 67 | 68 | :return: status_code: api status code 69 | :type: int 70 | """ 71 | 72 | databricks_req_headers = get_databricks_request_headers() 73 | databricks_instance = os.environ.get("DATABRICKS_INSTANCE") 74 | response = requests.get( 75 | 'https://' + databricks_instance + '/api/2.0/clusters/list', 76 | headers=databricks_req_headers, 77 | timeout=10 78 | ) 79 | 80 | if response.ok: 81 | return response.json(), response.status_code 82 | 83 | raise HTTPError( 84 | response.url, code=response.status_code, msg="Failure", 85 | hdrs=response.headers, fp=response 86 | ) 87 | 88 | 89 | def get_cluster_names(existing_clusters): 90 | """ 91 | Parses JSON object with existing databricks clusters 92 | and returns an array with cluster names 93 | 94 | :param: cluster: json object of existing dbx clusters 95 | :type: str 96 | 97 | :return: existing_clusters_name_arr: array of cluster names 98 | :type: array 99 | """ 100 | existing_clusters_name_arr = [] 101 | 102 | if existing_clusters: # If clusters exist 103 | for existing_cluster in existing_clusters['clusters']: 104 | existing_clusters_name_arr.append(existing_cluster['cluster_name']) 105 | return existing_clusters_name_arr 106 | # If No Clusters Exist, Return Empty Array 107 | return existing_clusters_name_arr 108 | 109 | 110 | def create_cluster(cluster): 111 | """ 112 | Takes json definitions for clusters to create, 113 | and invokes the Databricks Cluster API. 114 | 115 | :param: cluster: cluster definition 116 | :type: str 117 | 118 | :return: status code: response status for api call 119 | :type: int 120 | 121 | :return: cluster_id: cluster id for newly created databricks cluster 122 | :type: str 123 | """ 124 | databricks_req_headers = get_databricks_request_headers() 125 | databricks_instance = os.environ.get("DATABRICKS_INSTANCE") 126 | response = requests.post( 127 | 'https://' + databricks_instance + '/api/2.0/clusters/create', 128 | headers=databricks_req_headers, 129 | json=cluster, 130 | timeout=10 131 | ) 132 | 133 | if response.ok: 134 | cluster_id = response.json()["cluster_id"] 135 | return response.status_code, cluster_id 136 | 137 | raise HTTPError( 138 | response.text, 139 | code=response.status_code, 140 | msg="Failure", 141 | hdrs=response.headers, 142 | fp=response 143 | ) 144 | 145 | 146 | def manage_cluster_state(cluster_id): 147 | """ 148 | Function contuninally checks cluster state until 149 | cluster is Running, or Fails to Start 150 | 151 | :param: cluster_id: clusterid for the Databricks Cluster 152 | :type: str 153 | """ 154 | 155 | await_cluster = True 156 | start_time = time.time() 157 | loop_time = 1200 # 20 Minutes 158 | while await_cluster: 159 | current_time = time.time() 160 | elapsed_time = current_time - start_time 161 | if elapsed_time > loop_time: 162 | raise Exception(f'Error: Loop took over {loop_time} seconds to run.') 163 | if get_databricks_cluster_info(cluster_id)['state'] == 'TERMINATED': 164 | print('Starting Terminated Cluster') 165 | raise ValueError("Failed to create cluster, cluster teminated") 166 | if get_databricks_cluster_info(cluster_id)['state'] == 'RESTARTING': 167 | print('Cluster is Restarting') 168 | time.sleep(60) 169 | elif get_databricks_cluster_info(cluster_id)['state'] == 'PENDING': 170 | print('Cluster is Pending Start') 171 | time.sleep(60) 172 | else: 173 | print('Cluster is Running') 174 | await_cluster = False 175 | 176 | 177 | def get_databricks_cluster_info(cluster_id): 178 | """ 179 | 
Returns an object revealing the Databricks Cluster State 180 | "Terminated", "Restarting", or "Pending" 181 | 182 | :param: cluster_id: clusterid for the Databricks Cluster 183 | :type: str 184 | 185 | :return: api response object 186 | :type: str 187 | """ 188 | databricks_req_headers = get_databricks_request_headers() 189 | databricks_instance = os.environ.get("DATABRICKS_INSTANCE") 190 | databricks_cluster_id = {'cluster_id': cluster_id} 191 | 192 | response = requests.get( 193 | 'https://' + databricks_instance + '/api/2.0/clusters/get', 194 | headers=databricks_req_headers, 195 | params=databricks_cluster_id, 196 | timeout=10 197 | ) 198 | 199 | if response.ok: 200 | return response.json() 201 | 202 | raise HTTPError( 203 | response.text, 204 | code=response.status_code, 205 | msg="Failure", 206 | hdrs=response.headers, 207 | fp=response 208 | ) 209 | 210 | 211 | def main(): 212 | """ 213 | Main function to invoke cluster creation 214 | """ 215 | 216 | create_clusters() 217 | 218 | 219 | if __name__ == "__main__": 220 | main() 221 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_databricks_token.sh: -------------------------------------------------------------------------------- 1 | SECRET_NAME="dbkstoken" 2 | # Check if secret exists 3 | 4 | az upgrade 5 | 6 | az account set --subscription $SUBSCRIPTION_ID 7 | 8 | 9 | SECRET_EXISTS=$(az keyvault secret list \ 10 | --vault-name $AZ_KEYVAULT_NAME \ 11 | --query "contains([].id, \ 12 | 'https://$AZ_KEYVAULT_NAME.vault.azure.net/secrets/$SECRET_NAME')") 13 | 14 | echo "secret exists: $SECRET_EXISTS" 15 | 16 | if [ $SECRET_EXISTS == true ]; then 17 | echo "Secret '$SECRET_NAME' exists! fetching..." 18 | DATABRICKS_TOKEN=$(az keyvault secret show \ 19 | --name $SECRET_NAME \ 20 | --vault-name $AZ_KEYVAULT_NAME \ 21 | --query "value" \ 22 | -o tsv ) 23 | 24 | #echo "Secret Value: $DATABRICKS_TOKEN" 25 | 26 | # if [[ $DevOps_Agent == "GitHub" ]]; then 27 | # echo "DATABRICKS_TOKEN=$DATABRICKS_TOKEN" >> $GITHUB_ENV 28 | # else 29 | # echo "##vso[task.setvariable variable="DATABRICKS_TOKEN";isOutput=true;]$DATABRICKS_TOKEN" 30 | # fi 31 | 32 | 33 | else 34 | echo "Secret '$SECRET_NAME' Do Not exist! Creating PAT Token & Store In Key Vault..." 35 | # Must Assign SP Minimum Contributor Permissions. Must also give the SP Key Vault Administrator Privileges (Need to Set these in YAML) 36 | 37 | PAT_TOKEN_RESPONSE=$(curl -X POST \ 38 | -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 39 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 40 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" -d \ 41 | '{ 42 | "lifetime_seconds": "30000000", 43 | "comment": "Token For Databricks" 44 | }' https://$DATABRICKS_INSTANCE/api/2.0/token/create ) 45 | 46 | echo "PAT Token Creation Response...." 47 | #echo $PAT_TOKEN_RESPONSE 48 | 49 | DATABRICKS_PAT_TOKEN=$(jq .token_value -r <<< "$PAT_TOKEN_RESPONSE") 50 | echo "PAT Token Creation...." 51 | #echo $DATABRICKS_PAT_TOKEN 52 | 53 | echo "Store PAT In Key Vault...." 54 | az keyvault secret set \ 55 | --vault-name $AZ_KEYVAULT_NAME \ 56 | --name $SECRET_NAME \ 57 | --value $DATABRICKS_PAT_TOKEN 58 | 59 | #echo "Databricks Token As Environment Variable..." 
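# Descriptive note: the PAT above is minted by POSTing to the Databricks /api/2.0/token/create endpoint,
# authenticating with the AAD bearer token plus the management-endpoint token headers, and is then stored
# in Key Vault under the "dbkstoken" secret so an existing token is reused (not recreated) on later runs.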
60 | 61 | #if [[ $DevOps_Agent == "GitHub" ]]; then 62 | # echo "DATABRICKS_AAD_TOKEN=$DATABRICKS_AAD_TOKEN" >> $GITHUB_ENV 63 | #else 64 | # echo "##vso[task.setvariable variable="DATABRICKS_AAD_TOKEN";isOutput=true;]$DATABRICKS_AAD_TOKEN" 65 | #fi 66 | fi 67 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_key_vault_secrets.sh: -------------------------------------------------------------------------------- 1 | 2 | ###################################################################################################################################################################// 3 | ## Create Key Vault Secrets 4 | ###################################################################################################################################################################// 5 | SECRET_VALUE=$ARM_CLIENT_ID 6 | SECRET_NAME="ARMCLIENTID" 7 | 8 | 9 | 10 | SECRET_EXISTS=$(az keyvault secret list \ 11 | --vault-name $AZ_KEYVAULT_NAME \ 12 | --query "contains([].id, \ 13 | 'https://$AZ_KEYVAULT_NAME.vault.azure.net/secrets/$SECRET_NAME')") 14 | 15 | echo "secret exists: $SECRET_EXISTS" 16 | 17 | if [ $SECRET_EXISTS == true ]; then 18 | echo "Secret '$SECRET_NAME' exists! fetching..." 19 | SECRET_VALUE=$(az keyvault secret show \ 20 | --name $SECRET_NAME \ 21 | --vault-name $AZ_KEYVAULT_NAME \ 22 | --query "value") 23 | 24 | echo "Secret Value: $SECRET_VALUE" 25 | 26 | else 27 | echo "Secret '$SECRET_NAME' Do Not exist! Creating PAT Token & Store In Key Vault..." 28 | # Must Assign SP Minimum Contributor Permissions. Must also give the SP Key Vault Administrator Privileges (Need to Set these in YAML) 29 | echo "Store Secret In Key Vault...." 30 | az keyvault secret set \ 31 | --vault-name $AZ_KEYVAULT_NAME \ 32 | --name $SECRET_NAME \ 33 | --value $SECRET_VALUE 34 | fi 35 | 36 | 37 | ###################################################################################################################################################################// 38 | ## ARM_TENANT 39 | ###################################################################################################################################################################// 40 | 41 | 42 | SECRET_VALUE=$ARM_TENANT_ID 43 | SECRET_NAME="ARMTENANTID" 44 | # Check if secret exists 45 | SECRET_EXISTS=$(az keyvault secret list \ 46 | --vault-name $AZ_KEYVAULT_NAME \ 47 | --query "contains([].id, \ 48 | 'https://$AZ_KEYVAULT_NAME.vault.azure.net/secrets/$SECRET_NAME')") 49 | 50 | echo "secret exists: $SECRET_EXISTS" 51 | 52 | if [ $SECRET_EXISTS == true ]; then 53 | echo "Secret '$SECRET_NAME' exists! fetching..." 54 | SECRET_VALUE=$(az keyvault secret show \ 55 | --name $SECRET_NAME \ 56 | --vault-name $AZ_KEYVAULT_NAME \ 57 | --query "value") 58 | 59 | echo "Secret Value: $SECRET_VALUE" 60 | 61 | else 62 | echo "Secret '$SECRET_NAME' Do Not exist! Creating PAT Token & Store In Key Vault..." 63 | # Must Assign SP Minimum Contributor Permissions. Must also give the SP Key Vault Administrator Privileges (Need to Set these in YAML) 64 | echo "Store Secret In Key Vault...." 
65 | az keyvault secret set \ 66 | --vault-name $AZ_KEYVAULT_NAME \ 67 | --name $SECRET_NAME \ 68 | --value $SECRET_VALUE 69 | fi 70 | 71 | 72 | ###################################################################################################################################################################// 73 | ## ARM_Client_Secret 74 | ###################################################################################################################################################################// 75 | 76 | 77 | SECRET_VALUE=$ARM_CLIENT_SECRET 78 | SECRET_NAME="ARMCLIENTSECRET" 79 | # Check if secret exists 80 | SECRET_EXISTS=$(az keyvault secret list \ 81 | --vault-name $AZ_KEYVAULT_NAME \ 82 | --query "contains([].id, \ 83 | 'https://$AZ_KEYVAULT_NAME.vault.azure.net/secrets/$SECRET_NAME')") 84 | 85 | echo "secret exists: $SECRET_EXISTS" 86 | 87 | if [ $SECRET_EXISTS == true ]; then 88 | echo "Secret '$SECRET_NAME' exists! fetching..." 89 | SECRET_VALUE=$(az keyvault secret show \ 90 | --name $SECRET_NAME \ 91 | --vault-name $AZ_KEYVAULT_NAME \ 92 | --query "value") 93 | 94 | echo "Secret Value: $SECRET_VALUE" 95 | 96 | else 97 | echo "Secret '$SECRET_NAME' Do Not exist! Creating PAT Token & Store In Key Vault..." 98 | # Must Assign SP Minimum Contributor Permissions. Must also give the SP Key Vault Administrator Privileges (Need to Set these in YAML) 99 | echo "Store Secret In Key Vault...." 100 | az keyvault secret set \ 101 | --vault-name $AZ_KEYVAULT_NAME \ 102 | --name $SECRET_NAME \ 103 | --value $SECRET_VALUE 104 | fi -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_repo_folder.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import os 4 | import json 5 | from dotenv import load_dotenv 6 | 7 | 8 | load_dotenv(".env") # load environment variables 9 | 10 | def _ingest_repo_param_file(filename): 11 | """ 12 | Ingests the Json Parameters File for Databricks Repo Creation 13 | """ 14 | with open(filename, 'r') as file: 15 | 16 | repo_param_file = json.load(file)['Repo_Configuration'] 17 | 18 | return repo_param_file 19 | 20 | def create_databricks_repos(postjson): 21 | """ 22 | Takes Json object for cluster creation, and invokes the Databricks API. 
23 | """ 24 | 25 | ARM_CLIENT_ID = os.environ.get("ARM_CLIENT_ID") 26 | WORKSPACE_ID = os.environ.get("WORKSPACE_ID") 27 | DATABRICKS_INSTANCE = os.environ.get("DATABRICKS_INSTANCE") 28 | DATABRICKS_AAD_TOKEN = os.environ.get("DATABRICKS_AAD_TOKEN") 29 | DATABRICKS_MANAGEMENT_TOKEN = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 30 | 31 | 32 | DBRKS_REQ_HEADERS = { 33 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 34 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 35 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 36 | 'Content-Type': 'application/json' 37 | } 38 | 39 | path = postjson['path'] 40 | #import pdb; pdb.set_trace() 41 | 42 | newData = { 43 | "path": "/Repos/"+ ARM_CLIENT_ID + "/" + path 44 | } 45 | 46 | postjson.update(newData) 47 | 48 | print("Updated Repo Json String") 49 | print(postjson) 50 | 51 | response = requests.post( 52 | 'https://' + DATABRICKS_INSTANCE + '/api/2.0/repos', headers=DBRKS_REQ_HEADERS, json=postjson 53 | ) 54 | 55 | #400: Already Exists 56 | print(f"Response: {response.content}") 57 | 58 | if response.status_code == 200 or response.status_code == 400: 59 | print(f"Status Code: {response.status_code}") 60 | else: 61 | raise Exception(response.status_code) 62 | 63 | 64 | def main(): 65 | 66 | ENVIRONMENT = os.environ.get("ENVIRONMENT") 67 | 68 | file_name = 'infrastructure/databricks/databricks_configs/' + ENVIRONMENT + '/repos.json' 69 | repo_param_file = _ingest_repo_param_file(file_name) 70 | 71 | # Extract array from Json object 72 | 73 | print(f"Repos To Connect {repo_param_file}") 74 | 75 | for repo in repo_param_file: 76 | print(f"Repo {repo}") 77 | create_databricks_repos(repo) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_repo_folder.sh: -------------------------------------------------------------------------------- 1 | echo "Ingest JSON File" 2 | JSON=$( jq '.' infrastructure/databricks/databricks_configs/$ENVIRONMENT/repos.json) 3 | 4 | #echo "${JSON}" | jq 5 | 6 | 7 | echo "User Folders In Databricks Repos Will Be Described Using An Email Address... 
e.g Ciaranh@Microsoft.com " 8 | echo "The DevOps Agent SP Which Is Also A User, However Its Databricks Repo User Folder is Named After The AppID: $ARM_CLIENT_ID" 9 | echo "All Folders Defined In The JSON Parameters Folder Will Be Appended To /Repos//" 10 | 11 | for row in $(echo "${JSON}" | jq -r '.Repo_Configuration[] | @base64'); do 12 | _jq() { 13 | echo ${row} | base64 --decode | jq -r ${1} 14 | } 15 | 16 | JSON_STRING=$( jq -n -c \ 17 | --arg url "$(_jq '.url')" \ 18 | --arg pr "$(_jq '.provider')" \ 19 | --arg pa "/Repos/$ARM_CLIENT_ID/$(_jq '.path')" \ 20 | '{url: $url, 21 | provider: $pr, 22 | path: $pa}' ) 23 | 24 | #echo "JSON -D String " 25 | #echo $JSON_STRING 26 | 27 | echo $DATABRICKS_AAD_TOKEN 28 | echo $DATABRICKS_MANAGEMENT_TOKEN 29 | 30 | CREATE_REPO_RESPONSE=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 31 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 32 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 33 | -H 'Content-Type: application/json' \ 34 | -d $JSON_STRING \ 35 | https://$DATABRICKS_INSTANCE/api/2.0/repos ) 36 | 37 | echo "Repo Response" 38 | echo $CREATE_REPO_RESPONSE 39 | done -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_role_based_access.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | #echo "Resource Group Name: $RESOURCE_GROUP_NAME" 5 | echo "ENVIRONMENT: $ENVIRONMENT" 6 | RESOURCE_GROUP_ID=$( az group show -n $RESOURCE_GROUP_NAME --query id -o tsv ) 7 | 8 | echo "Ingest JSON File" 9 | JSON=$( jq '.' infrastructure/databricks/databricks_configs/$ENVIRONMENT/rbac.json) 10 | #echo "${JSON}" | jq 11 | 12 | for row in $(echo "${JSON}" | jq -r '.RBAC_Assignments[] | @base64'); do 13 | _jq() { 14 | echo ${row} | base64 --decode | jq -r ${1} 15 | } 16 | ROLES_ARRAY="$(_jq '.roles')" 17 | #echo $ROLES_ARRAY 18 | 19 | # Before: [ "Contributor", "DBX_Custom_Role", "Key Vault Administrator" ] 20 | # xargs trims whitespace on either side. -n removes newline characters. 
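# Illustrative (assumed) shape of an entry in rbac.json's RBAC_Assignments array, inferred from the jq
# lookups used in this loop -- the field values below are placeholders, not real IDs:
#   { "roles": ["Contributor"], "roleBeneficiaryObjID": "<AAD object id>", "principalType": "ServicePrincipal", "scope": "<optional resource scope>" }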
21 | ROLES_ARRAY_PARSED=$( echo $ROLES_ARRAY | jq -r | tr -d "[]" | tr -d \'\" | xargs echo -n ) 22 | # After: Contributor, DBX_Custom_Role, Key Vault Administrator 23 | #echo $ROLES_ARRAY_PARSED 24 | Field_Separator=$IFS 25 | IFS=, 26 | for ROLE in $ROLES_ARRAY_PARSED; do 27 | ROLE=$( echo $ROLE | xargs ) 28 | 29 | az role assignment create \ 30 | --role "$ROLE" \ 31 | --assignee-object-id $(_jq '.roleBeneficiaryObjID') \ 32 | --assignee-principal-type "$(_jq '.principalType')" \ 33 | --scope "$RESOURCE_GROUP_ID" \ 34 | -o none 35 | #--scope "$(_jq '.scope')" 36 | 37 | done 38 | IFS=$Field_Separator 39 | done -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_secret_scopes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import requests 4 | 5 | __here__ = os.path.dirname(__file__) 6 | 7 | RESOURCE_GROUP_NAME = os.environ['RESOURCE_GROUP_NAME'] 8 | DATABRICKS_INSTANCE = os.environ['DATABRICKS_INSTANCE'] 9 | WORKSPACE_ID = os.environ['WORKSPACE_ID'] 10 | SUBSCRIPTION_ID = os.environ['SUBSCRIPTION_ID'] 11 | DATABRICKS_AAD_TOKEN = os.environ['DATABRICKS_AAD_TOKEN'] 12 | DATABRICKS_MANAGEMENT_TOKEN = os.environ['DATABRICKS_MANAGEMENT_TOKEN'] 13 | ARM_CLIENT_ID = os.environ['ARM_CLIENT_ID'] 14 | ARM_CLIENT_SECRET = os.environ['ARM_CLIENT_SECRET'] 15 | ARM_TENANT_ID = os.environ['ARM_TENANT_ID'] 16 | AML_WS_NAME = os.environ['AML_WS_NAME'] 17 | 18 | 19 | DBRKS_REQ_HEADERS = { 20 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 21 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 22 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 23 | 'Content-Type': 'application/json' 24 | } 25 | 26 | 27 | 28 | def run_cmd(cmd): 29 | #May Need To Rmove shell=True 30 | process = subprocess.run(cmd, stdout=subprocess.PIPE) 31 | output = process.stdout.decode().split('\n') 32 | #print(output) 33 | output = [line.strip('\n').strip('\r') for line in output] 34 | 35 | 36 | #print(f"Return Code: {process.returncode}") 37 | if process.returncode != 0: 38 | raise RuntimeError('\n'.join(output)) 39 | return output 40 | 41 | def get_app_insight_name(): 42 | cmd = ["az", "resource", "list", "-g", RESOURCE_GROUP_NAME, "--resource-type", "microsoft.insights/components", "--query", "[].name", "-o", "tsv"] 43 | name = run_cmd(cmd) 44 | return name 45 | 46 | 47 | def get_app_insight_key(name): 48 | cmd = ["az", "monitor", "app-insights", "component", "show", "-g", RESOURCE_GROUP_NAME, "-a", name, "--query", "connectionString", "-o", "tsv"] 49 | key = run_cmd(cmd) 50 | return key 51 | 52 | 53 | def create_secret_scopes(scope_name=str, initial_manage_principal=str): 54 | """ 55 | Takes Json object for cluster creation, and invokes the Databricks API. 56 | """ 57 | postjson = { 58 | "scope": scope_name, 59 | "initial_manage_principal": initial_manage_principal 60 | } 61 | 62 | response = requests.post( 63 | 'https://' + DATABRICKS_INSTANCE + '/api/2.0/secrets/scopes/create', headers=DBRKS_REQ_HEADERS, json=postjson 64 | ) 65 | 66 | #print(response.status_code) 67 | #if response.status_code != 200: 68 | # raise Exception(response.text) 69 | 70 | #print(response.json()) 71 | 72 | def insert_secret(secret_value=str, scope_name=str, key=str): 73 | """ 74 | Takes Json object for cluster creation, and invokes the Databricks API. 
75 |     """ 76 |     postjson = { 77 |         "scope": scope_name, 78 |         "key": key, 79 |         "string_value": secret_value 80 |     } 81 | 82 |     response = requests.post( 83 |         'https://' + DATABRICKS_INSTANCE + '/api/2.0/secrets/put', headers=DBRKS_REQ_HEADERS, json=postjson 84 |     ) 85 |     #print(response.status_code) 86 |     if response.status_code != 200: 87 |         raise Exception(response.text) 88 | 89 |     #print(response.json()) 90 | 91 | 92 | if __name__ == '__main__': 93 |     app_insight_name = get_app_insight_name()[0] 94 |     #print(app_insight_name) 95 |     app_insight_key = get_app_insight_key(app_insight_name)[0] 96 |     #print(app_insight_key) 97 | 98 | 99 |     # Create Secret Scopes 100 |     create_secret_scopes(scope_name="DBX_SP_Credentials", initial_manage_principal="users") 101 |     create_secret_scopes(scope_name="AzureResourceSecrets", initial_manage_principal="users") 102 | 103 |     # Insert Secrets into Secret Scope "DBX_SP_Credentials" 104 |     insert_secret(secret_value=ARM_CLIENT_ID, scope_name="DBX_SP_Credentials", key="DBX_SP_Client_ID") 105 |     insert_secret(secret_value=ARM_CLIENT_SECRET, scope_name="DBX_SP_Credentials", key="DBX_SP_Client_Secret") 106 |     insert_secret(secret_value=ARM_TENANT_ID, scope_name="DBX_SP_Credentials", key="DBX_SP_Tenant_ID") 107 |     insert_secret(secret_value=SUBSCRIPTION_ID, scope_name="DBX_SP_Credentials", key="SUBSCRIPTION_ID") 108 | 109 |     # Insert Secrets into Secret Scope "AzureResourceSecrets" 110 |     insert_secret(secret_value=app_insight_key, scope_name="AzureResourceSecrets", key="AppInsightsKey") 111 |     insert_secret(secret_value=RESOURCE_GROUP_NAME, scope_name="AzureResourceSecrets", key="RESOURCE_GROUP_NAME") 112 |     insert_secret(secret_value=AML_WS_NAME, scope_name="AzureResourceSecrets", key="AML_WS_NAME") 113 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_secret_scopes.sh: -------------------------------------------------------------------------------- 1 | az config set extension.use_dynamic_install=yes_without_prompt 2 | az extension add --name application-insights 3 | 4 | echo $RESOURCE_GROUP_NAME 5 | echo $DATABRICKS_INSTANCE 6 | echo $WORKSPACE_ID 7 | echo $SUBSCRIPTION_ID 8 | 9 | APP_INSIGHT_NAME=$(az resource list \ 10 |     -g $RESOURCE_GROUP_NAME \ 11 |     --resource-type 'microsoft.insights/components' \ 12 |     --query [].name \ 13 |     -o tsv ) 14 | 15 | APP_INSIGHT_INSTRUMENT_KEY=$( az monitor app-insights component show \ 16 |     -g $RESOURCE_GROUP_NAME \ 17 |     -a $APP_INSIGHT_NAME \ 18 |     --query connectionString ) 19 | 20 | echo "Test" 21 | 22 | echo $APP_INSIGHT_NAME 23 | echo $APP_INSIGHT_INSTRUMENT_KEY 24 | echo $SUBSCRIPTION_ID 25 | 26 | echo "Creating Secret Scopes...." 27 | 28 | echo "Create DBX_SP_Credentials Scope...." 29 | 30 | Create_Secret_Scope=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 31 |     -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 32 |     -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 33 |     -H 'Content-Type: application/json' -d \ 34 |     '{ 35 |         "scope": "DBX_SP_Credentials", 36 |         "initial_manage_principal": "users" 37 |     }' https://$DATABRICKS_INSTANCE/api/2.0/secrets/scopes/create ) 38 | 39 | echo "Inserting Service Principal + Other Secrets Into Scope....
" 40 | 41 | 42 | 43 | 44 | JSON_STRING=$( jq -n -c \ 45 | --arg scope "DBX_SP_Credentials" \ 46 | --arg key "DBX_SP_Client_Secret" \ 47 | --arg value "$ARM_CLIENT_SECRET" \ 48 | '{ 49 | scope: $scope, 50 | key: $key, 51 | string_value: $value 52 | }' ) 53 | 54 | echo $JSON_STRING 55 | 56 | Create_DBX_Client_Secret=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 57 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 58 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 59 | -H 'Content-Type: application/json' \ 60 | -d $JSON_STRING \ 61 | https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 62 | 63 | 64 | 65 | JSON_STRING=$( jq -n -c \ 66 | --arg scope "DBX_SP_Credentials" \ 67 | --arg key "DBX_SP_ClientID" \ 68 | --arg value "$ARM_CLIENT_ID" \ 69 | '{ 70 | scope: $scope, 71 | key: $key, 72 | string_value: $value 73 | }' ) 74 | echo $JSON_STRING 75 | 76 | Create_DBX_ClientID_Secret=$(curl -X POST \ 77 | -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 78 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 79 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 80 | -H 'Content-Type: application/json' \ 81 | -d $JSON_STRING \ 82 | https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 83 | 84 | 85 | 86 | JSON_STRING=$( jq -n -c --arg scope "DBX_SP_Credentials" --arg key "DBX_SP_TenantID" --arg value "$ARM_TENANT_ID" \ 87 | '{ 88 | scope: $scope, 89 | key: $key, 90 | string_value: $value 91 | }' ) 92 | 93 | echo $JSON_STRING 94 | 95 | Create_DBX_TenantID_Secret=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 96 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 97 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 98 | -H 'Content-Type: application/json' \ 99 | -d $JSON_STRING \ 100 | https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 101 | 102 | 103 | 104 | JSON_STRING=$( jq -n -c --arg scope "DBX_SP_Credentials" --arg key "SUBSCRIPTION_ID" --arg value "$SUBSCRIPTION_ID" \ 105 | '{ 106 | scope: $scope, 107 | key: $key, 108 | string_value: $value 109 | }' ) 110 | 111 | echo $JSON_STRING 112 | 113 | CREATE_SUBSCRIPTIONID=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 114 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 115 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 116 | -H 'Content-Type: application/json' \ 117 | -d $JSON_STRING \ 118 | https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 119 | 120 | 121 | 122 | 123 | 124 | echo "Create Azure Resources Secrets Scope...." 125 | 126 | Create_Secret_Scope=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 127 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 128 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 129 | -H 'Content-Type: application/json' -d \ 130 | '{ 131 | "scope": "AzureResourceSecrets", 132 | "initial_manage_principal": "users" 133 | }' https://$DATABRICKS_INSTANCE/api/2.0/secrets/scopes/create ) 134 | 135 | #There can be encoding problems passing some variables directly into the api request. 
Use JSON_STRING below with jq to solve this issue 136 | JSON_STRING=$( jq -n -c --arg scope "AzureResourceSecrets" --arg key "appi_ik" --arg value "$APP_INSIGHT_INSTRUMENT_KEY" \ 137 |     '{ 138 |         scope: $scope, 139 |         key: $key, 140 |         string_value: $value 141 |     }' ) 142 | 143 | Create_APP_INSIGHT_INSTRUMENT_KEY_Secret=$(curl -X POST \ 144 |     -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 145 |     -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 146 |     -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 147 |     -H 'Content-Type: application/json' \ 148 |     -d $JSON_STRING \ 149 |     https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 150 | 151 | 152 | JSON_STRING=$( jq -n -c --arg scope "AzureResourceSecrets" --arg key "RESOURCE_GROUP_NAME" --arg value "$RESOURCE_GROUP_NAME" \ 153 |     '{ 154 |         scope: $scope, 155 |         key: $key, 156 |         string_value: $value 157 |     }' ) 158 | 159 | CREATE_RESOURCE_GROUP_NAME_SECRET=$(curl -X POST \ 160 |     -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 161 |     -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 162 |     -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 163 |     -H 'Content-Type: application/json' \ 164 |     -d $JSON_STRING \ 165 |     https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 166 | 167 | 168 | JSON_STRING=$( jq -n -c --arg scope "AzureResourceSecrets" --arg key "AML_WS_NAME" --arg value "$AML_WS_NAME" \ 169 |     '{ 170 |         scope: $scope, 171 |         key: $key, 172 |         string_value: $value 173 |     }' ) 174 | 175 | CREATE_AML_WS_NAME_SECRET=$(curl -X POST \ 176 |     -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 177 |     -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 178 |     -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 179 |     -H 'Content-Type: application/json' \ 180 |     -d $JSON_STRING \ 181 |     https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 182 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_git_configuration.py: -------------------------------------------------------------------------------- 1 | 2 | # If You Want To Run A Job Which Is Linked To A Git Repo, The Service Principal Will Run The Job As It Will Be The Owner... 3 | # ... The Service Principal, Without Receiving Git Authentication, Will Not Be Able To Access The Repo Files Which... 4 | # ... The Job Needs.
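# Illustrative (assumed) shape of a Git_Configuration entry in repos.json; the script below appends the
# "personal_access_token" field before calling the Databricks /api/2.0/git-credentials endpoint:
#   {"git_provider": "gitHub", "git_username": "<github-user>"}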
5 | 6 | 7 | import requests 8 | import os 9 | import json 10 | 11 | 12 | def configureGit(gitConfig, workspaceId, databricksInstance, bearerToken, managementToken, githubToken, environment): 13 | 14 | DBRKS_REQ_HEADERS = { 15 | 'Authorization': f'Bearer {bearerToken}', 16 | 'X-Databricks-Azure-SP-Management-Token': f'{managementToken}', 17 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{workspaceId}', 18 | 'Content-Type': 'application/json' 19 | } 20 | 21 | newData = { 22 | "personal_access_token": githubToken 23 | } 24 | 25 | gitConfig.update(newData) 26 | print(gitConfig) 27 | print(DBRKS_REQ_HEADERS) 28 | 29 | response = requests.post('https://' + databricksInstance + '/api/2.0/git-credentials', headers=DBRKS_REQ_HEADERS, json=gitConfig) 30 | print(response) 31 | print(response.json()) 32 | 33 | if response.status_code != 200: 34 | 35 | response = requests.get('https://' + databricksInstance + '/api/2.0/git-credentials', headers=DBRKS_REQ_HEADERS) 36 | print(response.json()) 37 | credential = response.json()["credentials"][0]["credential_id"] 38 | print(f"Credential is {credential}") 39 | response = requests.patch('https://' + databricksInstance + '/api/2.0/git-credentials/'+ str(credential), headers=DBRKS_REQ_HEADERS, json=gitConfig) 40 | 41 | print(response.json()) 42 | 43 | if __name__ == "__main__": 44 | 45 | with open('infrastructure/databricks/databricks_configs/' + os.environ['ENVIRONMENT'] +'/repos.json', 'r') as f: 46 | json = json.load(f) 47 | 48 | gitConfigs = json['Git_Configuration'] 49 | #print(gitConfigs) 50 | 51 | #print(os.environ['WORKSPACE_ID']) 52 | #print(os.environ['DATABRICKS_INSTANCE']) 53 | #print(os.environ['DATABRICKS_AAD_TOKEN']) 54 | #print(os.environ['DATABRICKS_MANAGEMENT_TOKEN']) 55 | #print(os.environ['PAT_GITHUB']) 56 | #print(os.environ['ENVIRONMENT']) 57 | for gitConfig in gitConfigs: 58 | response = configureGit( 59 | gitConfig=gitConfig, 60 | workspaceId=os.environ['WORKSPACE_ID'], 61 | databricksInstance=os.environ['DATABRICKS_INSTANCE'], 62 | bearerToken=os.environ['DATABRICKS_AAD_TOKEN'], 63 | managementToken=os.environ['DATABRICKS_MANAGEMENT_TOKEN'], 64 | githubToken=os.environ['PAT_GITHUB'], 65 | environment=os.environ['ENVIRONMENT']) -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_repo_pull.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import json 4 | from dotenv import load_dotenv 5 | 6 | 7 | load_dotenv(".env") # load environment variables 8 | 9 | 10 | def _ingest_repo_param_file(filename): 11 | """ 12 | Ingests the Json Parameters File for Repo Pull 13 | """ 14 | with open(filename, 'r') as file: 15 | 16 | repo_param_file = json.load(file)['Repo_Configuration'] 17 | 18 | return repo_param_file 19 | 20 | 21 | def get_repos_with_management_permissions(): 22 | """ 23 | Invokes Databricks API to get all repos with management permissions 24 | """ 25 | 26 | WORKSPACE_ID = os.environ.get("WORKSPACE_ID") 27 | DATABRICKS_INSTANCE = os.environ.get("DATABRICKS_INSTANCE") 28 | DATABRICKS_AAD_TOKEN = os.environ.get("DATABRICKS_AAD_TOKEN") 29 | DATABRICKS_MANAGEMENT_TOKEN = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 30 | 31 | 32 | DBRKS_REQ_HEADERS = { 33 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 34 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 35 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 36 | 'Content-Type': 'application/json' 37 | } 38 | 39 | response 
= requests.get( 40 | 'https://' + DATABRICKS_INSTANCE + '/api/2.0/repos', headers=DBRKS_REQ_HEADERS 41 | ) 42 | 43 | status_code = response.status_code 44 | repos_with_management_permissions = response.json() 45 | 46 | if response.status_code != 200: 47 | raise Exception(response.status_code) 48 | else: 49 | repos_with_management_permissions = repos_with_management_permissions['repos'] 50 | return repos_with_management_permissions, status_code 51 | 52 | 53 | def update_repo(repo_id, update_branch): 54 | """ 55 | Invoked Databricks API to update repo 56 | """ 57 | 58 | repo_id = str(repo_id) 59 | 60 | WORKSPACE_ID = os.environ.get("WORKSPACE_ID") 61 | DATABRICKS_INSTANCE = os.environ.get("DATABRICKS_INSTANCE") 62 | DATABRICKS_AAD_TOKEN = os.environ.get("DATABRICKS_AAD_TOKEN") 63 | DATABRICKS_MANAGEMENT_TOKEN = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 64 | DATABRICKS_MANAGEMENT_TOKEN = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 65 | 66 | DBRKS_REQ_HEADERS = { 67 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 68 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 69 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 70 | 'Content-Type': 'application/json' 71 | } 72 | 73 | postjson = { 74 | "branch": str(update_branch) 75 | } 76 | 77 | print("Updated Repo Json String") 78 | print(postjson) 79 | 80 | response = requests.patch( 81 | 'https://' + DATABRICKS_INSTANCE + '/api/2.0/repos/'+ repo_id, headers=DBRKS_REQ_HEADERS, json=postjson 82 | ) 83 | 84 | if response.status_code != 200: 85 | raise Exception(response.content) 86 | else: 87 | #print(f"Status Code: {response.status_code}") 88 | #print(response.json()) 89 | return response.status_code 90 | 91 | 92 | def main(): 93 | 94 | ENVIRONMENT = os.environ.get("ENVIRONMENT") 95 | 96 | file_name = 'infrastructure/databricks/databricks_configs/' + ENVIRONMENT + '/repos.json' 97 | repo_param_file = _ingest_repo_param_file(file_name) 98 | 99 | print(f"Repos To Connect {repo_param_file}") 100 | 101 | repos_with_management_permissions, status_code = get_repos_with_management_permissions() 102 | 103 | for repo in repo_param_file: 104 | 105 | 106 | update_folder = repo['path'] 107 | update_branch = repo['branch'] 108 | 109 | for item in repos_with_management_permissions: 110 | print(f" The Update Folder is {update_folder} and path is {item['path']}") 111 | 112 | if update_folder in item['path']: 113 | print(f" The Update Folder {update_folder} is Contained within the Path {item['path']}") 114 | print("Retrieve the Repo ID") 115 | 116 | repo_id = str(item['id']) 117 | 118 | #Update repo 119 | #import pdb; pdb.set_trace() 120 | status_code = update_repo(repo_id, update_branch) 121 | 122 | return status_code 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_repo_pull.sh: -------------------------------------------------------------------------------- 1 | REPOS_WITH_MANAGEMENT_PERMISSIONS=$(curl -X GET \ 2 | -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 3 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 4 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 5 | -H 'Content-Type: application/json' \ 6 | https://$DATABRICKS_INSTANCE/api/2.0/repos ) 7 | 8 | 9 | echo "Ingest JSON File" 10 | JSON=$( jq '.' 
infrastructure/databricks/databricks_configs/$ENVIRONMENT/repos.json) 11 | for row in $(echo "${JSON}" | jq -r '.Repo_Configuration[] | @base64'); do 12 |     _jq() { 13 |         echo ${row} | base64 --decode | jq -r ${1} 14 |     } 15 | 16 |     echo "PULL_BRANCH: $PULL_BRANCH" 17 |     UPDATE_FOLDER=$(_jq '.path') 18 |     echo "UPDATE FOLDER: $UPDATE_FOLDER" 19 | 20 |     if [ -z "$PULL_BRANCH" ]; 21 |     then 22 |         PULL_BRANCH=$DBX_REPO_BRANCH 23 |         echo "Use Release Branch: $PULL_BRANCH" 24 |     fi 25 | 26 |     echo "Display Repos In DBX With Manage Permissions...." 27 |     echo $REPOS_WITH_MANAGEMENT_PERMISSIONS 28 | 29 |     echo "Retrieve Repo ID For ..." 30 |     REPO_ID=$( jq -r --arg UPDATE_FOLDER "$UPDATE_FOLDER" ' .repos[] | select( .path | contains($UPDATE_FOLDER)) | .id ' <<< "$REPOS_WITH_MANAGEMENT_PERMISSIONS") 31 | 32 |     echo "Repo ID: $REPO_ID" 33 | 34 |     echo "Git Pull on DBX Repo $UPDATE_FOLDER With $PULL_BRANCH Branch " 35 | 36 |     JSON_STRING=$( jq -n -c --arg tb "$PULL_BRANCH" \ 37 |         '{branch: $tb}' ) 38 | 39 | 40 |     GIT_PULL_RESPONSE=$(curl -X PATCH \ 41 |         -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 42 |         -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 43 |         -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 44 |         -H 'Content-Type: application/json' \ 45 |         -d $JSON_STRING \ 46 |         https://$DATABRICKS_INSTANCE/api/2.0/repos/$REPO_ID ) 47 | 48 |     echo "Git Pull Response..." 49 |     echo $GIT_PULL_RESPONSE 50 | done 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_set_env_vars.sh: -------------------------------------------------------------------------------- 1 | 2 | ### Let's Retrieve Important Variables For Later Steps 3 | 4 | echo $ENVIRONMENT 5 | 6 | echo "Ingest JSON File" 7 | JSON=$( jq '.' infrastructure/bicep/params/$ENVIRONMENT/bicep.parameters.json) 8 | 9 | 10 | RESOURCE_GROUP_NAME=$( jq -r '.parameters.resourceGroupName.value' <<< "$JSON") 11 | echo "Resource Group Name: $RESOURCE_GROUP_NAME" 12 | 13 | DATABRICKS_WS_NAME=$( az databricks workspace list -g $RESOURCE_GROUP_NAME --query [].name -o tsv ) 14 | AML_WS_NAME=$(az ml workspace list -g $RESOURCE_GROUP_NAME --query [].workspaceName -o tsv) 15 | DATABRICKS_ORDGID=$(az databricks workspace list -g $RESOURCE_GROUP_NAME --query "[].workspaceId" -o tsv) 16 | DATABRICKS_INSTANCE="$(az databricks workspace list -g $RESOURCE_GROUP_NAME --query "[].workspaceUrl" -o tsv)" 17 | WORKSPACE_ID=$(az databricks workspace list -g $RESOURCE_GROUP_NAME --query "[].id" -o tsv) 18 | AZ_KEYVAULT_NAME=$(az keyvault list -g $RESOURCE_GROUP_NAME --query "[].name" -o tsv) 19 | SUBSCRIPTION_ID=$( az account show --query id -o tsv ) 20 | 21 | echo $SUBSCRIPTION_ID 22 | echo $DATABRICKS_ORDGID 23 | echo $WORKSPACE_ID 24 | echo $AZ_KEYVAULT_NAME 25 | echo $SUBSCRIPTION_ID 26 | echo $AML_WS_NAME 27 | echo $DATABRICKS_WS_NAME 28 | #DATABRICKS_TOKEN=$(az keyvault secret show --name "dbkstoken" --vault-name $AZ_KEYVAULT_NAME --query "value" -o tsv) 29 | 30 | 31 | if [[ $DevOps_Agent == "GitHub" ]]; then 32 |     # Creation Of Important Environment Variables For Later Steps. 33 |     echo "Set Environment Variables For Later Stages..." 34 | 35 |     echo "Set Environment Name As Environment Variable..." 36 |     echo "ENVIRONMENT=$ENVIRONMENT" >> $GITHUB_ENV 37 | 38 |     echo "Set Resource Group Name As Environment Variable..."
39 | echo "RESOURCE_GROUP_NAME=$RESOURCE_GROUP_NAME" >> $GITHUB_ENV 40 | 41 | echo "Set Key Vault Name As Environment Variable..." 42 | echo "AZ_KEYVAULT_NAME=$AZ_KEYVAULT_NAME" >> $GITHUB_ENV 43 | 44 | echo "Set Databricks OrgID As Environment Variable..." 45 | echo "DATABRICKS_ORDGID=$DATABRICKS_ORDGID" >> $GITHUB_ENV 46 | 47 | echo "Set Workspace ID As Environment Variable..." 48 | echo "WORKSPACE_ID=$WORKSPACE_ID" >> $GITHUB_ENV 49 | 50 | echo "Set Datbricks Instance As Environment Variable..." 51 | echo "DATABRICKS_INSTANCE=$DATABRICKS_INSTANCE" >> $GITHUB_ENV 52 | 53 | echo "Set Databricks Host As Environment Variable..." 54 | echo "DATABRICKS_HOST=https://$DATABRICKS_INSTANCE" >> $GITHUB_ENV 55 | 56 | #echo "Set Databricks Token ID As Environment Variable..." 57 | #echo "DATABRICKS_TOKEN=$DATABRICKS_TOKEN" >> $GITHUB_ENV 58 | 59 | echo "Set SUBSCRIPTION_ID As Environment Variable..." 60 | echo "SUBSCRIPTION_ID=$SUBSCRIPTION_ID" >> $GITHUB_ENV 61 | 62 | echo "Set AML_WS_NAME As Environment Variable..." 63 | echo "AML_WS_NAME=$AML_WS_NAME" >> $GITHUB_ENV 64 | 65 | echo "Set DATABRICKS_WS_NAME As Environment Variable..." 66 | echo "DATABRICKS_WS_NAME=$DATABRICKS_WS_NAME" >> $GITHUB_ENV 67 | 68 | else 69 | 70 | # Creation Of Important Environment Variables For Later Steps. 71 | echo "Set Environment Variables For Later Stages..." 72 | 73 | 74 | echo "ENVIRONMENT Name As Environment Variable..." 75 | echo "##vso[task.setvariable variable="ENVIRONMENT";isOutput=true;]$ENVIRONMENT" 76 | 77 | 78 | echo "Resource Group Name As Environment Variable..." 79 | echo "##vso[task.setvariable variable="RESOURCE_GROUP_NAME";isOutput=true;]$RESOURCE_GROUP_NAME" 80 | 81 | echo "Set Key Vault Name As Environment Variable..." 82 | echo "##vso[task.setvariable variable="AZ_KEYVAULT_NAME";isOutput=true;]$AZ_KEYVAULT_NAME" 83 | 84 | echo "Set Databricks OrgID As Environment Variable..." 85 | echo "##vso[task.setvariable variable="DATABRICKS_ORDGID";isOutput=true;]$DATABRICKS_ORDGID" 86 | 87 | echo "Set Workspace ID As Environment Variable..." 88 | echo "##vso[task.setvariable variable="WORKSPACE_ID";isOutput=true;]$WORKSPACE_ID" 89 | 90 | 91 | echo "Set Datbricks Instance As Environment Variable..." 92 | echo "##vso[task.setvariable variable="DATABRICKS_INSTANCE";isOutput=true;]$DATABRICKS_INSTANCE" 93 | 94 | echo "Set Databricks Host As Environment Variable..." 95 | echo "##vso[task.setvariable variable="DATABRICKS_HOST";isOutput=true;]https://$DATABRICKS_INSTANCE" 96 | 97 | echo "Set Databricks Host As Environment Variable..." 98 | echo "##vso[task.setvariable variable="SUBSCRIPTION_ID";isOutput=true;]$SUBSCRIPTION_ID" 99 | 100 | echo "Set AML_WS_NAME As Environment Variable..." 101 | echo "##vso[task.setvariable variable="AML_WS_NAME";isOutput=true;]$AML_WS_NAME" 102 | 103 | echo "Set DATABRICKS_WS_NAME As Environment Variable..." 
104 | echo "##vso[task.setvariable variable="DATABRICKS_WS_NAME";isOutput=true;]$DATABRICKS_WS_NAME" 105 | fi 106 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/build/lib/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from mlflow.tracking import MlflowClient 4 | import math 5 | from datetime import timedelta 6 | from pytz import timezone 7 | from pyspark.sql.types import FloatType, IntegerType, StringType 8 | import mlflow 9 | #from databricks import feature_store 10 | from pyspark.sql.functions import * 11 | from pyspark.sql.types import FloatType, IntegerType, StringType 12 | from pytz import timezone 13 | import mlflow 14 | 15 | 16 | 17 | 18 | 19 | def utils_test_function(): 20 | a = 8 21 | b = 10 22 | 23 | c = a + b 24 | return c 25 | 26 | @udf(returnType=IntegerType()) 27 | def is_weekend(dt): 28 | tz = "America/New_York" 29 | return int(dt.astimezone(timezone(tz)).weekday() >= 5) # 5 = Saturday, 6 = Sunday 30 | 31 | @udf(returnType=StringType()) 32 | def partition_id(dt): 33 | # datetime -> "YYYY-MM" 34 | return f"{dt.year:04d}-{dt.month:02d}" 35 | 36 | 37 | def filter_df_by_ts(df, ts_column, start_date, end_date): 38 | if ts_column and start_date: 39 | df = df.filter(col(ts_column) >= start_date) 40 | if ts_column and end_date: 41 | df = df.filter(col(ts_column) < end_date) 42 | return df 43 | 44 | 45 | def rounded_unix_timestamp(dt, num_minutes=15): 46 | """ 47 | Ceilings datetime dt to interval num_minutes, then returns the unix timestamp. 48 | """ 49 | nsecs = dt.minute * 60 + dt.second + dt.microsecond * 1e-6 50 | delta = math.ceil(nsecs / (60 * num_minutes)) * (60 * num_minutes) - nsecs 51 | return int((dt + timedelta(seconds=delta)).timestamp()) 52 | 53 | 54 | rounded_unix_timestamp_udf = udf(rounded_unix_timestamp, IntegerType()) 55 | 56 | 57 | def rounded_taxi_data( 58 | spark, 59 | taxi_data_df 60 | ): 61 | # Round the taxi data timestamp to 15 and 30 minute intervals so we can join with the pickup and dropoff features 62 | # respectively 63 | taxi_data_df = ( 64 | taxi_data_df.withColumn( 65 | "rounded_pickup_datetime", 66 | rounded_unix_timestamp_udf(taxi_data_df["tpep_pickup_datetime"], lit(15)), 67 | ) 68 | .withColumn( 69 | "rounded_dropoff_datetime", 70 | rounded_unix_timestamp_udf(taxi_data_df["tpep_dropoff_datetime"], lit(30)), 71 | ) 72 | .drop("tpep_pickup_datetime") 73 | .drop("tpep_dropoff_datetime") 74 | ) 75 | taxi_data_df.createOrReplaceTempView("taxi_data") 76 | return taxi_data_df 77 | 78 | def get_latest_model_version(model_name): 79 | latest_version = 1 80 | 81 | mlflow_client = MlflowClient() 82 | #mlflow.set_experiment() 83 | for mv in mlflow_client.search_model_versions(f"name='{model_name}'"): 84 | version_int = int(mv.version) 85 | if version_int > latest_version: 86 | latest_version = version_int 87 | return latest_version 88 | 89 | 90 | class fareClassifier(mlflow.pyfunc.PythonModel): 91 | def __init__(self, trained_model): 92 | self.model = trained_model 93 | 94 | def preprocess_result(self, model_input): 95 | return model_input 96 | 97 | def postprocess_result(self, results): 98 | '''Return post-processed results. 
99 | Creates a set of fare ranges 100 | and returns the predicted range.''' 101 | 102 | return ["$0 - $9.99" if result < 10 else "$10 - $19.99" if result < 20 else " > $20" for result in results] 103 | 104 | def predict(self, context, model_input): 105 | processed_df = self.preprocess_result(model_input.copy()) 106 | results = self.model.predict(processed_df) 107 | return self.postprocess_result(results) -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/build/lib/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # TO DO -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/build/lib/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | # TO DO -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/build/lib/registration/__init__.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from databricks.sdk.runtime import * 4 | from databricks import feature_store 5 | from pyspark.sql.types import * 6 | from pyspark.sql.functions import * 7 | from pyspark.sql.types import FloatType, IntegerType, StringType 8 | import mlflow 9 | from mlflow.tracking import MlflowClient 10 | from databricks import feature_store 11 | from common import * 12 | 13 | # COMMAND ---------- 14 | 15 | def wait_until_ready(model_name, model_version, client): 16 | for _ in range(10): 17 | model_version_details = client.get_model_version( 18 | name=model_name, 19 | version=model_version, 20 | ) 21 | status = ModelVersionStatus.from_string(model_version_details.status) 22 | print("Model status: %s" % ModelVersionStatus.to_string(status)) 23 | if status == ModelVersionStatus.READY: 24 | break 25 | time.sleep(1) 26 | 27 | 28 | def get_model_uri( 29 | fs, 30 | model_name, 31 | model_stage 32 | ): 33 | 34 | fs = feature_store.FeatureStoreClient() 35 | 36 | model_uri_production = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage) 37 | 38 | return model_uri_production 39 | 40 | def get_data(feature_table_name): 41 | 42 | 43 | taxi_data = spark.read.table("feature_store_taxi_example.nyc_yellow_taxi_with_zips") 44 | taxi_data = rounded_taxi_data(spark, taxi_data_df = taxi_data) 45 | 46 | return taxi_data 47 | 48 | def predict( 49 | fs, 50 | model_uri, 51 | taxi_data 52 | ): 53 | 54 | with_predictions = fs.score_batch( 55 | model_uri, 56 | taxi_data 57 | ) 58 | 59 | expected_y = with_predictions.select('fare_amount').toPandas() 60 | predicted_y = with_predictions.select('prediction').toPandas() 61 | 62 | from sklearn import metrics 63 | r2 = metrics.r2_score( 64 | expected_y, 65 | predicted_y 66 | ) 67 | 68 | display(expected_y) 69 | display(with_predictions) 70 | 71 | print(f"R2: {r2}") 72 | 73 | # Display Data For Demo Purposes 74 | 75 | import pyspark.sql.functions as func 76 | cols = ['prediction', 'fare_amount', 'trip_distance', 'pickup_zip', 'dropoff_zip', 77 | 'rounded_pickup_datetime', 'rounded_dropoff_datetime', 'mean_fare_window_1h_pickup_zip', 78 | 'count_trips_window_1h_pickup_zip', 'count_trips_window_30m_dropoff_zip', 'dropoff_is_weekend'] 79 | 80 | with_predictions_reordered = ( 81 | with_predictions.select( 82 | cols, 83 | ) 84 | .withColumnRenamed( 85 | "prediction", 86 | "predicted_fare_amount", 87 | ) 88 | .withColumn( 89 | "predicted_fare_amount", 90 | 
func.round("predicted_fare_amount", 2), 91 | ) 92 | ) 93 | display(with_predictions_reordered) 94 | 95 | return r2 96 | 97 | def evaluation( 98 | score_latest_model, 99 | score_production_model 100 | ): 101 | model_name = "taxi_example_fare_packaged" 102 | 103 | if score_latest_model > score_production_model: 104 | print("Latest Model Is Better Than Production Model") 105 | 106 | # Demote Production 107 | production_stage = 'production' 108 | 109 | # Get the latest model version in the production stage 110 | 111 | mlflow_client = MlflowClient() 112 | 113 | latest_production_version = mlflow_client.get_latest_versions( 114 | name=model_name, 115 | stages=[production_stage])[0].version 116 | 117 | #print(latest_production_version[0].version) 118 | #print(type(latest_production_version)) 119 | 120 | 121 | # Promote Latest Model To Production 122 | latest_model_version = get_latest_model_version(model_name) 123 | mlflow_client.transition_model_version_stage( 124 | name=model_name, 125 | version=latest_model_version, 126 | stage="production", 127 | archive_existing_versions = True 128 | ) 129 | 130 | 131 | def run_registration(model_name): 132 | fs = feature_store.FeatureStoreClient() 133 | 134 | latest_model_version = get_latest_model_version(model_name) 135 | 136 | taxi_data = get_data( 137 | feature_table_name = "feature_store_taxi_example.nyc_yellow_taxi_with_zips" 138 | ) 139 | 140 | model_uri_latest = get_model_uri( 141 | fs=fs, 142 | model_name = model_name, 143 | model_stage = "latest" 144 | 145 | ) 146 | 147 | print(model_uri_latest) 148 | 149 | model_uri_production = get_model_uri( 150 | fs=fs, 151 | model_name = model_name, 152 | model_stage = "production" 153 | ) 154 | 155 | print(model_uri_production) 156 | 157 | new = production_model_exists( 158 | model_name = model_name, 159 | model_stage = "production" 160 | ) 161 | 162 | print(new) 163 | 164 | if production_model_exists( 165 | model_name = model_name, 166 | model_stage = "Production" 167 | ): 168 | 169 | score_latest_model = predict( 170 | fs = fs, 171 | model_uri = model_uri_latest, 172 | taxi_data = taxi_data 173 | ) 174 | 175 | score_production_model = predict( 176 | fs = fs, 177 | model_uri = model_uri_production, 178 | taxi_data = taxi_data 179 | ) 180 | 181 | evaluation( 182 | score_latest_model = score_latest_model, 183 | score_production_model = score_production_model 184 | ) 185 | else: 186 | print("No production model found. 
Promoting latest model to production") 187 | mlflow_client = MlflowClient() 188 | mlflow_client.transition_model_version_stage( 189 | name="taxi_example_fare_packaged", 190 | version=latest_model_version, 191 | stage="production", 192 | archive_existing_versions = True 193 | ) 194 | 195 | 196 | def production_model_exists( 197 | model_name, 198 | model_stage 199 | ): 200 | 201 | 202 | mlflow_client = MlflowClient() 203 | for mv in mlflow_client.search_model_versions("name = '%s'" % model_name): 204 | if mv.current_stage == model_stage: 205 | return True 206 | 207 | # COMMAND ---------- 208 | 209 | if __name__ == "__main__": 210 | run_registration( 211 | model_name = "taxi_example_fare_packaged" 212 | ) 213 | 214 | #latest_model_version = get_latest_model_version(model_name) 215 | #mlflow_client = MlflowClient() 216 | #mlflow_client.get_latest_versions(name=model_name, stages=[model_stage], order_by=['creation_time desc'], max_results=1) 217 | #experiment = mlflow_client.get_experiment_by_name("/Shared/ciaran_experiment_nyc_taxi") 218 | #experiment_id = experiment.experiment_id 219 | 220 | # INCREDIBLY IMPORTANT - "runs" IS GIVING US EVERYTHING, INCLUDING R2 AND PARAMETERS - USE THIS FOR POWERBI 221 | #runs = mlflow.search_runs( 222 | # experiment_ids=experiment_id 223 | #) 224 | #display(runs) 225 | #runs_2 = mlflow.search_runs( 226 | # experiment_ids=experiment_id, 227 | # filter_string=f"tags.model_version = '{latest_model_version}'") 228 | #display(runs_2) 229 | #model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage) 230 | #model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage="latest") -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from mlflow.tracking import MlflowClient 4 | import math 5 | from datetime import timedelta 6 | from pytz import timezone 7 | from pyspark.sql.types import FloatType, IntegerType, StringType 8 | import mlflow 9 | #from databricks import feature_store 10 | from pyspark.sql.functions import * 11 | from pyspark.sql.types import FloatType, IntegerType, StringType 12 | from pytz import timezone 13 | import mlflow 14 | 15 | 16 | 17 | 18 | 19 | def utils_test_function(): 20 | a = 8 21 | b = 10 22 | 23 | c = a + b 24 | return c 25 | 26 | @udf(returnType=IntegerType()) 27 | def is_weekend(dt): 28 | tz = "America/New_York" 29 | return int(dt.astimezone(timezone(tz)).weekday() >= 5) # 5 = Saturday, 6 = Sunday 30 | 31 | @udf(returnType=StringType()) 32 | def partition_id(dt): 33 | # datetime -> "YYYY-MM" 34 | return f"{dt.year:04d}-{dt.month:02d}" 35 | 36 | 37 | def filter_df_by_ts(df, ts_column, start_date, end_date): 38 | if ts_column and start_date: 39 | df = df.filter(col(ts_column) >= start_date) 40 | if ts_column and end_date: 41 | df = df.filter(col(ts_column) < end_date) 42 | return df 43 | 44 | 45 | def rounded_unix_timestamp(dt, num_minutes=15): 46 | """ 47 | Ceilings datetime dt to interval num_minutes, then returns the unix timestamp. 
48 | """ 49 | nsecs = dt.minute * 60 + dt.second + dt.microsecond * 1e-6 50 | delta = math.ceil(nsecs / (60 * num_minutes)) * (60 * num_minutes) - nsecs 51 | return int((dt + timedelta(seconds=delta)).timestamp()) 52 | 53 | 54 | rounded_unix_timestamp_udf = udf(rounded_unix_timestamp, IntegerType()) 55 | 56 | 57 | def rounded_taxi_data( 58 | spark, 59 | taxi_data_df 60 | ): 61 | # Round the taxi data timestamp to 15 and 30 minute intervals so we can join with the pickup and dropoff features 62 | # respectively 63 | taxi_data_df = ( 64 | taxi_data_df.withColumn( 65 | "rounded_pickup_datetime", 66 | rounded_unix_timestamp_udf(taxi_data_df["tpep_pickup_datetime"], lit(15)), 67 | ) 68 | .withColumn( 69 | "rounded_dropoff_datetime", 70 | rounded_unix_timestamp_udf(taxi_data_df["tpep_dropoff_datetime"], lit(30)), 71 | ) 72 | .drop("tpep_pickup_datetime") 73 | .drop("tpep_dropoff_datetime") 74 | ) 75 | taxi_data_df.createOrReplaceTempView("taxi_data") 76 | return taxi_data_df 77 | 78 | def get_latest_model_version(model_name): 79 | latest_version = 1 80 | 81 | mlflow_client = MlflowClient() 82 | #mlflow.set_experiment() 83 | for mv in mlflow_client.search_model_versions(f"name='{model_name}'"): 84 | version_int = int(mv.version) 85 | if version_int > latest_version: 86 | latest_version = version_int 87 | return latest_version 88 | 89 | 90 | class fareClassifier(mlflow.pyfunc.PythonModel): 91 | def __init__(self, trained_model): 92 | self.model = trained_model 93 | 94 | def preprocess_result(self, model_input): 95 | return model_input 96 | 97 | def postprocess_result(self, results): 98 | '''Return post-processed results. 99 | Creates a set of fare ranges 100 | and returns the predicted range.''' 101 | 102 | return ["$0 - $9.99" if result < 10 else "$10 - $19.99" if result < 20 else " > $20" for result in results] 103 | 104 | def predict(self, context, model_input): 105 | processed_df = self.preprocess_result(model_input.copy()) 106 | results = self.model.predict(processed_df) 107 | return self.postprocess_result(results) -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1.tar.gz -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/entrypoint.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from featurization import run_feature_store_refresh 4 | run_feature_store_refresh() 5 | 6 | # COMMAND ---------- 7 | from training import run_training 8 | 9 | run_training( 10 | experiment_name = "nyc_e2e_mlops", 11 | model_name = "taxi_example_fare_packaged", 12 | model_params = { 13 | "objective": "regression", 14 | "metric": "rmse", 15 | "num_leaves": 25, 16 | "learning_rate": 0.2, 17 | "bagging_fraction": 0.9, 18 | "feature_fraction": 0.9, 19 | "bagging_seed": 42, 20 | "verbosity": -1, 21 | 
"seed": 42, 22 | "num_rounds": 100 23 | } 24 | ) 25 | from registration import run_registration 26 | run_registration( 27 | model_name = "taxi_example_fare_packaged" 28 | ) -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # TO DO -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | # TO DO -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools" 4 | ] 5 | build-backend = "setuptools.build_meta" 6 | 7 | [tool.distutils.bdist_wheel] 8 | universal = true 9 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/registration/__init__.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from databricks.sdk.runtime import * 4 | from databricks import feature_store 5 | from pyspark.sql.types import * 6 | from pyspark.sql.functions import * 7 | from pyspark.sql.types import FloatType, IntegerType, StringType 8 | import mlflow 9 | from mlflow.tracking import MlflowClient 10 | from databricks import feature_store 11 | from common import * 12 | 13 | # COMMAND ---------- 14 | 15 | def wait_until_ready(model_name, model_version, client): 16 | for _ in range(10): 17 | model_version_details = client.get_model_version( 18 | name=model_name, 19 | version=model_version, 20 | ) 21 | status = ModelVersionStatus.from_string(model_version_details.status) 22 | print("Model status: %s" % ModelVersionStatus.to_string(status)) 23 | if status == ModelVersionStatus.READY: 24 | break 25 | time.sleep(1) 26 | 27 | 28 | def get_model_uri( 29 | fs, 30 | model_name, 31 | model_stage 32 | ): 33 | 34 | fs = feature_store.FeatureStoreClient() 35 | 36 | model_uri_production = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage) 37 | 38 | return model_uri_production 39 | 40 | def get_data(feature_table_name): 41 | 42 | 43 | taxi_data = spark.read.table("feature_store_taxi_example.nyc_yellow_taxi_with_zips") 44 | taxi_data = rounded_taxi_data(spark, taxi_data_df = taxi_data) 45 | 46 | return taxi_data 47 | 48 | def predict( 49 | fs, 50 | model_uri, 51 | taxi_data 52 | ): 53 | 54 | with_predictions = fs.score_batch( 55 | model_uri, 56 | taxi_data 57 | ) 58 | 59 | expected_y = with_predictions.select('fare_amount').toPandas() 60 | predicted_y = with_predictions.select('prediction').toPandas() 61 | 62 | from sklearn import metrics 63 | r2 = metrics.r2_score( 64 | expected_y, 65 | predicted_y 66 | ) 67 | 68 | display(expected_y) 69 | display(with_predictions) 70 | 71 | print(f"R2: {r2}") 72 | 73 | # Display Data For Demo Purposes 74 | 75 | import pyspark.sql.functions as func 76 | cols = ['prediction', 'fare_amount', 'trip_distance', 'pickup_zip', 'dropoff_zip', 77 | 'rounded_pickup_datetime', 'rounded_dropoff_datetime', 'mean_fare_window_1h_pickup_zip', 78 | 'count_trips_window_1h_pickup_zip', 'count_trips_window_30m_dropoff_zip', 'dropoff_is_weekend'] 79 | 80 | with_predictions_reordered = ( 81 | with_predictions.select( 82 | cols, 83 | ) 84 | .withColumnRenamed( 85 | 
"prediction", 86 | "predicted_fare_amount", 87 | ) 88 | .withColumn( 89 | "predicted_fare_amount", 90 | func.round("predicted_fare_amount", 2), 91 | ) 92 | ) 93 | display(with_predictions_reordered) 94 | 95 | return r2 96 | 97 | def evaluation( 98 | score_latest_model, 99 | score_production_model 100 | ): 101 | model_name = "taxi_example_fare_packaged" 102 | 103 | if score_latest_model > score_production_model: 104 | print("Latest Model Is Better Than Production Model") 105 | 106 | # Demote Production 107 | production_stage = 'production' 108 | 109 | # Get the latest model version in the production stage 110 | 111 | mlflow_client = MlflowClient() 112 | 113 | latest_production_version = mlflow_client.get_latest_versions( 114 | name=model_name, 115 | stages=[production_stage])[0].version 116 | 117 | #print(latest_production_version[0].version) 118 | #print(type(latest_production_version)) 119 | 120 | 121 | # Promote Latest Model To Production 122 | latest_model_version = get_latest_model_version(model_name) 123 | mlflow_client.transition_model_version_stage( 124 | name=model_name, 125 | version=latest_model_version, 126 | stage="production", 127 | archive_existing_versions = True 128 | ) 129 | 130 | 131 | def run_registration(model_name): 132 | fs = feature_store.FeatureStoreClient() 133 | 134 | latest_model_version = get_latest_model_version(model_name) 135 | 136 | taxi_data = get_data( 137 | feature_table_name = "feature_store_taxi_example.nyc_yellow_taxi_with_zips" 138 | ) 139 | 140 | model_uri_latest = get_model_uri( 141 | fs=fs, 142 | model_name = model_name, 143 | model_stage = "latest" 144 | 145 | ) 146 | 147 | print(model_uri_latest) 148 | 149 | model_uri_production = get_model_uri( 150 | fs=fs, 151 | model_name = model_name, 152 | model_stage = "production" 153 | ) 154 | 155 | print(model_uri_production) 156 | 157 | new = production_model_exists( 158 | model_name = model_name, 159 | model_stage = "production" 160 | ) 161 | 162 | print(new) 163 | 164 | if production_model_exists( 165 | model_name = model_name, 166 | model_stage = "Production" 167 | ): 168 | 169 | score_latest_model = predict( 170 | fs = fs, 171 | model_uri = model_uri_latest, 172 | taxi_data = taxi_data 173 | ) 174 | 175 | score_production_model = predict( 176 | fs = fs, 177 | model_uri = model_uri_production, 178 | taxi_data = taxi_data 179 | ) 180 | 181 | evaluation( 182 | score_latest_model = score_latest_model, 183 | score_production_model = score_production_model 184 | ) 185 | else: 186 | print("No production model found. 
Promoting latest model to production") 187 | mlflow_client = MlflowClient() 188 | mlflow_client.transition_model_version_stage( 189 | name="taxi_example_fare_packaged", 190 | version=latest_model_version, 191 | stage="production", 192 | archive_existing_versions = True 193 | ) 194 | 195 | 196 | def production_model_exists( 197 | model_name, 198 | model_stage 199 | ): 200 | 201 | 202 | mlflow_client = MlflowClient() 203 | for mv in mlflow_client.search_model_versions("name = '%s'" % model_name): 204 | if mv.current_stage == model_stage: 205 | return True 206 | 207 | # COMMAND ---------- 208 | 209 | if __name__ == "__main__": 210 | run_registration( 211 | model_name = "taxi_example_fare_packaged" 212 | ) 213 | 214 | #latest_model_version = get_latest_model_version(model_name) 215 | #mlflow_client = MlflowClient() 216 | #mlflow_client.get_latest_versions(name=model_name, stages=[model_stage], order_by=['creation_time desc'], max_results=1) 217 | #experiment = mlflow_client.get_experiment_by_name("/Shared/ciaran_experiment_nyc_taxi") 218 | #experiment_id = experiment.experiment_id 219 | 220 | # INCREDIBLY IMPORTANT - "runs" IS GIVING US EVERYTHING, INCLUDING R2 AND PARAMETERS - USE THIS FOR POWERBI 221 | #runs = mlflow.search_runs( 222 | # experiment_ids=experiment_id 223 | #) 224 | #display(runs) 225 | #runs_2 = mlflow.search_runs( 226 | # experiment_ids=experiment_id, 227 | # filter_string=f"tags.model_version = '{latest_model_version}'") 228 | #display(runs_2) 229 | #model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage) 230 | #model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage="latest") -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = src_nyc_taxi 3 | version = 0.0.1 4 | description = NYC Taxi Data Source 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown; charset=UTF-8 7 | author = Ciaran Hamill Diamond 8 | 9 | [options] 10 | package_dir = 11 | src_nyc_taxi = src_nyc_taxi 12 | packages = find: 13 | platforms = any 14 | include_package_data = True 15 | python_requires = ==3.10.* 16 | install_requires = 17 | packaging==21.* 18 | azure-identity 19 | azure-keyvault-secrets 20 | azure-keyvault-keys 21 | datetime 22 | argparse 23 | pathlib 24 | argon2-cffi==20.1.0 25 | astor==0.8.1 26 | astunparse==1.6.3 27 | async-generator==1.10 28 | attrs==21.2.0 29 | azure-core 30 | backcall==0.2.0 31 | backports.entry-points-selectable==1.1.1 32 | bcrypt==4.0.0 33 | black==22.3.0 34 | bleach==4.0.0 35 | blis==0.7.8 36 | boto3==1.21.18 37 | botocore==1.24.18 38 | cachetools==5.2.0 39 | certifi==2021.10.8 40 | cffi==1.14.6 41 | chardet==4.0.0 42 | charset-normalizer==2.0.4 43 | click==8.0.3 44 | databricks-automl-runtime==0.2.11 45 | databricks-cli==0.17.3 46 | Flask==1.1.2 47 | importlib-metadata==4.8.1 48 | ipykernel==6.12.1 49 | ipython==7.32.0 50 | ipython-genutils==0.2.0 51 | ipywidgets==7.7.0 52 | Jinja2==2.11.3 53 | jupyter-client==6.1.12 54 | jupyter-core==4.8.1 55 | jupyterlab-pygments==0.1.2 56 | jupyterlab-widgets==1.0.0 57 | mlflow-databricks-artifacts==2.0.0 58 | mlflow-skinny==1.29.0 59 | pip==23.1.2 60 | pydantic==1.9.2 61 | pytz==2021.3 62 | PyYAML==6.0 63 | pyzmq==22.2.1 64 | regex==2021.8.3 65 | requests==2.28.1 66 | requests-oauthlib==1.3.1 67 | requests-unixsocket==0.2.0 68 | urllib3==1.26.7 69 | 
virtualenv==20.8.0 70 | visions==0.7.4 71 | wasabi==0.10.1 72 | wcwidth==0.2.5 73 | webencodings==0.5.1 74 | websocket-client==1.3.1 75 | Werkzeug==2.0.2 76 | wheel==0.37.0 77 | widgetsnbextension==3.6.0 78 | wrapt==1.12.1 79 | xgboost==1.6.2 80 | zipp==3.6.0 81 | azureml-mlflow==1.50.0 82 | azureml-core==1.50.0 83 | azure-ai-ml==1.4.0 84 | sklearn_pandas==2.2.0 85 | azureml-sdk==1.50.0 86 | uszipcode 87 | lightgbm 88 | azureml-sdk[databricks]==1.50.0 89 | python-dotenv 90 | databricks-feature-store==0.10.* 91 | azure-cosmos==4.3.1 92 | 93 | [options.extras_require] 94 | test = 95 | bandit==1.7.4 96 | freezegun==1.2.2 97 | pydocstyle==6.1.1 98 | pylint==2.15.0 99 | pylint_junit==0.3.2 100 | pytest==7.1.2 101 | pytest-cov==3.0.0 102 | 103 | build = 104 | databricks-cli==0.17.0 105 | wheel==0.37.1 106 | 107 | 108 | [pylint] 109 | disable = 110 | missing-class-docstring, 111 | missing-function-docstring, 112 | too-few-public-methods 113 | jobs = 4 114 | output-format = colorized 115 | 116 | # Maximum number of locals for function / method body 117 | max-locals = 20 118 | 119 | # Maximum number of arguments for function / method 120 | max-args = 10 121 | 122 | good-names = df 123 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | if __name__ == '__main__': 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: src-nyc-taxi 3 | Version: 0.0.1 4 | Summary: NYC Taxi Data Source 5 | Author: Ciaran Hamill Diamond 6 | Requires-Python: ==3.10.* 7 | Description-Content-Type: text/markdown; charset=UTF-8 8 | Provides-Extra: test 9 | Provides-Extra: build 10 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | pyproject.toml 2 | setup.cfg 3 | setup.py 4 | common/__init__.py 5 | evaluation/__init__.py 6 | featurization/__init__.py 7 | prediction/__init__.py 8 | registration/__init__.py 9 | src_nyc_taxi.egg-info/PKG-INFO 10 | src_nyc_taxi.egg-info/SOURCES.txt 11 | src_nyc_taxi.egg-info/dependency_links.txt 12 | src_nyc_taxi.egg-info/requires.txt 13 | src_nyc_taxi.egg-info/top_level.txt 14 | training/__init__.py -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | packaging==21.* 2 | azure-identity 3 | azure-keyvault-secrets 4 | azure-keyvault-keys 5 | datetime 6 | argparse 7 | pathlib 8 | argon2-cffi==20.1.0 9 | astor==0.8.1 10 | astunparse==1.6.3 11 | async-generator==1.10 12 | attrs==21.2.0 13 | azure-core 14 | backcall==0.2.0 15 | backports.entry-points-selectable==1.1.1 16 | bcrypt==4.0.0 17 | black==22.3.0 18 | bleach==4.0.0 19 | blis==0.7.8 20 | boto3==1.21.18 21 | botocore==1.24.18 22 | cachetools==5.2.0 23 | certifi==2021.10.8 24 | cffi==1.14.6 25 | 
chardet==4.0.0 26 | charset-normalizer==2.0.4 27 | click==8.0.3 28 | databricks-automl-runtime==0.2.11 29 | databricks-cli==0.17.3 30 | Flask==1.1.2 31 | importlib-metadata==4.8.1 32 | ipykernel==6.12.1 33 | ipython==7.32.0 34 | ipython-genutils==0.2.0 35 | ipywidgets==7.7.0 36 | Jinja2==2.11.3 37 | jupyter-client==6.1.12 38 | jupyter-core==4.8.1 39 | jupyterlab-pygments==0.1.2 40 | jupyterlab-widgets==1.0.0 41 | mlflow-databricks-artifacts==2.0.0 42 | mlflow-skinny==1.29.0 43 | pip==23.1.2 44 | pydantic==1.9.2 45 | pytz==2021.3 46 | PyYAML==6.0 47 | pyzmq==22.2.1 48 | regex==2021.8.3 49 | requests==2.28.1 50 | requests-oauthlib==1.3.1 51 | requests-unixsocket==0.2.0 52 | urllib3==1.26.7 53 | virtualenv==20.8.0 54 | visions==0.7.4 55 | wasabi==0.10.1 56 | wcwidth==0.2.5 57 | webencodings==0.5.1 58 | websocket-client==1.3.1 59 | Werkzeug==2.0.2 60 | wheel==0.37.0 61 | widgetsnbextension==3.6.0 62 | wrapt==1.12.1 63 | xgboost==1.6.2 64 | zipp==3.6.0 65 | azureml-mlflow==1.50.0 66 | azureml-core==1.50.0 67 | azure-ai-ml==1.4.0 68 | sklearn_pandas==2.2.0 69 | azureml-sdk==1.50.0 70 | uszipcode 71 | lightgbm 72 | azureml-sdk[databricks]==1.50.0 73 | python-dotenv 74 | databricks-feature-store==0.10.* 75 | azure-cosmos==4.3.1 76 | 77 | [build] 78 | databricks-cli==0.17.0 79 | wheel==0.37.1 80 | 81 | [test] 82 | bandit==1.7.4 83 | freezegun==1.2.2 84 | pydocstyle==6.1.1 85 | pylint==2.15.0 86 | pylint_junit==0.3.2 87 | pytest==7.1.2 88 | pytest-cov==3.0.0 89 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | common 2 | evaluation 3 | featurization 4 | prediction 5 | registration 6 | training 7 | -------------------------------------------------------------------------------- /test/entrypoint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | 5 | if __name__ == '__main__': 6 | print(sys.argv[1:]) 7 | pytest.main(sys.argv[1:]) 8 | -------------------------------------------------------------------------------- /test/test_dbx_utils_pkg/test_utils_azure_login.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import unittest 4 | from unittest.mock import patch 5 | 6 | from dbx_utils.utils_azure_login import start_azure_login, run_cmd 7 | 8 | 9 | class TestRunCmd(unittest.TestCase): 10 | def test_run_cmd(self): 11 | test_cmd = ['echo', 'hello, world'] 12 | output, return_code = run_cmd(test_cmd) 13 | self.assertEqual(return_code, 0) 14 | self.assertEqual(output, ['hello, world']) 15 | 16 | def test_run_cmd_failure(self): 17 | test_cmd = ['12345'] 18 | with self.assertRaises(subprocess.CalledProcessError): 19 | run_cmd(test_cmd) 20 | 21 | 22 | class TestAzureLogin(unittest.TestCase): 23 | 24 | @patch('python.utils_azure_login.ARM_TENANT_ID', 'test_tenant_id') 25 | @patch('python.utils_azure_login.ARM_CLIENT_SECRET', 'test_client_secret') 26 | @patch('python.utils_azure_login.ARM_CLIENT_ID', 'test_client_id') 27 | @patch('python.utils_azure_login.run_cmd') 28 | def test_start_azure_login(self, mock_run_cmd): 29 | mock_run_cmd.return_value = ('', 0) 30 | return_code = start_azure_login() 31 | self.assertEqual(return_code, 0) # 0 is the expected return code for a successful login 32 | 33 | # This must be assessing the correct parameters are being passed to the run_cmd function 34 | # If 
someone changes the code, then the tests will fail 35 | mock_run_cmd.assert_called_once_with( 36 | [ 37 | 'az', 'login', 38 | '--service-principal', 39 | '-u', 'test_client_id', 40 | '-p', 'test_client_secret', 41 | '--tenant', 'test_tenant_id' 42 | ] 43 | ) 44 | -------------------------------------------------------------------------------- /test/test_dbx_utils_pkg/test_utils_create_azure_resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import subprocess 4 | import unittest 5 | import pytest 6 | from unittest.mock import patch, mock_open 7 | from dbx_utils.utils_create_azure_resources import deploy_azure_resources, run_cmd, LoadJson 8 | 9 | 10 | 11 | test_json = {"parameters": { 12 | "TemplateFilePath": { 13 | "value": "mlOps/devOps/infra/master_templates/main.bicep" 14 | }, 15 | "TemplateParamFilePath": { 16 | "value": "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json" 17 | }, 18 | "location": { 19 | "value": "eastus" 20 | } 21 | } 22 | } 23 | 24 | class TestLoadJson: 25 | @patch("builtins.open", new_callable=mock_open, read_data=test_json) 26 | def test_load_json(self, mock_file): 27 | load_json = LoadJson() 28 | result = load_json.load_json() 29 | expected_result = { 30 | "parameters": { 31 | "TemplateFilePath": { 32 | "value": "mlOps/devOps/infra/master_templates/main.bicep" 33 | }, 34 | "TemplateParamFilePath": { 35 | "value": "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json" 36 | }, 37 | "location": { 38 | "value": "eastus" 39 | } 40 | } 41 | } 42 | assert result == expected_result 43 | 44 | 45 | @patch.object(LoadJson, "load_json", return_value=test_json) 46 | def test_get_param_file_path(self, mock_load_json): 47 | load_json = LoadJson() 48 | result = load_json.get_param_file_path() 49 | expected_result = "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json" 50 | assert result == expected_result 51 | 52 | 53 | @patch.object(LoadJson, "load_json", return_value=test_json) 54 | def test_get_template_file_path(self, mock_load_json): 55 | load_json = LoadJson() 56 | result = load_json.get_template_file_path() 57 | expected_result = "mlOps/devOps/infra/master_templates/main.bicep" 58 | assert result == expected_result 59 | 60 | 61 | @patch.object(LoadJson, "load_json", return_value=test_json) 62 | def test_get_location(self, mock_load_json): 63 | load_json = LoadJson() 64 | result = load_json.get_location() 65 | expected_result = "eastus" 66 | assert result == expected_result 67 | 68 | 69 | class TestRunCmd: 70 | def test_run_cmd_success(self): 71 | cmd = ["echo", "Hello, world!"] 72 | result = run_cmd(cmd) 73 | expected_result = ["Hello, world!"] 74 | assert result == expected_result 75 | 76 | def test_run_cmd_error(self): 77 | cmd = ["nonexistentcommand"] 78 | with pytest.raises(RuntimeError): 79 | run_cmd(cmd) 80 | 81 | 82 | class TestDeployAzureResources: 83 | @patch("python.utils_create_azure_resources.run_cmd") 84 | @patch.object(LoadJson, "get_param_file_path", return_value="mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json") 85 | @patch.object(LoadJson, "get_template_file_path", return_value="mlOps/devOps/infra/master_templates/main.bicep") 86 | @patch.object(LoadJson, "get_location", return_value="eastus") 87 | def test_deploy_azure_resources_success(self, mock_location, mock_template_file_path, mock_param_file_path, mock_run_cmd): 88 | mock_run_cmd.return_value = ('', 0) 89 | 
        deploy_azure_resources()
90 |         mock_run_cmd.assert_called_with(
91 |             [
92 |                 "az", "deployment", "sub", "create",
93 |                 "--location", "eastus",
94 |                 "--template-file", "mlOps/devOps/infra/master_templates/main.bicep",
95 |                 "--parameters", "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json",
96 |                 "--name", "test_environment",
97 |             ]
98 |         )
99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | def test_load_json():
127 |     # Mock the `open` function to return a mocked file object
128 |     mocked_open = mock_open(read_data='''{
129 |         "parameters": {
130 |             "TemplateFilePath": {
131 |                 "value": "mlOps/devOps/infra/master_templates/main.bicep"
132 |             },
133 |             "TemplateParamFilePath": {
134 |                 "value": "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json"
135 |             },
136 |             "location": {
137 |                 "value": "eastus"
138 |             }
139 |         }
140 |     }''')
141 | 
142 |     with patch('builtins.open', mocked_open):
143 |         # Create an instance of the LoadJson class
144 |         load_json_obj = LoadJson()
145 | 
146 |         # Call the load_json method and assert that it returns the expected dictionary
147 |         assert load_json_obj.load_json() == {
148 |             'parameters': {
149 |                 'TemplateFilePath': {
150 |                     'value': 'mlOps/devOps/infra/master_templates/main.bicep'
151 |                 },
152 |                 'TemplateParamFilePath': {
153 |                     'value': 'mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json'
154 |                 },
155 |                 'location': {
156 |                     'value': 'eastus'
157 |                 }
158 |             }
159 |         }
160 | 
161 |         # Call the get_template_file_path method and assert that it returns the expected value
162 |         assert load_json_obj.get_template_file_path() == 'mlOps/devOps/infra/master_templates/main.bicep'
163 | 
164 |         # Call the get_param_file_path method and assert that it returns the expected value
165 |         assert load_json_obj.get_param_file_path() == 'mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json'
166 | 
167 |         # Call the get_location method and assert that it returns the expected value
168 |         assert load_json_obj.get_location() == 'eastus'
169 | 
170 | 
171 | class TestRunCmd(unittest.TestCase):
172 |     def test_run_cmd(self):
173 |         test_cmd = ['echo', 'hello, world']
174 |         output, return_code = run_cmd(test_cmd)
175 |         self.assertEqual(return_code, 0)
176 |         self.assertEqual(output, ['hello, world'])
177 | 
178 |     def test_run_cmd_failure(self):
179 |         test_cmd = ['12345']
180 |         with self.assertRaises(subprocess.CalledProcessError):
181 |             run_cmd(test_cmd)
182 | 
183 | 
184 | class TestCreateAzureResources(unittest.TestCase):
185 | 
186 |     @patch('dbx_utils.utils_azure_login.ARM_TENANT_ID', 'test_tenant_id')
187 |     @patch('dbx_utils.utils_azure_login.ARM_CLIENT_SECRET', 'test_client_secret')
188 |     @patch('dbx_utils.utils_azure_login.ARM_CLIENT_ID', 'test_client_id')
189 |     @patch('dbx_utils.utils_create_azure_resources.run_cmd')
190 |     def test_start_azure_login(self, mock_run_cmd):
191 |         mock_run_cmd.return_value = ('', 0)
192 |         return_code = deploy_azure_resources()
193 |         self.assertEqual(return_code, 0)  # 0 is the expected return code for a successful deployment
194 | 
195 |         # This asserts that the correct parameters are being passed to the run_cmd function
196 |         # If someone changes the code, then the tests will fail.
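        # The expected command below mirrors what deploy_azure_resources() is meant to run:
        # an "az deployment sub create" at subscription scope, pointed at the Bicep template
        # and parameter file resolved through LoadJson and named after the target environment.
        # Note: location, template_file_path, template_param_file_path and ENVIRONMENT are
        # presumably module-level values in dbx_utils.utils_create_azure_resources; they are
        # not defined locally in this test body.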
197 | 198 | mock_run_cmd.assert_called_once_with( 199 | [ 200 | "az", "deployment", "sub", "create", 201 | "--location", location, 202 | "--template-file", template_file_path, 203 | "--parameters", template_param_file_path, 204 | "--name", ENVIRONMENT, 205 | "--only-show-errors" ] 206 | ) 207 | 208 | -------------------------------------------------------------------------------- /test/test_dbx_utils_pkg/test_utils_create_repo_folder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, MagicMock, mock_open 3 | from unittest import mock 4 | import pytest 5 | from _pytest.monkeypatch import MonkeyPatch 6 | import json 7 | import requests 8 | 9 | from dbx_utils.utils_create_repo_folder import _ingest_repo_param_file, create_databricks_repos 10 | 11 | 12 | class TestCreateRepoFolder(unittest.TestCase): 13 | 14 | @patch('requests.post') 15 | def test_create_databricks_repos_success(self, mock_post): 16 | monkeypatch = MonkeyPatch() 17 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 18 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 19 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 20 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 21 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 22 | 23 | 24 | mock_post.return_value.status_code = 200 25 | 26 | mock_repo_json = { 27 | "url": "test_url", 28 | "provider": "test_provider", 29 | "path": "test_folder" 30 | } 31 | 32 | status_code = create_databricks_repos(mock_repo_json) 33 | 34 | assert status_code == 200 35 | expected_dbkrs_req_headers = { 36 | 'Authorization': 'Bearer test_databricks_aad_token', 37 | 'X-Databricks-Azure-SP-Management-Token': 'test_databricks_management_token', 38 | 'X-Databricks-Azure-Workspace-Resource-Id': 'test_workspace_id', 39 | 'Content-Type': 'application/json'} 40 | 41 | mock_post.assert_called_once_with( 42 | 'https://test_databricks_instance/api/2.0/repos', 43 | headers=expected_dbkrs_req_headers, 44 | json=mock_repo_json) 45 | 46 | @patch('requests.post') 47 | def test_create_databricks_repos_failure(self, mock_post): 48 | monkeypatch = MonkeyPatch() 49 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 50 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 51 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 52 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 53 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 54 | 55 | mock_post.return_value.status_code = 400 56 | 57 | mock_repo_json = { 58 | "url": "test_url", 59 | "provider": "test_provider", 60 | "path": "test_folder" 61 | } 62 | 63 | with pytest.raises(Exception) as e: 64 | status_code = create_databricks_repos(mock_repo_json) 65 | assert status_code == 400 66 | 67 | 68 | 69 | class TestIngestRepoParamFile(unittest.TestCase): 70 | 71 | test_repo_json = { 72 | "Git_Configuration": [ 73 | { 74 | "git_username": "test_username", 75 | "git_provider": "test_provider", 76 | } 77 | ], 78 | "Repo_Configuration": [ 79 | { 80 | "url": "test_url", 81 | "provider": "test_provider", 82 | "path": "test_folder" 83 | } 84 | ] 85 | } 86 | 87 | test_repo_json = json.dumps(test_repo_json) 88 | 89 | 90 | @patch("builtins.open", new_callable=mock_open, read_data=test_repo_json) 91 | def test_load_json(self, mock_open): 92 | monkeypatch = MonkeyPatch() 93 | 
monkeypatch.setenv('ENVIRONMENT', 'test_environment') 94 | #cluster = Cluster() 95 | 96 | result = _ingest_repo_param_file( "test_cluster_param_file.json") 97 | 98 | # Expected result is an array and not an object 99 | expected_result = [ 100 | { 101 | "url": "test_url", 102 | "provider": "test_provider", 103 | "path": "test_folder" 104 | } 105 | ] 106 | assert result == expected_result -------------------------------------------------------------------------------- /test/test_dbx_utils_pkg/test_utils_repo_pull.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, MagicMock, mock_open 3 | from unittest import mock 4 | import pytest 5 | from _pytest.monkeypatch import MonkeyPatch 6 | import json 7 | import requests 8 | 9 | from dbx_utils.utils_repo_pull import _ingest_repo_param_file, get_repos_with_management_permissions, update_repo, main 10 | 11 | 12 | class TestIngestRepoParamFile(unittest.TestCase): 13 | 14 | test_repo_json = { 15 | "Git_Configuration": [ 16 | { 17 | "git_username": "test_username", 18 | "git_provider": "test_provider", 19 | } 20 | ], 21 | "Repo_Configuration": [ 22 | { 23 | "url": "test_url", 24 | "provider": "test_provider", 25 | "path": "test_folder" 26 | } 27 | ] 28 | } 29 | 30 | test_repo_json = json.dumps(test_repo_json) 31 | 32 | 33 | @patch("builtins.open", new_callable=mock_open, read_data=test_repo_json) 34 | def test_load_json(self, mock_open): 35 | monkeypatch = MonkeyPatch() 36 | monkeypatch.setenv('ENVIRONMENT', 'test_environment') 37 | #cluster = Cluster() 38 | 39 | result = _ingest_repo_param_file( "test_cluster_param_file.json") 40 | 41 | # Expected result is an array and not an object 42 | expected_result = [ 43 | { 44 | "url": "test_url", 45 | "provider": "test_provider", 46 | "path": "test_folder" 47 | } 48 | ] 49 | assert result == expected_result 50 | 51 | #get_repos_with_management_permissions 52 | class GetReposWithManagementPermissions(unittest.TestCase): 53 | 54 | @patch('requests.get') 55 | def test_get_repos_with_management_permissions_success(self, mock_get): 56 | monkeypatch = MonkeyPatch() 57 | 58 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 59 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 60 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 61 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 62 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 63 | 64 | mock_get.return_value.status_code = 200 65 | 66 | mock_return = { 67 | "repos":[ 68 | { 69 | "id":61449681029719, 70 | "path":"/Repos/***/test_dbx_repo_folder_one", 71 | "url":"https://github.com/test_repo_profile/test_repo_one", 72 | "provider":"gitHub", 73 | "branch":"main", 74 | "head_commit_id":"test_commit_id" 75 | } 76 | ] 77 | } 78 | 79 | mock_get.return_value.json.return_value = mock_return 80 | 81 | repos_with_management_permissions, status_code = get_repos_with_management_permissions() 82 | 83 | assert repos_with_management_permissions == mock_return["repos"] 84 | assert status_code == 200 85 | 86 | 87 | expected_dbkrs_req_headers = { 88 | 'Authorization': 'Bearer test_databricks_aad_token', 89 | 'X-Databricks-Azure-SP-Management-Token': 'test_databricks_management_token', 90 | 'X-Databricks-Azure-Workspace-Resource-Id': 'test_workspace_id', 91 | 'Content-Type': 'application/json'} 92 | 93 | 94 | mock_get.assert_called_once_with( 95 | 
'https://test_databricks_instance/api/2.0/repos', 96 | headers=expected_dbkrs_req_headers 97 | ) 98 | 99 | 100 | @patch('requests.get') 101 | def test_get_repos_with_management_permissions_failure(mock_get): 102 | monkeypatch = MonkeyPatch() 103 | 104 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 105 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 106 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 107 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 108 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 109 | 110 | mock_get.return_value.status_code = 500 111 | 112 | with pytest.raises(Exception) as e: 113 | status_code = get_repos_with_management_permissions() 114 | assert status_code == 500 115 | 116 | 117 | # update_repo 118 | 119 | class UpdateRepo(unittest.TestCase): 120 | 121 | @patch('requests.patch') 122 | def test_update_repo_success(self, mock_patch): 123 | monkeypatch = MonkeyPatch() 124 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 125 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 126 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 127 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 128 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 129 | 130 | mock_repo_id = 123456789 131 | mock_update_branch = "test_main_branch" 132 | 133 | mock_patch.return_value.status_code = 200 134 | 135 | status_code = update_repo(mock_repo_id, mock_update_branch ) 136 | 137 | assert status_code == 200 138 | 139 | 140 | expected_dbkrs_req_headers = { 141 | 'Authorization': 'Bearer test_databricks_aad_token', 142 | 'X-Databricks-Azure-SP-Management-Token': 'test_databricks_management_token', 143 | 'X-Databricks-Azure-Workspace-Resource-Id': 'test_workspace_id', 144 | 'Content-Type': 'application/json'} 145 | 146 | mock_patch.assert_called_once_with( 147 | "https://test_databricks_instance/api/2.0/repos/" + str(mock_repo_id), 148 | headers=expected_dbkrs_req_headers, 149 | json={ 150 | "branch": mock_update_branch 151 | } 152 | ) 153 | 154 | @patch('requests.post') 155 | def test_update_repo_failure(self, mock_post): 156 | 157 | mock_repo_id = 123456789 158 | mock_update_branch = "test_main_branch" 159 | 160 | mock_post.return_value.status_code = 500 161 | 162 | with pytest.raises(Exception) as e: 163 | status_code = update_repo(mock_repo_id, mock_update_branch ) 164 | assert status_code == 500 165 | 166 | 167 | class Main(unittest.TestCase): 168 | 169 | test_repo_json = { 170 | "Git_Configuration": [ 171 | { 172 | "git_username": "test_username", 173 | "git_provider": "test_provider", 174 | } 175 | ], 176 | "Repo_Configuration": [ 177 | { 178 | "url": "test_url", 179 | "provider": "test_provider", 180 | "path": "test_folder" 181 | } 182 | ] 183 | } 184 | test_repo_json = json.dumps(test_repo_json) 185 | 186 | 187 | @patch('python.utils_repo_pull.update_repo') 188 | @patch('python.utils_repo_pull.get_repos_with_management_permissions') 189 | @patch('python.utils_repo_pull._ingest_repo_param_file') 190 | def test_main_success(self, mock_ingest_repo_param_file, mock_get_repos_with_management_permissions, mock_update_repo): 191 | 192 | # monkey patch environment variables 193 | monkeypatch = MonkeyPatch() 194 | monkeypatch.setenv('ENVIRONMENT', 'test_environment') 195 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 196 | monkeypatch.setenv('WORKSPACE_ID', 
'test_workspace_id') 197 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 198 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 199 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 200 | 201 | 202 | mock_ingest_repo_param_file_json_return = [{ 203 | "url": "test_url", 204 | "provider": "test_provider", 205 | "path": "test_folder", 206 | "branch": "test_branch" 207 | }] 208 | 209 | mock_ingest_repo_param_file.return_value = mock_ingest_repo_param_file_json_return 210 | 211 | # Should be doing a mock open instead !!! 212 | #mock_ingest_repo_param_file.return_value = mock_ingest_repo_param_file_json_return 213 | 214 | # mock return value from get repos with management permissions 215 | mock_get_repos_with_management_permissions_json_return = [ 216 | { 217 | "id":61449681029719, 218 | "path":"/Repos/***/test_folder", 219 | "url":"https://github.com/test_repo_profile/test_repo_one", 220 | "provider":"gitHub", 221 | "branch":"main", 222 | "head_commit_id":"test_commit_id" 223 | } 224 | ] 225 | 226 | mock_get_repos_with_management_permissions.return_value = (mock_get_repos_with_management_permissions_json_return, 200) 227 | 228 | # mock return value from update repo 229 | mock_update_repo.return_value = 200 230 | 231 | # call main function 232 | status_code = main() 233 | 234 | # assert main function returns 200 235 | assert status_code == 200 236 | 237 | 238 | # assert mock functions were called using correct arguments 239 | mock_ingest_repo_param_file.assert_called_once_with('mlOps/devOps/params/test_environment/repos.json') 240 | mock_update_repo.assert_called_once_with("61449681029719", "test_branch") 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | --------------------------------------------------------------------------------
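A minimal sketch of the control flow that test_main_success implies for main() in dbx_utils/utils_repo_pull.py. That module is not included in this listing, so anything beyond what the tests pin down (the param-file path, the path matching, the id being passed as a string) is an assumption rather than the actual implementation.

# Hypothetical sketch only: _ingest_repo_param_file, get_repos_with_management_permissions
# and update_repo are the real helpers imported at the top of test_utils_repo_pull.py;
# the body below is inferred from the mocked calls and may differ from the real module.
import os

def main():
    # Load the Repo_Configuration entries for the current environment's repos.json
    environment = os.environ["ENVIRONMENT"]
    repo_configuration = _ingest_repo_param_file(
        f"mlOps/devOps/params/{environment}/repos.json"
    )

    # List the repos the service principal can manage via the Databricks Repos API
    managed_repos, status_code = get_repos_with_management_permissions()

    # Match each configured repo folder to a workspace repo and pull the configured branch
    for repo in repo_configuration:
        for managed_repo in managed_repos:
            if managed_repo["path"].endswith(repo["path"]):
                status_code = update_repo(str(managed_repo["id"]), repo["branch"])

    return status_code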