├── .azure_devops └── workflows │ ├── 1-master-pipelines │ ├── cicd-pipeline.yaml │ └── manual-pipeline.yaml │ ├── 2-jobs-pipelines │ └── job-deployment.yaml │ └── 3-steps-pipelines │ ├── step-deployment.yaml │ └── step-pr-tests.yaml ├── .dbx ├── lock.json └── project.json ├── .github └── workflows │ ├── onDeploy.yaml │ ├── onRelease.yaml │ └── taskDatabricks.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── New Text Document.txt ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── dataops └── src_nyc_taxi │ ├── data_quality.py │ └── transform.py ├── docs ├── README.md └── images │ ├── AppInsightConnectionString.jpg │ ├── AzureResources.JPG │ ├── Azure_Machine_Learning_GIF.gif │ ├── DatabricksNotebookExecution.JPG │ ├── DatabricksORGIDandHOSTID.JPG │ ├── DatabricksTokenGeneration.jpg │ ├── DevContainer.jpg │ ├── DockerImageLoad.jpg │ ├── InstallExtensions.jpg │ ├── MLOps_for_databricks_Solution_Acclerator_logo.JPG │ ├── OutputOfTheConfigurationStep.jpg │ ├── Overview.JPG │ ├── PipelineSteps.JPG │ ├── PowershellScreen.jpg │ ├── SecretsFileImage.jpg │ ├── SuccessfulClusterCreation.JPG │ ├── Verify_Python_Interpreter.jpg │ ├── YoutubeThumbNail.png │ ├── cluster-upload-wheel.jpg │ ├── databricks-connect-pass.jpg │ ├── dstoolitgif.gif │ ├── final.jpg │ ├── map01.png │ ├── map02.png │ ├── map03.png │ ├── map04.png │ ├── map05.png │ ├── map06.png │ ├── map07.png │ ├── pythonversion.jpg │ └── workspaceselection.jpg ├── experiments ├── notebooks │ └── ciaran_experiments │ │ └── nyc_taxi │ │ └── nyc_taxi_lgbm_1.py └── pipelines │ └── ciaran_experiments │ ├── workflow.yaml │ └── workflow_configs │ ├── featurization.yaml │ ├── training.yaml │ └── workflow_params.yaml ├── infrastructure ├── bicep │ ├── az_templates │ │ ├── az_app_insights │ │ │ └── az_app_insights.bicep │ │ ├── az_data_lake │ │ │ └── az_data_lake.bicep │ │ ├── az_databricks │ │ │ └── az_databricks.bicep │ │ ├── az_key_vault │ │ │ └── az_key_vault.bicep │ │ └── az_machine_learning │ │ │ └── az_machine_learning.bicep │ ├── main.bicep │ └── params │ │ ├── development │ │ └── bicep.parameters.json │ │ ├── production │ │ └── bicep.parameters.json │ │ ├── sandbox │ │ └── bicep.parameters.json │ │ └── uat │ │ └── bicep.parameters.json └── databricks │ └── databricks_configs │ ├── development │ ├── clusters.json │ ├── rbac.json │ └── repos.json │ ├── production │ ├── clusters.json │ ├── rbac.json │ └── repos.json │ ├── sandbox │ ├── clusters.json │ ├── rbac.json │ └── repos.json │ └── uat │ ├── clusters.json │ ├── rbac.json │ └── repos.json ├── mlops └── nyc_taxi │ ├── aml_pipelines │ ├── v1 │ │ └── nyc_pipeline.py │ └── v2 │ │ └── dontdelete │ │ ├── databricks │ │ └── listclusters.py │ │ ├── dependencies │ │ └── conda.yaml │ │ └── pipelines │ │ └── databricks.ipynb │ ├── databricks_workflows │ ├── nyc_taxi.yaml │ └── unit_tests.yaml │ └── monitoring │ ├── data_drift_monitor.py │ ├── mflow_experiment_dashboard_pbi.py │ └── model_serving_monitor.py ├── pyproject.toml ├── score.py ├── setup.ps1 ├── src └── pkg │ ├── dbx_utils │ ├── __init__.py │ ├── common.py │ ├── utils_azure_login.py │ ├── utils_azure_login.sh │ ├── utils_create_aad_tokens.py │ ├── utils_create_aad_tokens.sh │ ├── utils_create_azure_resources.py │ ├── utils_create_azure_resources.sh │ ├── utils_create_cluster.py │ ├── utils_create_databricks_token.sh │ ├── utils_create_key_vault_secrets.sh │ ├── utils_create_repo_folder.py │ ├── utils_create_repo_folder.sh │ ├── utils_create_role_based_access.sh │ ├── utils_create_secret_scopes.py │ ├── utils_create_secret_scopes.sh │ ├── 
utils_git_configuration.py │ ├── utils_repo_pull.py │ ├── utils_repo_pull.sh │ └── utils_set_env_vars.sh │ ├── nyc_taxi │ ├── build │ │ └── lib │ │ │ ├── common │ │ │ └── __init__.py │ │ │ ├── evaluation │ │ │ └── __init__.py │ │ │ ├── featurization │ │ │ └── __init__.py │ │ │ ├── prediction │ │ │ └── __init__.py │ │ │ ├── registration │ │ │ └── __init__.py │ │ │ └── training │ │ │ └── __init__.py │ ├── common │ │ └── __init__.py │ ├── dist │ │ ├── src_nyc_taxi-0.0.1-py3-none-any.whl │ │ └── src_nyc_taxi-0.0.1.tar.gz │ ├── entrypoint.py │ ├── evaluation │ │ └── __init__.py │ ├── featurization │ │ └── __init__.py │ ├── prediction │ │ └── __init__.py │ ├── pyproject.toml │ ├── registration │ │ └── __init__.py │ ├── setup.cfg │ ├── setup.py │ ├── src_nyc_taxi.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── requires.txt │ │ └── top_level.txt │ └── training │ │ └── __init__.py │ └── wine_quality │ ├── combined_wine_data.csv │ ├── wine_quality.py │ └── winedata.csv └── test ├── entrypoint.py └── test_dbx_utils_pkg ├── test_utils_azure_login.py ├── test_utils_create_azure_resources.py ├── test_utils_create_cluster.py ├── test_utils_create_repo_folder.py └── test_utils_repo_pull.py /.azure_devops/workflows/1-master-pipelines/cicd-pipeline.yaml: -------------------------------------------------------------------------------- 1 | # UPDATES PENDING - MAY NOT WORK 2 | 3 | 4 | name: Databricks Deployment 5 | 6 | trigger: 7 | branches: 8 | include: 9 | - main 10 | - dev 11 | #- features/* 12 | - releases/* 13 | paths: 14 | exclude: 15 | - README.md 16 | 17 | #pr: none 18 | 19 | pool: 20 | vmImage: 'ubuntu-latest' 21 | 22 | 23 | #Secrets 24 | variables: 25 | - group: ADO-Secrets 26 | - name: isPR 27 | value: $[startsWith(variables['Build.SourceBranch'], 'refs/pull/')] 28 | 29 | - name: isMain 30 | value: $[eq(variables['Build.SourceBranch'], 'refs/heads/main')] 31 | 32 | - name: isPRFeatureOnMain 33 | value: $[eq(variables['System.PullRequest.SourceBranch'], 'features/*')] 34 | 35 | 36 | #resources: 37 | # repositories: 38 | # - repository: self 39 | # ref: 'refs/heads/$(branchName)' 40 | 41 | # PROTECT THE MAIN BRANCH SO YOU CANT PUSH DIRECTLY TO IT 42 | stages: 43 | - stage: developmentDeploy 44 | condition: and(eq(variables['Build.SourceBranchName'], 'dev'), eq(variables['Build.Reason'], 'IndividualCI')) 45 | displayName: developmentDeploy 46 | jobs: 47 | - template: ..\2-jobs-pipelines\job-deployment.yaml 48 | parameters: 49 | environment: development 50 | azureSubscription: DBX_ADO_DSTOOLKIT 51 | branchName: $(Build.SourceBranchName) 52 | 53 | # This will deploy code on the source branch for the PR. If PR from Feature to Dev, then this will deploy Feature. 
54 | - stage: pullRequestChecks 55 | condition: and(startsWith(variables['system.pullRequest.sourceBranch'], 'features/'), eq(variables['system.pullRequest.targetBranch'], 'dev')) 56 | displayName: pullRequestChecks 57 | jobs: 58 | - template: ..\3-steps-pipelines\step-pr-tests.yaml 59 | parameters: 60 | environment: development 61 | azureSubscription: DBX_ADO_DSTOOLKIT 62 | branchName: $(Build.SourceBranchName) 63 | 64 | 65 | # Change To Main Branch --> Deploy To Test Environment 66 | - stage: uatDeploy 67 | displayName: uatDeploy 68 | condition: and(eq(variables['Build.SourceBranchName'], 'main'), eq(variables['Build.Reason'], 'IndividualCI')) 69 | jobs: 70 | - template: ..\2-jobs-pipelines\job-deployment.yaml 71 | parameters: 72 | environment: uat 73 | azureSubscription: DBX_ADO_DSTOOLKIT 74 | branchName: $(Build.SourceBranchName) 75 | 76 | 77 | - stage: pullRequestChecks_dev_to_main 78 | condition: and(eq(variables['system.pullRequest.sourceBranch'], 'dev'), eq(variables['system.pullRequest.targetBranch'], 'main')) 79 | displayName: pullRequestChecks 80 | jobs: 81 | - template: ..\3-steps-pipelines\step-pr-tests.yaml 82 | parameters: 83 | environment: uat 84 | azureSubscription: DBX_ADO_DSTOOLKIT 85 | branchName: $(Build.SourceBranchName) 86 | 87 | 88 | # Tag Release Branch --> Deploy To Production Environment 89 | # - stage: ProductionDeploy 90 | # displayName: ProductionDeploy 91 | # condition: and(startsWith(variables['Build.SourceBranch'], 'refs/tags/v'), eq(variables['Build.Reason'], 'IndividualCI')) 92 | # jobs: 93 | # - template: ..\2-Jobs\Job-Databricks.yaml 94 | # parameters: 95 | # Environment: Production 96 | # azureSubscription: DBX_ADO_DSTOOLKIT 97 | # enableRepoPull: true 98 | # branchName: $(Build.SourceBranchName) 99 | # updateFolder: DevelopmentFolder 100 | 101 | 102 | # Implement a condition to ignore a azure resource deployment if Infra folder is unchanged. 
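# A minimal sketch (an assumption, not part of the original pipeline) of how the
# condition above could be implemented. job-deployment.yaml already checks out with
# fetchDepth: 2, so a script step can diff the last two commits, flag whether anything
# under infrastructure/ changed, and expose that as an output variable for later steps
# or stages to gate on. The names infra_check and INFRA_CHANGED are illustrative only.
#
# - script: |
#     if git diff HEAD~1 HEAD --name-only | grep -q '^infrastructure/'; then
#       echo "##vso[task.setvariable variable=INFRA_CHANGED;isOutput=true]true"
#     else
#       echo "##vso[task.setvariable variable=INFRA_CHANGED;isOutput=true]false"
#     fi
#   name: infra_check
#   displayName: Detect infrastructure changes
#
# A later step in the same job could then be gated with:
#   condition: eq(variables['infra_check.INFRA_CHANGED'], 'true')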
103 | # https://pumpingco.de/blog/run-an-azure-pipelines-job-only-if-source-code-has-changed/ 104 | -------------------------------------------------------------------------------- /.azure_devops/workflows/1-master-pipelines/manual-pipeline.yaml: -------------------------------------------------------------------------------- 1 | name: Service Principal MLOps Databricks Deployment 2 | 3 | trigger: none 4 | pr: none 5 | 6 | pool: 7 | vmImage: ubuntu-latest 8 | 9 | variables: 10 | - group: ADO-Secrets 11 | 12 | parameters: 13 | - name: environment 14 | displayName: Choose Environment 15 | type: string 16 | default: sandbox 17 | values: 18 | - sandbox 19 | - development 20 | - uat 21 | - production 22 | - all 23 | 24 | - name: azureSubscription 25 | displayName: Enter Service Connection Name 26 | default: DBX_ADO_DSTOOLKIT 27 | type: string 28 | 29 | stages: 30 | - stage: sandboxDeployment 31 | condition: or( eq('${{ parameters.environment }}', 'sandbox'), eq('${{ parameters.environment }}', 'all')) 32 | displayName: sandboxDeployment 33 | jobs: 34 | - template: ..\2-jobs-pipelines\job-deployment.yaml 35 | parameters: 36 | environment: sandbox 37 | azureSubscription: ${{ parameters.azureSubscription }} 38 | branchName: main 39 | 40 | - stage: developmentDeployment 41 | condition: or( eq('${{ parameters.environment }}', 'development'), eq('${{ parameters.environment }}', 'all')) 42 | displayName: developmentDeployment 43 | dependsOn: [] 44 | jobs: 45 | - template: ..\2-jobs-pipelines\job-deployment.yaml 46 | parameters: 47 | environment: development 48 | azureSubscription: ${{ parameters.azureSubscription }} 49 | branchName: main 50 | 51 | - stage: uatDeployment 52 | condition: or( eq('${{ parameters.ENVIRONMENT }}', 'uat'), eq('${{ parameters.ENVIRONMENT }}', 'all')) 53 | displayName: uatDeployment 54 | dependsOn: [] 55 | jobs: 56 | - template: ..\2-jobs-pipelines\job-deployment.yaml 57 | parameters: 58 | environment: uat 59 | azureSubscription: ${{ parameters.azureSubscription }} 60 | releaseBranch: 'release/1' 61 | 62 | 63 | -------------------------------------------------------------------------------- /.azure_devops/workflows/2-jobs-pipelines/job-deployment.yaml: -------------------------------------------------------------------------------- 1 | parameters: 2 | environment: String 3 | azureSubscription: String 4 | branchName: String 5 | 6 | jobs: 7 | - deployment: databricks_mlops_${{ parameters.environment }} 8 | displayName: databricks_mlops_${{ parameters.environment }} 9 | variables: 10 | - name: PYSPARK_PYTHON 11 | value: python3.9 12 | environment: ${{ parameters.environment }} 13 | strategy: 14 | runOnce: 15 | deploy: 16 | steps: 17 | - checkout: self 18 | fetchDepth: 2 19 | #ref: ${{ parameters.branchName }} 20 | # Paramount for fetchDepth to 2 for Git File Changes Check 21 | - template: ../3-steps-pipelines/step-deployment.yaml 22 | parameters: 23 | azureSubscription: ${{ parameters.azureSubscription }} 24 | environment: ${{ parameters.environment }} 25 | branchName: ${{ parameters.branchName }} 26 | -------------------------------------------------------------------------------- /.azure_devops/workflows/3-steps-pipelines/step-pr-tests.yaml: -------------------------------------------------------------------------------- 1 | parameters: 2 | azureSubscription: String 3 | environment: String 4 | branchName: String 5 | 6 | steps: 7 | 8 | - task: UsePythonVersion@0 9 | inputs: 10 | versionSpec: '3.8' 11 | architecture: 'x64' 12 | 13 | - script: | 14 | sudo apt update && sudo apt 
install jq -y 15 | python -m pip install requests python-dotenv poetry databricks-cli setuptools 16 | python -m pip install azure-cli==2.49.0 azure-mgmt-storage==21.0.0 17 | az extension add -n azure-cli-ml 18 | #python -m pip install azureml azureml-core azureml-pipeline 19 | displayName: Install Packages 20 | 21 | - script: | 22 | az config set extension.use_dynamic_install=yes_without_prompt 23 | az extension add --name databricks 24 | displayName: Configure Azure CLI 25 | 26 | - script: | 27 | python -m poetry install 28 | displayName: 'Install Testing Requirements locally' 29 | 30 | - bash: | 31 | mkdir -p tests 32 | displayName: 'Create Unit Test Directory' 33 | 34 | #- script: | 35 | # python -m poetry run pylint --output-format=pylint_junit.JUnitReporter:tests/testresults.xml src/pkg/dbx_utils 36 | # displayName: 'Pylinting dbx_utils' 37 | 38 | - script: | 39 | python -m poetry run bandit -rv src/pkg/dbx_utils/ 40 | displayName: 'Security Checks Bandit' 41 | continueOnError: true 42 | 43 | 44 | - task: AzureCLI@2 45 | displayName: Generate AAD Tokens 46 | name: aad_tokens 47 | continueOnError: true 48 | inputs: 49 | azureSubscription: ${{ parameters.azureSubscription }} 50 | scriptType: bash 51 | scriptLocation: scriptPath 52 | scriptPath: $(Build.SourcesDirectory)/src/pkg/dbx_utils/utils_create_aad_tokens.sh 53 | env: 54 | DBX_RESOURCE_ID: 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d 55 | 56 | - script: | 57 | python -m poetry run python $(Build.SourcesDirectory)/src/pkg/dbx_utils/utils_azure_login.py 58 | displayName: Azure Login 59 | continueOnError: true 60 | env: 61 | ARM_CLIENT_ID : $(ARM_CLIENT_ID) 62 | ARM_TENANT_ID: $(ARM_TENANT_ID) 63 | ARM_CLIENT_SECRET: $(ARM_CLIENT_SECRET) 64 | 65 | 66 | - task: AzureCLI@2 67 | displayName: Set Environment Variables 68 | name: "env_variables" 69 | inputs: 70 | scriptType: bash 71 | scriptLocation: scriptPath 72 | azureSubscription: ${{ parameters.azureSubscription }} 73 | scriptPath: $(Build.SourcesDirectory)/src/pkg/dbx_utils/utils_set_env_vars.sh 74 | env: 75 | ENVIRONMENT: ${{ parameters.environment }} 76 | DevOps_Agent: "Azure DevOps Agent" 77 | 78 | 79 | - script: | 80 | set -e 81 | python -m poetry run dbx configure 82 | 83 | python -m poetry run dbx execute DatabricksUtilsTesting \ 84 | --deployment-file mlops/nyc_taxi/databricks_workflows/unit_tests.yaml \ 85 | --cluster-name=ml_cluster 86 | 87 | databricks fs cp dbfs:/FileStore/databricks_utils_unit_testresults.xml $(Build.ArtifactStagingDirectory)/databricks_utils_unit_testresults.xml 88 | 89 | databricks fs cp dbfs:/FileStore/databricks_utils_cov_report.xml $(Build.ArtifactStagingDirectory)/databricks_utils_cov_report.xml 90 | 91 | displayName: Unit Testing - DBX Cluster 92 | env: 93 | DATABRICKS_TOKEN: $(aad_tokens.DATABRICKS_AAD_TOKEN) 94 | DATABRICKS_HOST: $(env_variables.DATABRICKS_HOST) 95 | 96 | - task: PublishTestResults@2 97 | inputs: 98 | testResultsFormat: 'JUnit' 99 | testResultsFiles: '$(Build.ArtifactStagingDirectory)/*_testresults.xml' 100 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version) - Unit Test results' 101 | condition: succeededOrFailed() 102 | displayName: 'Publish Unit Test Results' 103 | 104 | - task: PublishTestResults@2 105 | inputs: 106 | testResultsFormat: 'JUnit' 107 | testResultsFiles: '**/*_testresults.xml' 108 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version) - Linting Test results' 109 | condition: succeededOrFailed() 110 | displayName: 'Publish Linting Test 
Results' 111 | 112 | 113 | - task: PublishCodeCoverageResults@1 114 | inputs: 115 | codeCoverageTool: Cobertura 116 | summaryFileLocation: '$(Build.ArtifactStagingDirectory)/*_cov_report.xml' 117 | displayName: 'Publish Coverage Results' 118 | condition: succeededOrFailed() 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /.dbx/lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "context_id": "8789636436477646098" 3 | } -------------------------------------------------------------------------------- /.dbx/project.json: -------------------------------------------------------------------------------- 1 | { 2 | "environments": { 3 | "default": { 4 | "profile": "DEFAULT", 5 | "storage_type": "mlflow", 6 | "properties": { 7 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 8 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 9 | } 10 | }, 11 | "sandbox": { 12 | "profile": "sandbox", 13 | "storage_type": "mlflow", 14 | "properties": { 15 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 16 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 17 | } 18 | }, 19 | "development": { 20 | "profile": "development", 21 | "storage_type": "mlflow", 22 | "properties": { 23 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 24 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 25 | } 26 | }, 27 | "uat": { 28 | "profile": "uat", 29 | "storage_type": "mlflow", 30 | "properties": { 31 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 32 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 33 | } 34 | }, 35 | "production": { 36 | "profile": "production", 37 | "storage_type": "mlflow", 38 | "properties": { 39 | "workspace_directory": "/Shared/dbx/projects/dstoolkit-mlops-databricks", 40 | "artifact_location": "dbfs:/dbx/dstoolkit-mlops-databricks" 41 | } 42 | }, 43 | "ciaran_sandbox": { 44 | "profile": "ciaran_sandbox", 45 | "workspace_dir": "/Shared/ciaran_sandbox", 46 | "artifact_location": "dbfs:/Shared/cicd_workflows/ciaran_sandbox" 47 | } 48 | }, 49 | "inplace_jinja_support": true, 50 | "failsafe_cluster_reuse_with_assets": false, 51 | "context_based_upload_for_execute": false 52 | } -------------------------------------------------------------------------------- /.github/workflows/onDeploy.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: onDeploy Databricks 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | script_language: 7 | type: choice 8 | description: Python or Bash (Databricks API) 9 | options: 10 | - python 11 | - bash 12 | default: python 13 | 14 | jobs: 15 | ReuseableMatrixJobForDeployment: 16 | name: Master Deployment 17 | strategy: 18 | matrix: 19 | targetEnvironment: [ sandbox ] # development, uat, production 20 | uses: ./.github/workflows/taskDatabricks.yaml 21 | with: 22 | ENVIRONMENT: ${{ matrix.targetEnvironment }} 23 | DBX_REPO_BRANCH: 'main' 24 | SCRIPT_LANGUAGE: ${{ github.event.inputs.script_language }} 25 | DevOps_Agent: GitHub 26 | secrets: 27 | ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} 28 | ARM_CLIENT_SECRET: ${{ secrets.ARM_CLIENT_SECRET }} 29 | ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} 30 | PAT_GITHUB: ${{ secrets.PAT_GITHUB }} 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- 
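onDeploy.yaml above is triggered manually (workflow_dispatch) with a single script_language input. Assuming the GitHub CLI is installed and authenticated against this repository, a run could be dispatched with something along these lines (the target branch here is an assumption):

    gh workflow run onDeploy.yaml --ref main -f script_language=python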
/.github/workflows/onRelease.yaml: -------------------------------------------------------------------------------- 1 | 2 | 3 | name: onRelease Databricks 4 | on: 5 | pull_request: 6 | branches: 7 | - main 8 | - feature/** 9 | - release/** 10 | tags: 11 | - 'v**' 12 | types: 13 | - opened 14 | - closed 15 | 16 | jobs: 17 | pr_CI_Development: 18 | 19 | if: github.event_name == 'pull_request' && github.event.action == 'opened' && github.base_ref == 'main' 20 | name: Checks 21 | runs-on: ubuntu-latest 22 | steps: 23 | - run: | 24 | echo "Insert Continuous Integration Tests" 25 | 26 | # IMPORTANT: The testing framework is not yet implemented, and therefore still under development. 27 | 28 | #cd mlOps/devOps/utils 29 | 30 | #python -m pytest -v 31 | 32 | prApproved_CD_Development: 33 | if: github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged == true && contains(github.head_ref, 'feature') && github.base_ref == 'main' 34 | uses: ./.github/workflows/taskDatabricks.yaml 35 | with: 36 | ENVIRONMENT: development 37 | DBX_REPO_BRANCH: main 38 | SCRIPT_LANGUAGE: python 39 | DevOps_Agent: GitHub 40 | secrets: 41 | ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} 42 | ARM_CLIENT_SECRET: ${{ secrets.ARM_CLIENT_SECRET }} 43 | ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} 44 | PAT_GITHUB: ${{ secrets.PAT_GITHUB }} 45 | 46 | pr_CI_UAT: 47 | 48 | if: github.event_name == 'pull_request' && github.event.action == 'opened' && contains(github.base_ref, 'release') 49 | name: Checks 50 | runs-on: ubuntu-latest 51 | steps: 52 | - run: | 53 | echo "Insert Continuous Integration Tests" 54 | - run: | 55 | echo "${{ github.head_ref }}" 56 | echo "${{ github.base_ref }}" 57 | 58 | prApproved_CD_UAT: 59 | if: github.event_name == 'pull_request' && github.event.action == 'closed' && github.head_ref == 'main' && contains(github.base_ref, 'release') 60 | uses: ./.github/workflows/taskDatabricks.yaml 61 | with: 62 | ENVIRONMENT: uat 63 | DBX_REPO_BRANCH: 'release/1' 64 | SCRIPT_LANGUAGE: python 65 | DevOps_Agent: GitHub 66 | secrets: 67 | ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} 68 | ARM_CLIENT_SECRET: ${{ secrets.ARM_CLIENT_SECRET }} 69 | ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} 70 | PAT_GITHUB: ${{ secrets.PAT_GITHUB }} 71 | 72 | 73 | # git tag -a v1.0.1 -m "my version 1.0.1" 74 | # git tag -l 75 | # git push origin v1.0.1 76 | # git tag -d v1.0.1 77 | # ( Create Security Rule That Only Allow Release Branch to be Tagged). 
78 | 79 | prApproved_CD_Production: 80 | if: ${{ startsWith(github.ref, 'refs/tags/v') }} 81 | uses: ./.github/workflows/taskDatabricks.yaml 82 | with: 83 | ENVIRONMENT: production 84 | DBX_REPO_BRANCH: 'release/1' 85 | SCRIPT_LANGUAGE: python 86 | DevOps_Agent: GitHub 87 | secrets: 88 | ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} 89 | ARM_CLIENT_SECRET: ${{ secrets.ARM_CLIENT_SECRET }} 90 | ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} 91 | PAT_GITHUB: ${{ secrets.PAT_GITHUB }} 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/.vscode/ 2 | .venv 3 | .databricks 4 | deprecated 5 | mlOps/modelOps/data_science/deprecated 6 | mlOps/devOps/params/deprecated 7 | .dbx/sync 8 | localdev 9 | azureDevOps 10 | .env 11 | .venv_dbx_con13 12 | dev.env 13 | data_science/src_nyc_taxi/build 14 | data_science/src_nyc_taxi/dist 15 | data_science/src_nyc_taxi/*/__pycache__ 16 | data_science/src_nyc_taxi/src_nyc_taxi.egg-info 17 | data_science/src_nyc_taxi/src_nyc_taxi-0.0.1 18 | poetry.lock 19 | .linux_venv 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /New Text Document.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/New Text Document.txt -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /dataops/src_nyc_taxi/data_quality.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/dataops/src_nyc_taxi/data_quality.py -------------------------------------------------------------------------------- /dataops/src_nyc_taxi/transform.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md ## Define data preprocessing helper functions 7 | 8 | # COMMAND ---------- 9 | 10 | from uszipcode import SearchEngine 11 | import sqlite3 12 | import pandas as pd 13 | from pyspark.sql.functions import udf, col 14 | from pyspark.sql.types import IntegerType 15 | import math 16 | from urllib import request 17 | import os 18 | 19 | BAD_ZIPCODE_VALUE = 'bad_zipcode' 20 | file_location = "dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/" 21 | file_type = "csv" 22 | target_year = 2016 23 | 24 | def push_zipcode_data_to_executors(): 25 | # Download directly from github since the default download location can be flaky 26 | target_dir = '/tmp/db/' 27 | print(target_dir) 28 | target_file = os.path.join(target_dir, 'simple_db.sqlite') 29 | print(target_file) 30 | remote_url = 'https://github.com/MacHu-GWU/uszipcode-project/files/5183256/simple_db.log' 31 | os.makedirs(target_dir, exist_ok=True) 32 | print(os.makedirs(target_dir, exist_ok=True)) 33 | request.urlretrieve(remote_url, target_file) 34 | print(request.urlretrieve(remote_url, target_file)) 35 | # Query the zipcode database into a pandas dataframe 36 | #search = SearchEngine(db_file_dir=target_dir) 37 | conn = sqlite3.connect(target_file) 38 | pdf = pd.read_sql_query('''select zipcode, lat, lng, radius_in_miles, 39 | bounds_west, bounds_east, bounds_north, 
bounds_south from 40 | simple_zipcode''',conn) 41 | return sc.broadcast(pdf) 42 | 43 | # Define UDF to lookup ZIP code based on latitude and longitude 44 | @udf('string') 45 | def get_zipcode(lat, lng): 46 | if lat is None or lng is None: 47 | return BAD_ZIPCODE_VALUE 48 | dist_btwn_lat_deg = 69.172 49 | dist_btwn_lon_deg = math.cos(lat) * 69.172 50 | radius = 5 51 | lat_degr_rad = abs(radius / dist_btwn_lat_deg) 52 | lon_degr_rad = abs(radius / dist_btwn_lon_deg) 53 | lat_lower = lat - lat_degr_rad 54 | lat_upper = lat + lat_degr_rad 55 | lng_lower = lng - lon_degr_rad 56 | lng_upper = lng + lon_degr_rad 57 | pdf = zipcodes_broadcast_df.value 58 | try: 59 | out = pdf[(pdf['lat'].between(lat_lower, lat_upper)) & (pdf['lng'].between(lng_lower, lng_upper))] 60 | dist = [None]*len(out) 61 | for i in range(len(out)): 62 | dist[i] = (out['lat'].iloc[i]-lat)**2 + (out['lng'].iloc[i]-lng)**2 63 | zip = out['zipcode'].iloc[dist.index(min(dist))] 64 | except: 65 | zip = BAD_ZIPCODE_VALUE 66 | return zip 67 | 68 | def get_data_files(yyyy, months): 69 | data_files = [] 70 | for mm in months: 71 | mm = str(mm) if mm >= 10 else f"0{mm}" 72 | month_data_files = list(filter(lambda file_name: f"{yyyy}-{mm}" in file_name, 73 | [f.path for f in dbutils.fs.ls(file_location)])) 74 | data_files += month_data_files 75 | return data_files 76 | 77 | def load_data(data_files, sample=1.0): 78 | df = (spark.read.format("csv") 79 | .option("inferSchema", "true") 80 | .option("header", "true") 81 | .option("ignoreLeadingWhiteSpace", "true") 82 | .option("ignoreTrailingWhiteSpace", "true") 83 | .option("sep", ",") 84 | .load(data_files) 85 | ).sample(False, sample, 123) 86 | 87 | # Rename, cast types, and filter columns 88 | column_allow_list = { 89 | "pickup_datetime": ["tpep_pickup_datetime", "timestamp"], 90 | "tpep_pickup_datetime": ["tpep_pickup_datetime", "timestamp"], 91 | 92 | # type conversion 93 | "dropoff_datetime": ["tpep_dropoff_datetime", "timestamp"], 94 | "tpep_dropoff_datetime": ["tpep_dropoff_datetime", "timestamp"], 95 | 96 | "pickup_zip": ["pickup_zip", "integer"], 97 | "dropoff_zip": ["dropoff_zip", "integer"], 98 | "trip_distance": ["trip_distance", "double"], 99 | "fare_amount": ["fare_amount", "double"], 100 | "pickup_latitude": ["pickup_latitude", "double"], 101 | "pickup_longitude": ["pickup_longitude", "double"], 102 | "dropoff_latitude": ["dropoff_latitude", "double"], 103 | "dropoff_longitude": ["dropoff_longitude", "double"], 104 | } 105 | columns = [] 106 | for orig in df.columns: 107 | orig_lower = orig.lower() 108 | if orig_lower in column_allow_list: 109 | new_name, data_type = column_allow_list[orig_lower] 110 | columns.append(col(orig).cast(data_type).alias(new_name.lower())) 111 | 112 | return df.select(columns) 113 | 114 | def annotate_zipcodes(df): 115 | to_zip = lambda lat, lng: get_zipcode(col(lat).astype("double"), col(lng).astype("double")) 116 | # Add ZIP code columns, drop intermediate columns 117 | df = (df 118 | .withColumn('pickup_zip', to_zip("pickup_latitude", "pickup_longitude")) 119 | .withColumn('dropoff_zip', to_zip("dropoff_latitude", "dropoff_longitude")) 120 | .drop('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude') 121 | ) 122 | # Filter out rows with bad data 123 | df = df.filter(df.pickup_zip != BAD_ZIPCODE_VALUE) 124 | df = df.filter(df.dropoff_zip != BAD_ZIPCODE_VALUE) 125 | 126 | # Cast ZIP code to int 127 | df = df.withColumn("pickup_zip", df["pickup_zip"].cast(IntegerType())) 128 | df = df.withColumn("dropoff_zip", 
df["dropoff_zip"].cast(IntegerType())) 129 | return df 130 | 131 | def write_to_table(df, database, table): 132 | (df.write 133 | .format("delta") 134 | .mode("overwrite") 135 | .option("overwriteSchema", "true") 136 | .saveAsTable(f"{database}.{table}")) 137 | 138 | 139 | # COMMAND ---------- 140 | 141 | spark.sql("CREATE DATABASE IF NOT EXISTS feature_store_taxi_example;") 142 | 143 | # COMMAND ---------- 144 | 145 | # MAGIC %md ## Generate DataFrame and write to table 146 | 147 | # COMMAND ---------- 148 | 149 | # Read ZIP code data and push a broadcast dataframe to executors to speed up the UDF 150 | zipcodes_broadcast_df = push_zipcode_data_to_executors() 151 | 152 | # Generate data file names for the first 2 months of data in 2016 153 | data_files = get_data_files(target_year,months=[1,2]) 154 | 155 | # Load in a small subsample of data to speed things up for this example 156 | df = load_data(data_files, sample=.001) 157 | 158 | # Repartition -- by default this dataset only has a single partition. 159 | # Use a small parition count since the dataset is already small. 160 | df = df.repartition(6) 161 | 162 | # Enhance the DataFrame by converting latitude and longitude coordinates into ZIP codes 163 | df_with_zip = annotate_zipcodes(df) 164 | 165 | # Write the DataFrame to a Delta table 166 | write_to_table(df_with_zip, database="feature_store_taxi_example", table="nyc_yellow_taxi_with_zips") 167 | 168 | # COMMAND ---------- 169 | 170 | raw_data = spark.read.table("feature_store_taxi_example.nyc_yellow_taxi_with_zips") 171 | 172 | # COMMAND ---------- 173 | 174 | display(raw_data) 175 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /docs/images/AppInsightConnectionString.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/AppInsightConnectionString.jpg -------------------------------------------------------------------------------- /docs/images/AzureResources.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/AzureResources.JPG -------------------------------------------------------------------------------- /docs/images/Azure_Machine_Learning_GIF.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/Azure_Machine_Learning_GIF.gif -------------------------------------------------------------------------------- /docs/images/DatabricksNotebookExecution.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DatabricksNotebookExecution.JPG -------------------------------------------------------------------------------- /docs/images/DatabricksORGIDandHOSTID.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DatabricksORGIDandHOSTID.JPG -------------------------------------------------------------------------------- /docs/images/DatabricksTokenGeneration.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DatabricksTokenGeneration.jpg -------------------------------------------------------------------------------- /docs/images/DevContainer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DevContainer.jpg -------------------------------------------------------------------------------- /docs/images/DockerImageLoad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/DockerImageLoad.jpg -------------------------------------------------------------------------------- /docs/images/InstallExtensions.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/InstallExtensions.jpg -------------------------------------------------------------------------------- /docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/MLOps_for_databricks_Solution_Acclerator_logo.JPG -------------------------------------------------------------------------------- /docs/images/OutputOfTheConfigurationStep.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/OutputOfTheConfigurationStep.jpg -------------------------------------------------------------------------------- /docs/images/Overview.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/Overview.JPG -------------------------------------------------------------------------------- /docs/images/PipelineSteps.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/PipelineSteps.JPG -------------------------------------------------------------------------------- /docs/images/PowershellScreen.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/PowershellScreen.jpg -------------------------------------------------------------------------------- /docs/images/SecretsFileImage.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/SecretsFileImage.jpg -------------------------------------------------------------------------------- /docs/images/SuccessfulClusterCreation.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/SuccessfulClusterCreation.JPG -------------------------------------------------------------------------------- /docs/images/Verify_Python_Interpreter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/Verify_Python_Interpreter.jpg -------------------------------------------------------------------------------- /docs/images/YoutubeThumbNail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/YoutubeThumbNail.png -------------------------------------------------------------------------------- /docs/images/cluster-upload-wheel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/cluster-upload-wheel.jpg -------------------------------------------------------------------------------- /docs/images/databricks-connect-pass.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/databricks-connect-pass.jpg -------------------------------------------------------------------------------- /docs/images/dstoolitgif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/dstoolitgif.gif -------------------------------------------------------------------------------- /docs/images/final.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/final.jpg -------------------------------------------------------------------------------- /docs/images/map01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map01.png -------------------------------------------------------------------------------- /docs/images/map02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map02.png -------------------------------------------------------------------------------- /docs/images/map03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map03.png 
-------------------------------------------------------------------------------- /docs/images/map04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map04.png -------------------------------------------------------------------------------- /docs/images/map05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map05.png -------------------------------------------------------------------------------- /docs/images/map06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map06.png -------------------------------------------------------------------------------- /docs/images/map07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/map07.png -------------------------------------------------------------------------------- /docs/images/pythonversion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/pythonversion.jpg -------------------------------------------------------------------------------- /docs/images/workspaceselection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/docs/images/workspaceselection.jpg -------------------------------------------------------------------------------- /experiments/notebooks/ciaran_experiments/nyc_taxi/nyc_taxi_lgbm_1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from featurization import run_feature_store_refresh 3 | run_feature_store_refresh() 4 | 5 | # COMMAND ---------- 6 | 7 | from training import run_training 8 | 9 | num_rounds_arr = [20,40,60,80,100,120,170] 10 | 11 | for num_rounds in num_rounds_arr: 12 | run_training( 13 | experiment_name = "ciaran_experiment_nyc_taxi", 14 | model_name = "taxi_example_fare_packaged", 15 | model_params = { 16 | "objective": "regression", 17 | "metric": "rmse", 18 | "num_leaves": 25, 19 | "learning_rate": 0.2, 20 | "bagging_fraction": 0.9, 21 | "feature_fraction": 0.9, 22 | "bagging_seed": 42, 23 | "verbosity": -1, 24 | "seed": 42, 25 | "num_rounds": num_rounds 26 | } 27 | ) 28 | from registration import run_registration 29 | run_registration( 30 | model_name = "taxi_example_fare_packaged" 31 | ) 32 | -------------------------------------------------------------------------------- /experiments/pipelines/ciaran_experiments/workflow.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | 3 | # Cluster configs for each environment 4 | default-cluster-spec: &default-cluster-spec 5 | spark_version: '11.3.x-cpu-ml-scala2.12' 6 | node_type_id: 'Standard_DS3_v2' 7 | driver_node_type_id: 'Standard_DS3_v2' 8 | num_workers: 1 9 | # To reduce start up time for each job, it is advisable 
to use a cluster pool. To do so involves supplying the following 10 | # two fields with a pool_id to acquire both the driver and instances from. 11 | # If driver_instance_pool_id and instance_pool_id are set, both node_type_id and driver_node_type_id CANNOT be supplied. 12 | # As such, if providing a pool_id for driver and worker instances, please ensure that node_type_id and driver_node_type_id are not present 13 | # driver_instance_pool_id: '0617-151415-bells2-pool-hh7h6tjm' 14 | # instance_pool_id: '0617-151415-bells2-pool-hh7h6tjm' 15 | 16 | dev-cluster-config: &dev-cluster-config 17 | new_cluster: 18 | <<: *default-cluster-spec 19 | 20 | staging-cluster-config: &staging-cluster-config 21 | new_cluster: 22 | <<: *default-cluster-spec 23 | 24 | prod-cluster-config: &prod-cluster-config 25 | new_cluster: 26 | <<: *default-cluster-spec 27 | 28 | #build: 29 | # no_build: true 30 | build: 31 | python: poetry 32 | #python: "poetry build -f wheel" 33 | 34 | environments: 35 | default: 36 | workflows: 37 | - name: NEW_FUNCTION 38 | tasks: 39 | - task_key: "NEW_FUNCTION" 40 | existing_cluster_id: "0524-153828-e2rk9h52" 41 | spark_python_task: 42 | python_file: "{{var['parameters']['file_path']}}" 43 | 44 | - name: FEATURE_TABLE_REFRESH 45 | tasks: 46 | - task_key: "FEATURE_TABLE_REFRESH" 47 | existing_cluster_id: "0524-153828-e2rk9h52" 48 | spark_python_task: 49 | python_file: "{{var['parameters']['file_path']}}" 50 | 51 | - name: MODEL_TRAINING 52 | tasks: 53 | - task_key: "MODEL_TRAINING" 54 | existing_cluster_id: "0524-153828-e2rk9h52" 55 | spark_python_task: 56 | python_file: "{{var['parameters']['file_path']}}" -------------------------------------------------------------------------------- /experiments/pipelines/ciaran_experiments/workflow_configs/featurization.yaml: -------------------------------------------------------------------------------- 1 | parameters: 2 | file_path: 'file://data_science/src_nyc_taxi/featurization/__init__.py' -------------------------------------------------------------------------------- /experiments/pipelines/ciaran_experiments/workflow_configs/training.yaml: -------------------------------------------------------------------------------- 1 | parameters: 2 | file_path: 'file://data_science/src_nyc_taxi/training/__init__.py' 3 | -------------------------------------------------------------------------------- /experiments/pipelines/ciaran_experiments/workflow_configs/workflow_params.yaml: -------------------------------------------------------------------------------- 1 | ML_PIPELINE_FILES: 2 | 3 | DATA_INGEST_PREP: 4 | FILE_PATH: '/Repos/ciaranh@microsoft.com/experiments/mlOps/dataOps/nyc_taxi/data_prep' 5 | WHL_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/pyWheels/Helper_Functions/dist/helperfunctions-0.0.1-py3-none-any.whl' 6 | 7 | FEATURE_ENGINEERING: 8 | FILE_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/feature_eng.py' 9 | WHL_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/pyWheels/Helper_Functions/dist/helperfunctions-0.0.1-py3-none-any.whl' 10 | PARAMETERS: 11 | ENV: '--env' 12 | FILE: 'file:fuse://mlOps/modelOps/ml_pipelines/az_databricks/cicd/workflow_params.yaml' 13 | EXPERIMENT_NAME: 'dbx_workflow_fe' 14 | TRACK_IN_AZURE_ML: True 15 | 16 | TRAIN_REGISTER: 17 | FILE_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi//train_register.py' 18 | WHL_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/pyWheels/Helper_Functions/dist/helperfunctions-0.0.1-py3-none-any.whl' 19 | PARAMETERS: 20 | ENV: '--env' 21 | FILE: 
'file:fuse://mlOps/modelOps/ml_pipelines/az_databricks/cicd/workflow_params.yaml' 22 | EXPERIMENT_NAME: 'dbx_workflow_train' 23 | TRACK_IN_AZURE_ML: True 24 | 25 | 26 | MODEL_INFERENCE: 27 | FILE_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/score.py' 28 | WHL_PATH: 'file://mlOps/modelOps/data_science/nyc_taxi/pyWheels/Helper_Functions/dist/helperfunctions-0.0.1-py3-none-any.whl' 29 | PARAMETERS: 30 | ENV: '--env' 31 | FILE: 'file:fuse://mlOps/modelOps/ml_pipelines/az_databricks/cicd/workflow_params.yaml' 32 | EXPERIMENT_NAME: 'dbx_workflow_inference' 33 | TRACK_IN_AZURE_ML: True 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_app_insights/az_app_insights.bicep: -------------------------------------------------------------------------------- 1 | param location string = 'uksouth' 2 | 3 | param logwsname string 4 | var varlogwsname = '${logwsname}${substring(uniqueString(resourceGroup().id), 0, 4)}' 5 | 6 | param appinsightname string 7 | var varappinsightname = '${appinsightname}${substring(uniqueString(resourceGroup().id), 0, 4)}' 8 | 9 | 10 | 11 | resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2021-12-01-preview' = { 12 | name: varlogwsname 13 | location: location 14 | properties: { 15 | sku: { 16 | name: 'PerGB2018' 17 | } 18 | retentionInDays: 30 19 | features: { 20 | legacy: 0 21 | searchVersion: 1 22 | enableLogAccessUsingOnlyResourcePermissions: true 23 | } 24 | workspaceCapping: { 25 | dailyQuotaGb: -1 26 | } 27 | publicNetworkAccessForIngestion: 'Enabled' 28 | publicNetworkAccessForQuery: 'Enabled' 29 | } 30 | } 31 | 32 | resource appInsight 'Microsoft.Insights/components@2020-02-02' = { 33 | name: varappinsightname 34 | location: location 35 | kind: 'web' 36 | properties: { 37 | Application_Type: 'web' 38 | Flow_Type: 'Redfield' 39 | Request_Source: 'IbizaAIExtension' 40 | WorkspaceResourceId: logAnalytics.id 41 | IngestionMode: 'LogAnalytics' 42 | publicNetworkAccessForIngestion: 'Enabled' 43 | publicNetworkAccessForQuery: 'Enabled' 44 | 45 | } 46 | } 47 | 48 | output azAppInsightsID string = appInsight.id 49 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_data_lake/az_data_lake.bicep: -------------------------------------------------------------------------------- 1 | // ################################################################################################################################################################// 2 | // Define Parameters 3 | // ################################################################################################################################################################// 4 | param storageConfig object 5 | param location string 6 | param containerNames array 7 | param ShouldCreateContainers bool = true 8 | param storageAccountName string 9 | param workspaceName string 10 | param resourceGroupName string 11 | param azKeyVaultName string 12 | 13 | 14 | // ################################################################################################################################################################// 15 | // Define Variables 16 | // ################################################################################################################################################################// 17 | var varstorageAccountName = '${storageAccountName}${substring(uniqueString(resourceGroup().id), 0, 4)}' 18 | 19 | 20 | 21 | // 
################################################################################################################################################################// 22 | // Deploy Storage Account Per Environment 23 | // ################################################################################################################################################################// 24 | 25 | resource azStorage 'Microsoft.Storage/storageAccounts@2021-08-01' = { 26 | name: varstorageAccountName 27 | location: location 28 | kind: storageConfig.kind 29 | sku: { 30 | name: storageConfig.sku_name 31 | } 32 | properties: { 33 | allowBlobPublicAccess: storageConfig.allowBlobPublicAccess 34 | isHnsEnabled: storageConfig.isHnsEnabled 35 | accessTier: storageConfig.accessTier 36 | } 37 | 38 | // Nested Resource Deployment - Containers within Storage Account 39 | resource blobServices 'blobServices' = { 40 | name: 'default' 41 | resource containersCreate 'containers' = [for ContainerName in containerNames: if (ShouldCreateContainers) { 42 | name: ContainerName 43 | properties: { 44 | publicAccess: 'None' 45 | } 46 | }] 47 | } 48 | } 49 | 50 | 51 | 52 | // ################################################################################################################################################################// 53 | // Outputs 54 | // ################################################################################################################################################################// 55 | // output storagekey string = listKeys(resourceId('Microsoft.Storage/storageAccounts', name), '2021-08-01').keys[0].value 56 | output varstorageAccountName string = azStorage.name 57 | output varstorageAccountID string = azStorage.id 58 | output workspaceName string = workspaceName 59 | output resourceGroupName string = resourceGroupName 60 | output azKeyVaultName string = azKeyVaultName 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_databricks/az_databricks.bicep: -------------------------------------------------------------------------------- 1 | // ################################################################################################################################################################// 2 | // Define Parameters 3 | // ################################################################################################################################################################// 4 | 5 | param azMachineLearningWSId string 6 | param location string 7 | param workspaceName string 8 | var varworkspaceName = '${workspaceName}-${substring(uniqueString(resourceGroup().id), 0, 4)}' 9 | var managedResourceGroupName = '${workspaceName}-mrg-${substring(uniqueString(resourceGroup().id), 0, 4)}' 10 | 11 | @allowed([ 12 | 'standard' 13 | 'premium' 14 | ]) 15 | param pricingTier string = 'premium' 16 | 17 | 18 | // ################################################################################################################################################################// 19 | // Define Variables 20 | // ################################################################################################################################################################// 21 | var roleDefinitionUser = guid('${resourceGroup().id}/8e3af657-a8ff-443c-a75c-2fe8c4bcb635/') 22 | 23 | 24 | 25 | // 
################################################################################################################################################################// 26 | // Deploy AzDatabricks Workspace 27 | // ################################################################################################################################################################// 28 | resource azDatabricksWS 'Microsoft.Databricks/workspaces@2023-02-01' = { 29 | name: varworkspaceName 30 | 31 | location: location 32 | properties: { 33 | managedResourceGroupId: '${subscription().id}/resourceGroups/${managedResourceGroupName}' 34 | publicNetworkAccess: 'Enabled' 35 | //parameters: { 36 | // amlWorkspaceId: { 37 | // value: azMachineLearningWSId 38 | // } 39 | //} 40 | authorizations: [ 41 | { 42 | principalId: '0e3c30b0-dd4e-4937-96ca-3fe88bd8f259' 43 | roleDefinitionId: roleDefinitionUser 44 | } 45 | ] 46 | 47 | } 48 | sku: { 49 | name: pricingTier 50 | } 51 | 52 | 53 | 54 | } 55 | 56 | //resource spRoleAssignment 'Microsoft.Authorization/roleAssignments@2020-10-01-preview' = { 57 | // name: guid(azDatabricksWS.id, roleDefinitionAzureEventHubsDataOwner) 58 | // dependsOn: [ 59 | // azDatabricksWS 60 | // ] 61 | // properties: { 62 | // principalId: 'ab926dd1-657d-4bb2-9987-c7857046d0dd' 63 | // roleDefinitionId: roleDefinitionUser 64 | // principalType: 'ServicePrincipal' 65 | // } 66 | //} 67 | 68 | 69 | output azDatabricksWorkspaceID string = azDatabricksWS.id 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_key_vault/az_key_vault.bicep: -------------------------------------------------------------------------------- 1 | 2 | param environment string 3 | param location string 4 | var keyVaultName = 'keyvault-${environment}-${substring(uniqueString(resourceGroup().id), 0, 4)}' 5 | 6 | 7 | resource azKeyVault 'Microsoft.KeyVault/vaults@2021-10-01' = { 8 | name: keyVaultName 9 | location: location 10 | properties: { 11 | sku: { 12 | family: 'A' 13 | name: 'premium' 14 | } 15 | tenantId: subscription().tenantId 16 | networkAcls: { 17 | defaultAction: 'Allow' 18 | bypass: 'AzureServices' 19 | ipRules: [] 20 | virtualNetworkRules: [] 21 | } 22 | enableRbacAuthorization: true // if this is false then you cannot use RBAC assignments, on acl (below). 
If true acl (below) is ignored 23 | enableSoftDelete: true 24 | enabledForTemplateDeployment: true 25 | accessPolicies: [ 26 | ] 27 | } 28 | 29 | } 30 | 31 | output azKeyVaultName string = azKeyVault.name 32 | output azKeyVaultID string = azKeyVault.id 33 | -------------------------------------------------------------------------------- /infrastructure/bicep/az_templates/az_machine_learning/az_machine_learning.bicep: -------------------------------------------------------------------------------- 1 | param location string 2 | param azAppInsightsID string 3 | param azKeyVaultID string 4 | param amlwsname string 5 | param amlblobname string 6 | 7 | 8 | var varamlblobname = '${amlblobname}${substring(uniqueString(resourceGroup().id), 0, 4)}' 9 | var varamlwsname = '${amlwsname}-${substring(uniqueString(resourceGroup().id), 0, 4)}' 10 | 11 | 12 | resource amlBlob 'Microsoft.Storage/storageAccounts@2021-08-01' = { 13 | name: varamlblobname 14 | location: location 15 | kind: 'StorageV2' 16 | sku: { 17 | name: 'Standard_LRS' 18 | } 19 | properties: { 20 | allowBlobPublicAccess: true 21 | isHnsEnabled: false 22 | accessTier: 'Hot' 23 | } 24 | } 25 | 26 | 27 | resource AzMachineLearning 'Microsoft.MachineLearningServices/workspaces@2023-04-01' = { 28 | name: varamlwsname 29 | location: location 30 | 31 | identity: { 32 | type: 'SystemAssigned' 33 | } 34 | properties: { 35 | publicNetworkAccess: 'Enabled' 36 | applicationInsights: azAppInsightsID 37 | storageAccount: amlBlob.id 38 | keyVault: azKeyVaultID 39 | } 40 | 41 | sku: { 42 | name: 'Enterprise' 43 | } 44 | 45 | } 46 | 47 | output azMachineLearningWSId string = AzMachineLearning.id 48 | -------------------------------------------------------------------------------- /infrastructure/bicep/main.bicep: -------------------------------------------------------------------------------- 1 | targetScope = 'subscription' 2 | 3 | param location string 4 | param environment string 5 | param storageConfig object 6 | param containerNames array 7 | param resourceGroupName string 8 | param workspaceName string 9 | param pricingTier string 10 | param ShouldCreateContainers bool = true 11 | param loganalyticswsname string 12 | param appInsightswsname string 13 | param storageAccountName string 14 | param TemplateParamFilePath string 15 | param TemplateFilePath string 16 | param AZURE_DATABRICKS_APP_ID string 17 | param MANAGEMENT_RESOURCE_ENDPOINT string 18 | param amlblobname string 19 | param amlwsname string 20 | 21 | // ################################################################################################################################################################// 22 | // Create Resource Group 23 | // ################################################################################################################################################################// 24 | resource azResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' = { 25 | dependsOn: [] 26 | name: resourceGroupName 27 | // Location of the Resource Group Does Not Have To Match That of The Resouces Within. 
Metadata for all resources within groups can reside in 'uksouth' below 28 | location: location 29 | } 30 | 31 | 32 | // ################################################################################################################################################################// 33 | // KEY VAULT - SELECT KV // 34 | // ################################################################################################################################################################// 35 | 36 | module azKeyVault 'az_templates/az_key_vault/az_key_vault.bicep' = { 37 | dependsOn: [ 38 | azResourceGroup 39 | 40 | ] 41 | scope: azResourceGroup 42 | name: 'azKeyVault' 43 | params: { 44 | environment: environment 45 | location: location 46 | } 47 | } 48 | 49 | // ################################################################################################################################################################// 50 | // Module for Create Azure Data Lake Storage 51 | // RBAC is assigned -> azDatabricks given access to Storage 52 | // ################################################################################################################################################################// 53 | module azDataLake 'az_templates/az_data_lake/az_data_lake.bicep' = { 54 | dependsOn: [ 55 | azResourceGroup 56 | ] 57 | scope: resourceGroup(resourceGroupName) 58 | name: 'azDataLake' 59 | params: { 60 | storageAccountName: storageAccountName 61 | storageConfig: storageConfig 62 | location: location 63 | containerNames: containerNames 64 | ShouldCreateContainers: ShouldCreateContainers 65 | workspaceName: workspaceName 66 | resourceGroupName: resourceGroupName 67 | azKeyVaultName: azKeyVault.outputs.azKeyVaultName 68 | 69 | 70 | } 71 | } 72 | 73 | 74 | module logAnalytics 'az_templates/az_app_insights/az_app_insights.bicep' = { 75 | dependsOn: [ 76 | azResourceGroup 77 | ] 78 | scope: resourceGroup(resourceGroupName) 79 | name: 'logAnalytics' 80 | params: { 81 | location: location 82 | logwsname: loganalyticswsname 83 | appinsightname: appInsightswsname 84 | } 85 | } 86 | 87 | 88 | // ################################################################################################################################################################// 89 | // Module for Creating Azure Machine Learning Workspace 90 | // Outputs AzDatabricks Workspace ID, which is used when Assigning RBACs. 
91 | // ################################################################################################################################################################// 92 | module azMachineLearning 'az_templates/az_machine_learning/az_machine_learning.bicep' = { 93 | dependsOn: [ 94 | logAnalytics 95 | azDataLake 96 | azKeyVault 97 | 98 | ] 99 | scope: resourceGroup(resourceGroupName) 100 | name: 'amlws' 101 | params: { 102 | location: location 103 | azAppInsightsID: logAnalytics.outputs.azAppInsightsID 104 | azKeyVaultID: azKeyVault.outputs.azKeyVaultID 105 | amlwsname: amlwsname 106 | amlblobname: amlblobname 107 | 108 | 109 | 110 | } 111 | } 112 | 113 | // ################################################################################################################################################################// 114 | // Module for Creating Azure Databricks Workspace 115 | // Outputs AzDatabricks Workspace ID, which is used when Assigning RBACs 116 | // ################################################################################################################################################################// 117 | 118 | module azDatabricks 'br/public:avm/res/databricks/workspace:0.1.0' = { 119 | dependsOn: [ 120 | azMachineLearning 121 | ] 122 | scope: resourceGroup(resourceGroupName) 123 | name: 'azDatabricks-dbrws' 124 | params: { 125 | name: workspaceName 126 | location: location 127 | skuName: pricingTier 128 | } 129 | } 130 | 131 | 132 | output azDatabricksWorkspaceID string = azDatabricks.outputs.resourceId 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /infrastructure/bicep/params/development/bicep.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "TemplateParamFilePath": { 6 | "value": "infrastructure/bicep/params/development/bicep.parameters.json" 7 | }, 8 | "TemplateFilePath": { 9 | "value": "infrastructure/bicep/main.bicep" 10 | }, 11 | "AZURE_DATABRICKS_APP_ID": { 12 | "value": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d" 13 | }, 14 | "MANAGEMENT_RESOURCE_ENDPOINT": { 15 | "value": "https://management.core.windows.net/" 16 | }, 17 | "location": { 18 | "value": "eastus" 19 | }, 20 | "environment": { 21 | "value": "dev" 22 | }, 23 | 24 | "containerNames": { 25 | "value": [ 26 | "bronze", 27 | "silver", 28 | "gold" 29 | ] 30 | }, 31 | "storageConfig": { 32 | "value": { 33 | "kind": "StorageV2", 34 | "sku_name": "Standard_LRS", 35 | "allowBlobPublicAccess": true, 36 | "isHnsEnabled": true, 37 | "accessTier": "Hot" 38 | } 39 | }, 40 | "resourceGroupName" :{ 41 | "value": "databricks-dev-rg" 42 | }, 43 | "workspaceName" : { 44 | "value": "dbxwsdev" 45 | }, 46 | "pricingTier": { 47 | "value": "premium" 48 | }, 49 | "ShouldCreateContainers": { 50 | "value": true 51 | }, 52 | "loganalyticswsname": { 53 | "value": "loganalyticsdev" 54 | }, 55 | "appInsightswsname": { 56 | "value": "appinsightsdev" 57 | }, 58 | "storageAccountName": { 59 | "value": "adlsdev" 60 | }, 61 | "amlwsname": { 62 | "value": "amldev" 63 | }, 64 | "amlblobname": { 65 | "value": "amlblobdev" 66 | } 67 | 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /infrastructure/bicep/params/production/bicep.parameters.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "TemplateParamFilePath": { 6 | "value": "infrastructure/bicep/params/production/bicep.parameters.json" 7 | }, 8 | "TemplateFilePath": { 9 | "value": "infrastructure/bicep/main.bicep" 10 | }, 11 | "AZURE_DATABRICKS_APP_ID": { 12 | "value": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d" 13 | }, 14 | "MANAGEMENT_RESOURCE_ENDPOINT": { 15 | "value": "https://management.core.windows.net/" 16 | }, 17 | "location": { 18 | "value": "eastus" 19 | }, 20 | "environment": { 21 | "value": "prod" 22 | }, 23 | "containerNames": { 24 | "value": [ 25 | "bronze", 26 | "silver", 27 | "gold" 28 | ] 29 | }, 30 | "storageConfig": { 31 | "value": { 32 | "kind": "StorageV2", 33 | "sku_name": "Standard_LRS", 34 | "allowBlobPublicAccess": true, 35 | "isHnsEnabled": true, 36 | "accessTier": "Hot" 37 | } 38 | }, 39 | "resourceGroupName" :{ 40 | "value": "databricks-prod-rg" 41 | }, 42 | "workspaceName" : { 43 | "value": "dbxwsprod" 44 | }, 45 | "pricingTier": { 46 | "value": "premium" 47 | }, 48 | "ShouldCreateContainers": { 49 | "value": true 50 | }, 51 | "loganalyticswsname": { 52 | "value": "loganalyticsprod" 53 | }, 54 | "appInsightswsname": { 55 | "value": "appinsightsprod" 56 | }, 57 | "storageAccountName": { 58 | "value": "adlsprod" 59 | }, 60 | "amlwsname": { 61 | "value": "amlprod" 62 | }, 63 | "amlblobname": { 64 | "value": "amlblobprod" 65 | } 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /infrastructure/bicep/params/sandbox/bicep.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "TemplateParamFilePath": { 6 | "value": "infrastructure/bicep/params/sandbox/bicep.parameters.json" 7 | }, 8 | "TemplateFilePath": { 9 | "value": "infrastructure/bicep/main.bicep" 10 | }, 11 | "AZURE_DATABRICKS_APP_ID": { 12 | "value": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d" 13 | }, 14 | "MANAGEMENT_RESOURCE_ENDPOINT": { 15 | "value": "https://management.core.windows.net/" 16 | }, 17 | "location": { 18 | "value": "eastus" 19 | }, 20 | "environment": { 21 | "value": "sandbox" 22 | }, 23 | 24 | "containerNames": { 25 | "value": [ 26 | "bronze", 27 | "silver", 28 | "gold" 29 | ] 30 | }, 31 | "storageConfig": { 32 | "value": { 33 | "kind": "StorageV2", 34 | "sku_name": "Standard_LRS", 35 | "allowBlobPublicAccess": true, 36 | "isHnsEnabled": true, 37 | "accessTier": "Hot" 38 | } 39 | }, 40 | "resourceGroupName" :{ 41 | "value": "databricks-sandbox-rg" 42 | }, 43 | "workspaceName" : { 44 | "value": "dbxwssandbox" 45 | }, 46 | "pricingTier": { 47 | "value": "premium" 48 | }, 49 | "ShouldCreateContainers": { 50 | "value": true 51 | }, 52 | "loganalyticswsname": { 53 | "value": "loganalyticssandbox" 54 | }, 55 | "appInsightswsname": { 56 | "value": "appinsightssandbox" 57 | }, 58 | "storageAccountName": { 59 | "value": "adlssandbox" 60 | }, 61 | "amlwsname": { 62 | "value": "amlsandbox" 63 | }, 64 | "amlblobname": { 65 | "value": "amlblobsandbox" 66 | } 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /infrastructure/bicep/params/uat/bicep.parameters.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "TemplateParamFilePath": { 6 | "value": "infrastructure/bicep/params/uat/bicep.parameters.json" 7 | }, 8 | "TemplateFilePath": { 9 | "value": "infrastructure/bicep/main.bicep" 10 | }, 11 | "AZURE_DATABRICKS_APP_ID": { 12 | "value": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d" 13 | }, 14 | "MANAGEMENT_RESOURCE_ENDPOINT": { 15 | "value": "https://management.core.windows.net/" 16 | }, 17 | "location": { 18 | "value": "eastus" 19 | }, 20 | "environment": { 21 | "value": "uat" 22 | }, 23 | "containerNames": { 24 | "value": [ 25 | "bronze", 26 | "silver", 27 | "gold" 28 | ] 29 | }, 30 | "storageConfig": { 31 | "value": { 32 | "kind": "StorageV2", 33 | "sku_name": "Standard_LRS", 34 | "allowBlobPublicAccess": true, 35 | "isHnsEnabled": true, 36 | "accessTier": "Hot" 37 | } 38 | }, 39 | "resourceGroupName" :{ 40 | "value": "databricks-uat-rg" 41 | }, 42 | "workspaceName" : { 43 | "value": "dbxwsuat" 44 | }, 45 | "pricingTier": { 46 | "value": "premium" 47 | }, 48 | "ShouldCreateContainers": { 49 | "value": true 50 | }, 51 | "loganalyticswsname": { 52 | "value": "loganalyticsuat" 53 | }, 54 | "appInsightswsname": { 55 | "value": "appinsightsuat" 56 | }, 57 | "storageAccountName": { 58 | "value": "adlsuat" 59 | }, 60 | "amlwsname": { 61 | "value": "amluat" 62 | }, 63 | "amlblobname": { 64 | "value": "amlblobuat" 65 | } 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/development/clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Clusters": [ 3 | 4 | { 5 | "cluster_name": "ml_cluster", 6 | "spark_version": "13.3.x-cpu-ml-scala2.12", 7 | "node_type_id": "Standard_DS3_v2", 8 | "spark_conf": { 9 | "spark.databricks.delta.preview.enabled": "true" 10 | }, 11 | "autotermination_minutes": 30, 12 | "runtime_engine": "STANDARD", 13 | "autoscale": { 14 | "min_workers": 2, 15 | "max_workers": 3 16 | } 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/development/rbac.json: -------------------------------------------------------------------------------- 1 | { 2 | "RBAC_Assignments": [ 3 | { 4 | "roles": [ 5 | "Key Vault Administrator" 6 | ], 7 | "roleBeneficiaryObjID": "d30dd2e8-25d0-49cb-b99a-80ae061aac2c", 8 | "Description": "Your Object ID", 9 | "principalType": "User" 10 | }, 11 | { 12 | "roles": [ 13 | "Contributor", 14 | "Key Vault Administrator", 15 | "Storage Blob Data Contributor" 16 | ], 17 | "roleBeneficiaryObjID": "eb578d1b-72d9-4aa7-97be-97ace3a8954e", 18 | "Description": "Databricks SPN", 19 | "principalType": "ServicePrincipal" 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/development/repos.json: -------------------------------------------------------------------------------- 1 | { 2 | "Git_Configuration": [ 3 | { 4 | "git_username": "clintgrove", 5 | "git_provider": "gitHub" 6 | } 7 | ], 8 | "Repo_Configuration": [ 9 | { 10 | "url": "https://github.com/clintgrove/dstoolkit-mlops-databricks", 11 | "provider": "gitHub", 12 | "path": "DevelopmentFolder", 13 | "branch": "main" 14 | } 15 | ] 16 | } 
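The per-environment folders above each carry the same three files: clusters.json (cluster definitions), rbac.json (role assignments), and repos.json (Git/repo wiring). Below is a minimal, hypothetical Python sketch of how such files could be loaded and sanity-checked before a deployment run; the directory layout is taken from the repository tree, while the function name and the assertions are illustrative only and are not part of the repository's own dbx_utils package.

import json
from pathlib import Path

REQUIRED_FILES = ("clusters.json", "rbac.json", "repos.json")

def load_environment_configs(environment: str) -> dict:
    """Load the Databricks config files for one environment and check their expected top-level keys."""
    base = Path("infrastructure/databricks/databricks_configs") / environment
    configs = {}
    for name in REQUIRED_FILES:
        with open(base / name, "r", encoding="utf-8") as file:
            configs[name] = json.load(file)
    # Top-level keys as they appear in the files shown above
    assert "Clusters" in configs["clusters.json"]
    assert "RBAC_Assignments" in configs["rbac.json"]
    assert "Repo_Configuration" in configs["repos.json"]
    return configs

# Example (hypothetical) usage:
# configs = load_environment_configs("development")
# print([c["cluster_name"] for c in configs["clusters.json"]["Clusters"]])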
-------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/production/clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Clusters": [ 3 | { 4 | "cluster_name": "ml_cluster", 5 | "spark_version": "13.0.x-cpu-ml-scala2.12", 6 | "node_type_id": "Standard_DS3_v2", 7 | "spark_conf": { 8 | }, 9 | "autotermination_minutes": 30, 10 | "runtime_engine": "STANDARD", 11 | "autoscale": { 12 | "min_workers": 2, 13 | "max_workers": 3 14 | } 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/production/rbac.json: -------------------------------------------------------------------------------- 1 | { 2 | "RBAC_Assignments": [ 3 | { 4 | "roles": [ 5 | "Key Vault Administrator" 6 | ], 7 | "roleBeneficiaryObjID": "d30dd2e8-25d0-49cb-b99a-80ae061aac2c", 8 | "Description": "Your Object ID", 9 | "principalType": "User" 10 | }, 11 | { 12 | "roles": [ 13 | "Contributor", 14 | "Key Vault Administrator", 15 | "Storage Blob Data Contributor" 16 | ], 17 | "roleBeneficiaryObjID": "eb578d1b-72d9-4aa7-97be-97ace3a8954e", 18 | "Description": "Databricks SPN", 19 | "principalType": "ServicePrincipal" 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/production/repos.json: -------------------------------------------------------------------------------- 1 | { 2 | "Git_Configuration": [ 3 | { 4 | "git_username": "clintgrove", 5 | "git_provider": "gitHub" 6 | } 7 | ], 8 | "Repo_Configuration": [ 9 | { 10 | "url": "https://github.com/clintgrove/dstoolkit-mlops-databricks", 11 | "provider": "gitHub", 12 | "path": "ProductionFolder", 13 | "branch": "main" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/sandbox/clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Clusters": [ 3 | { 4 | "cluster_name": "ml_cluster", 5 | "spark_version": "13.0.x-cpu-ml-scala2.12", 6 | "node_type_id": "Standard_DS3_v2", 7 | "spark_conf": { 8 | }, 9 | "autotermination_minutes": 30, 10 | "runtime_engine": "STANDARD", 11 | "autoscale": { 12 | "min_workers": 2, 13 | "max_workers": 3 14 | } 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/sandbox/rbac.json: -------------------------------------------------------------------------------- 1 | { 2 | "RBAC_Assignments": [ 3 | { 4 | "roles": [ 5 | "Key Vault Administrator" 6 | ], 7 | "roleBeneficiaryObjID": "d30dd2e8-25d0-49cb-b99a-80ae061aac2c", 8 | "Description": "Your Object ID", 9 | "principalType": "User" 10 | }, 11 | { 12 | "roles": [ 13 | "Contributor", 14 | "Key Vault Administrator", 15 | "Storage Blob Data Contributor", 16 | "AzureML Data Scientist" 17 | ], 18 | "roleBeneficiaryObjID": "eb578d1b-72d9-4aa7-97be-97ace3a8954e", 19 | "Description": "Databricks SPN", 20 | "principalType": "ServicePrincipal" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/sandbox/repos.json: -------------------------------------------------------------------------------- 1 | { 2 | "Git_Configuration": [ 3 | { 4 | "git_username": "clintgrove", 5 | "git_provider": 
"gitHub" 6 | } 7 | ], 8 | "Repo_Configuration": [ 9 | { 10 | "url": "https://github.com/clintgrove/dstoolkit-mlops-databricks", 11 | "provider": "gitHub", 12 | "path": "Sandbox", 13 | "branch": "main" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/uat/clusters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Clusters": [ 3 | 4 | { 5 | "cluster_name": "ml_cluster", 6 | "spark_version": "13.0.x-cpu-ml-scala2.12", 7 | "node_type_id": "Standard_DS3_v2", 8 | "spark_conf": { 9 | "spark.databricks.delta.preview.enabled": "true" 10 | }, 11 | "autotermination_minutes": 30, 12 | "runtime_engine": "STANDARD", 13 | "autoscale": { 14 | "min_workers": 2, 15 | "max_workers": 3 16 | } 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/uat/rbac.json: -------------------------------------------------------------------------------- 1 | { 2 | "RBAC_Assignments": [ 3 | { 4 | "roles": [ 5 | "Key Vault Administrator" 6 | ], 7 | "roleBeneficiaryObjID": "d30dd2e8-25d0-49cb-b99a-80ae061aac2c", 8 | "Description": "Your Object ID", 9 | "principalType": "User" 10 | }, 11 | { 12 | "roles": [ 13 | "Contributor", 14 | "Key Vault Administrator", 15 | "Storage Blob Data Contributor" 16 | ], 17 | "roleBeneficiaryObjID": "eb578d1b-72d9-4aa7-97be-97ace3a8954e", 18 | "Description": "Databricks SPN", 19 | "principalType": "ServicePrincipal" 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /infrastructure/databricks/databricks_configs/uat/repos.json: -------------------------------------------------------------------------------- 1 | { 2 | "Git_Configuration": [ 3 | { 4 | "git_username": "clintgrove", 5 | "git_provider": "gitHub" 6 | } 7 | ], 8 | "Repo_Configuration": [ 9 | { 10 | "url": "https://github.com/clintgrove/dstoolkit-mlops-databricks", 11 | "provider": "gitHub", 12 | "path": "UATFolder", 13 | "branch": "main" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /mlops/nyc_taxi/aml_pipelines/v1/nyc_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from azureml.core import Workspace, Experiment 4 | from azureml.core.compute import ComputeTarget, AmlCompute 5 | from azureml.pipeline.steps import PythonScriptStep, DatabricksStep 6 | from azureml.pipeline.core import Pipeline, PipelineData, StepSequence 7 | from azureml.core.compute import ComputeTarget, DatabricksCompute 8 | from azureml.exceptions import ComputeTargetException 9 | from azureml.core.authentication import ServicePrincipalAuthentication 10 | 11 | 12 | DATABRICKS_COMPUTE_NAME = os.environ['DATABRICKS_COMPUTE_NAME'] 13 | DATABRICKS_CLUSTER_NAME = os.environ['DATABRICKS_CLUSTER_NAME'] 14 | RESOURCE_GROUP_NAME = os.environ['RESOURCE_GROUP_NAME'] 15 | DATABRICKS_AAD_TOKEN = os.environ['DATABRICKS_AAD_TOKEN'] 16 | DATABRICKS_INSTANCE = os.environ['DATABRICKS_INSTANCE'] 17 | DATABRICKS_WS_NAME = os.environ['DATABRICKS_WS_NAME'] 18 | WORKSPACE_ID = os.environ['WORKSPACE_ID'] 19 | SUBSCRIPTION_ID = os.environ['SUBSCRIPTION_ID'] 20 | ARM_CLIENT_SECRET = os.environ['ARM_CLIENT_SECRET'] 21 | ARM_TENANT_ID = os.environ['ARM_TENANT_ID'] 22 | ARM_CLIENT_ID = os.environ['ARM_CLIENT_ID'] 23 | DATABRICKS_MANAGEMENT_TOKEN = 
os.environ['DATABRICKS_MANAGEMENT_TOKEN'] 24 | ENVIRONMENT = os.environ['ENVIRONMENT'] 25 | AML_WS_NAME = os.environ['AML_WS_NAME'] 26 | 27 | DBRKS_REQ_HEADERS = { 28 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 29 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 30 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 31 | 'Content-Type': 'application/json' 32 | } 33 | 34 | #print(DATABRICKS_COMPUTE_NAME) 35 | #print(WORKSPACE_ID) 36 | #print(RESOURCE_GROUP_NAME) 37 | #print(DATABRICKS_AAD_TOKEN) 38 | #print(DATABRICKS_MANAGEMENT_TOKEN) 39 | #print(DATABRICKS_INSTANCE) 40 | #print(SUBSCRIPTION_ID) 41 | #print(ARM_CLIENT_SECRET) 42 | #print(ARM_TENANT_ID) 43 | #print(ARM_CLIENT_ID) 44 | #print(AML_WS_NAME) 45 | 46 | class GetClusterID(): 47 | def __init__(self, cluster_name): 48 | self.clusters_obj = self.list_clusters()['clusters'] 49 | self.cluster_name = cluster_name 50 | def get_cluster_id(self): 51 | """ 52 | Returns Cluster ID for a given cluster name. 53 | """ 54 | for cluster in self.clusters_obj: 55 | if cluster['cluster_name'] == self.cluster_name: 56 | print("ml_cluster exists") 57 | cluster_id = cluster['cluster_id'] 58 | print(cluster_id) 59 | return cluster_id 60 | def list_clusters(self): 61 | """ 62 | Returns a Json object containing a list of existing Databricks Clusters. 63 | """ 64 | response = requests.get('https://' + DATABRICKS_INSTANCE + '/api/2.0/clusters/list', headers=DBRKS_REQ_HEADERS) 65 | if response.status_code != 200: 66 | raise Exception(response.content) 67 | else: 68 | return response.json() 69 | 70 | def create_pipeline_structure(databricks_compute, ws, cluster_id): 71 | print('Creating the pipeline structure') 72 | 73 | nyc_taxi_e2e_mlops = DatabricksStep( 74 | name="nyc_taxi_e2e_mlops", 75 | notebook_path="/Repos/"+ ARM_CLIENT_ID + "/Sandbox/data_science/src_nyc_taxi/src.py", 76 | #notebook_params={'myparam': 'testparam', 77 | # 'myparam2': pipeline_param}, 78 | run_name='nyc_taxi_e2e_mlops', 79 | compute_target=databricks_compute, 80 | existing_cluster_id=cluster_id, 81 | allow_reuse=True, 82 | num_workers=3 83 | ) 84 | 85 | 86 | step_sequence = StepSequence(steps=[nyc_taxi_e2e_mlops]) 87 | pipeline = Pipeline(workspace=ws, steps=step_sequence) 88 | pipeline.validate() 89 | 90 | return pipeline 91 | 92 | 93 | if __name__ == "__main__": 94 | svc_pr = ServicePrincipalAuthentication( 95 | tenant_id = ARM_TENANT_ID, 96 | service_principal_id = ARM_CLIENT_ID, 97 | service_principal_password = ARM_CLIENT_SECRET 98 | ) 99 | ws = Workspace( 100 | subscription_id=SUBSCRIPTION_ID, 101 | resource_group=RESOURCE_GROUP_NAME, 102 | workspace_name=AML_WS_NAME, 103 | auth=svc_pr 104 | ) 105 | 106 | #print(f" AML Workspace Properties: {ws} ") 107 | 108 | try: 109 | databricks_compute = DatabricksCompute(workspace=ws, name=DATABRICKS_COMPUTE_NAME) 110 | print('Compute target {} already exists'.format(DATABRICKS_COMPUTE_NAME)) 111 | except ComputeTargetException: 112 | #print('Compute not found, will use below parameters to attach new one') 113 | #print('db_compute_name {}'.format(DATABRICKS_COMPUTE_NAME)) 114 | #print('db_resource_group {}'.format(RESOURCE_GROUP_NAME)) 115 | #print('db_workspace_name {}'.format(DATABRICKS_WS_NAME)) 116 | #print('db_access_token {}'.format(DATABRICKS_AAD_TOKEN)) 117 | 118 | config = DatabricksCompute.attach_configuration( 119 | resource_group = RESOURCE_GROUP_NAME, 120 | workspace_name = DATABRICKS_WS_NAME, 121 | access_token= DATABRICKS_AAD_TOKEN) 122 | 
databricks_compute=ComputeTarget.attach(ws, DATABRICKS_COMPUTE_NAME, config) 123 | databricks_compute.wait_for_completion(True) 124 | 125 | 126 | cluster_obj = GetClusterID("ml_cluster") 127 | cluster_id = cluster_obj.get_cluster_id() 128 | 129 | 130 | 131 | #existingClusters = listClusters()['clusters'] 132 | #for cluster in existingClusters: 133 | # if cluster['cluster_name'] == "ml_cluster": 134 | # print("ml_cluster exists") 135 | # cluster_id = cluster['cluster_id'] 136 | # print(cluster_id) 137 | # else: 138 | # print("ml_cluster does not exist: cannot continue") 139 | #notebook_path=os.getenv("DATABRICKS_NOTEBOOK_PATH", "/Data_Scientist/featureEngineering.py") 140 | #notebook_path=os.getenv("DATABRICKS_NOTEBOOK_PATH", "databricks.ipynb") 141 | 142 | 143 | pipeline = create_pipeline_structure(databricks_compute=databricks_compute, ws=ws, cluster_id=cluster_id) 144 | published_pipeline = pipeline.publish("databricks_pipeline", version="1.0.0", description="Databricks Pipeline") 145 | 146 | 147 | -------------------------------------------------------------------------------- /mlops/nyc_taxi/aml_pipelines/v2/dontdelete/databricks/listclusters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | import logging 6 | import mlflow 7 | import requests 8 | import os 9 | 10 | #parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25) 11 | 12 | def main(): 13 | """Main function of the script.""" 14 | 15 | # input and output arguments 16 | parser = argparse.ArgumentParser() 17 | 18 | parser.add_argument("--data", type=str, help="path to input data") 19 | parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25) 20 | parser.add_argument("--train_data", type=str, help="path to train data") 21 | parser.add_argument("--test_data", type=str, help="path to test data") 22 | 23 | args = parser.parse_args() 24 | # Start Logging 25 | mlflow.start_run() 26 | 27 | print(" ".join(f"{k}={v}" for k, v in vars(args).items())) 28 | 29 | print("input data:", args.data) 30 | 31 | credit_df = pd.read_excel(args.data, header=1, index_col=0) 32 | 33 | mlflow.log_metric("num_samples", credit_df.shape[0]) 34 | mlflow.log_metric("num_features", credit_df.shape[1] - 1) 35 | 36 | credit_train_df, credit_test_df = train_test_split( 37 | credit_df, 38 | test_size=args.test_train_ratio, 39 | ) 40 | 41 | # output paths are mounted as folder, therefore, we are adding a filename to the path 42 | credit_train_df.to_csv(os.path.join(args.train_data, "data.csv"), index=False) 43 | 44 | credit_test_df.to_csv(os.path.join(args.test_data, "data.csv"), index=False) 45 | 46 | # Stop Logging 47 | mlflow.end_run() 48 | 49 | 50 | 51 | # Retrieve Tokens 52 | 53 | 54 | def createManagementToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL): 55 | """ 56 | Uses Our Service Principal Credentials To Generate Azure Active Directory Tokens 57 | """ 58 | 59 | tokenRequestBody['resource'] = 'https://management.core.windows.net/' 60 | 61 | response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody) 62 | 63 | if response.status_code == 200: 64 | print(response.status_code) 65 | 66 | else: 67 | raise Exception(response.text) 68 | 69 | return response.json()['access_token'] 70 | 71 | def createBearerToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL): 72 | """ 73 | Uses Our Service Principal Credentials To 
Generate Azure Active Directory Tokens 74 | """ 75 | 76 | tokenRequestBody['resource'] = '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d' 77 | 78 | response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody) 79 | 80 | if response.status_code == 200: 81 | print(response.status_code) 82 | 83 | else: 84 | raise Exception(response.text) 85 | 86 | return response.json()['access_token'] 87 | 88 | 89 | 90 | def listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE): 91 | """ 92 | Returns a Json object containing a list of existing Databricks Clusters. 93 | """ 94 | 95 | response = requests.get('https://' + DATABRICKS_INSTANCE + '/api/2.0/clusters/list', headers=DBRKS_REQ_HEADERS) 96 | 97 | if response.status_code != 200: 98 | raise Exception(response.content) 99 | 100 | else: 101 | return response.json() 102 | 103 | 104 | 105 | if __name__ == "__main__": 106 | 107 | # The sp credentials need to come in from key vault 108 | 109 | tokenRequestBody = { 110 | 'grant_type': 'client_credentials', 111 | 'client_id': ' ', 112 | 'client_secret': ' ' 113 | } 114 | tokenRequestHeaders = {'Content-Type': 'application/x-www-form-urlencoded'} 115 | tokenBaseURL = 'https://login.microsoftonline.com/' + ' ' + '/oauth2/token' 116 | 117 | DBRKS_BEARER_TOKEN = createBearerToken(tokenRequestBody=tokenRequestBody, 118 | tokenRequestHeaders=tokenRequestHeaders, 119 | tokenBaseURL=tokenBaseURL 120 | ) 121 | 122 | DBRKS_MANAGEMENT_TOKEN = createManagementToken(tokenRequestBody=tokenRequestBody, 123 | tokenRequestHeaders=tokenRequestHeaders, 124 | tokenBaseURL=tokenBaseURL 125 | ) 126 | 127 | 128 | DBRKS_REQ_HEADERS = { 129 | 'Authorization': f'Bearer {DBRKS_BEARER_TOKEN}', 130 | 'X-Databricks-Azure-SP-Management-Token': f'{DBRKS_MANAGEMENT_TOKEN}', 131 | 'X-Databricks-Azure-Workspace-Resource-Id': '/subscriptions/<>/resourceGroups/databricks-sandbox-rg/providers/Microsoft.Databricks/workspaces/dbxwssandbox-eco3', 132 | 'Content-Type': 'application/json' 133 | } 134 | DATABRICKS_INSTANCE = "adb-204110209##.#.azuredatabricks.net" 135 | 136 | existingClusters = listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE) 137 | 138 | print(existingClusters) 139 | -------------------------------------------------------------------------------- /mlops/nyc_taxi/aml_pipelines/v2/dontdelete/dependencies/conda.yaml: -------------------------------------------------------------------------------- 1 | name: model-env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.8 6 | - numpy=1.21.2 7 | - pip=21.2.4 8 | - scikit-learn=0.24.2 9 | - scipy=1.7.1 10 | - pandas>=1.1,<1.2 11 | - pip: 12 | - inference-schema[numpy-support]==1.3.0 13 | - xlrd==2.0.1 14 | - mlflow== 1.26.1 15 | - azureml-mlflow==1.42.0 16 | - pandas 17 | - requests -------------------------------------------------------------------------------- /mlops/nyc_taxi/databricks_workflows/nyc_taxi.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | basic-cluster-props: &basic-cluster-props 3 | spark_version: "13.0.x-cpu-ml-scala2.12" 4 | node_type_id: "Standard_DS3_v2" 5 | spark_env_vars: 6 | PYSPARK_PYTHON: "/databricks/python3/bin/python3" 7 | enable_elastic_disk: true 8 | runtime_engine: STANDARD 9 | autoscale: 10 | min_workers: 2 11 | max_workers: 8 12 | 13 | 14 | nyc_taxi_vars: &nyc_taxi_vars 15 | job_clusters: 16 | - job_cluster_key: training_job_cluster 17 | new_cluster: 18 | <<: *basic-cluster-props 19 | tasks: 20 | - task_key: "nyc_taxi_pipeline_data_engineering" 21 | job_cluster_key: 
"training_job_cluster" 22 | spark_python_task: 23 | python_file: "file://dataops/src_nyc_taxi/transform.py" 24 | libraries: [ 25 | whl: "file://src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1-py3-none-any.whl" 26 | ] 27 | 28 | - task_key: "nyc_taxi_pipeline_machine_learning" 29 | job_cluster_key: "training_job_cluster" 30 | spark_python_task: 31 | python_file: "file://src/pkg/nyc_taxi/entrypoint.py" 32 | libraries: [ 33 | whl: "file://src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1-py3-none-any.whl" 34 | ] 35 | depends_on: 36 | - task_key: "nyc_taxi_pipeline_data_engineering" 37 | 38 | build: 39 | no_build: true 40 | 41 | environments: 42 | default: 43 | workflows: 44 | - name: "DatabricksArtifacts" 45 | <<: *nyc_taxi_vars -------------------------------------------------------------------------------- /mlops/nyc_taxi/databricks_workflows/unit_tests.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | 3 | basic-cluster-props: &basic-cluster-props 4 | spark_version: "13.0.x-cpu-ml-scala2.12" 5 | node_type_id: "Standard_DS3_v2" 6 | spark_env_vars: 7 | PYSPARK_PYTHON: "/databricks/python3/bin/python3" 8 | enable_elastic_disk: true 9 | runtime_engine: STANDARD 10 | autoscale: 11 | min_workers: 2 12 | max_workers: 8 13 | 14 | databricks_utils_testing_vars: &databricks_utils_testing_vars 15 | job_clusters: 16 | - job_cluster_key: training_job_cluster 17 | new_cluster: 18 | <<: *basic-cluster-props 19 | tasks: 20 | - task_key: "unittests" 21 | job_cluster_key: "training_job_cluster" 22 | spark_python_task: 23 | python_file: "file://test/entrypoint.py" 24 | # this call supports all standard pytest arguments 25 | parameters: [ 26 | "file:fuse://test/test_dbx_utils_pkg/test_utils_create_cluster.py", 27 | "-o", 28 | "cache_dir=/dbfs/FileStore/", 29 | "--cov=dbx_utils", 30 | "--cov-append", 31 | "--cov-report=xml:/dbfs/FileStore/databricks_utils_cov_report.xml", 32 | "--cov-report=html:/dbfs/FileStore/htmlcov", 33 | "--junitxml=/dbfs/FileStore/databricks_utils_unit_testresults.xml" 34 | ] 35 | 36 | build: 37 | python: poetry 38 | 39 | environments: 40 | default: 41 | workflows: 42 | - name: "DatabricksUtilsTesting" 43 | <<: *databricks_utils_testing_vars 44 | -------------------------------------------------------------------------------- /mlops/nyc_taxi/monitoring/data_drift_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/mlops/nyc_taxi/monitoring/data_drift_monitor.py -------------------------------------------------------------------------------- /mlops/nyc_taxi/monitoring/mflow_experiment_dashboard_pbi.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | 4 | import mlflow 5 | from mlflow.tracking import MlflowClient 6 | 7 | mlflow_client = MlflowClient() 8 | experiment = mlflow_client.get_experiment_by_name("/Shared/ciaran_experiment_nyc_taxi") 9 | experiment_id = experiment.experiment_id 10 | 11 | 12 | df = mlflow.search_runs( 13 | experiment_ids=experiment_id 14 | ) 15 | 16 | display(df) 17 | 18 | df = df.rename(columns={"metrics.r2": "r2"}) 19 | display(df) 20 | df = df[df.end_time.notnull()] 21 | df = df[df.r2.notnull()] 22 | 23 | display(df) 24 | 25 | df2 = df.drop(df[df['status'] == "FINISHED"].index, inplace = True) 26 | 27 | display(df2) 28 | -------------------------------------------------------------------------------- 
/mlops/nyc_taxi/monitoring/model_serving_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/mlops/nyc_taxi/monitoring/model_serving_monitor.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "mlops_for_databricks" 3 | version = "0.1.0" 4 | description = "Full CICD deployment of mlops for Databricks" 5 | 6 | license = "MIT" 7 | 8 | authors = [ 9 | "Ciaran Hamill Diamond" 10 | ] 11 | 12 | repository = "https://github.com/python-poetry/poetry" 13 | homepage = "https://python-poetry.org" 14 | 15 | # README file(s) are used as the package description 16 | readme = ["README.md", "LICENSE"] 17 | 18 | # Keywords (translated to tags on the package index) 19 | keywords = ["packaging", "poetry"] 20 | 21 | packages = [ 22 | {include = "dbx_utils", from = "src/pkg"}, 23 | {include = "nyc_taxi", from = "src/pkg"}, 24 | {include = "ciaran_experiments", from = "experiments/notebooks"} 25 | ] 26 | 27 | 28 | [tool.poetry.dependencies] 29 | # All Packages Destined For Databricks Cluster 30 | # Only Install Pacakages Here That Are Not Already Installed On Databricks Cluster 31 | 32 | python = ">=3.8, <3.11" 33 | numpy = "^1.24.3" 34 | dbx = "^0.8.18" 35 | pytest = "^7.3.2" 36 | pytest-cov = "^4.1.0" 37 | bandit = "1.7.4" 38 | pylint = "2.15.0" 39 | pylint_junit = "0.3.2" 40 | python-dotenv = "1.0.0" 41 | pyspark = "3.2.1" 42 | delta-spark = "1.1.0" 43 | packaging = "21.*" 44 | mlflow-databricks-artifacts = "2.0.0" 45 | databricks-cli = "0.17.7" 46 | scikit-learn = "^1.2.2" 47 | lightgbm = "^3.3.5" 48 | pyyaml = "^6.0" 49 | pathlib = "^1.0.1" 50 | argparse = "^1.4.0" 51 | 52 | 53 | [tool.poetry.extras] 54 | [tool.poetry.scripts] -------------------------------------------------------------------------------- /score.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import json 4 | import numpy 5 | import joblib 6 | 7 | 8 | def init(): 9 | """ 10 | This function is called when the container is initialized/started, typically after create/update of the deployment. 11 | You can write the logic here to perform init operations like caching the model in memory 12 | """ 13 | global model 14 | # AZUREML_MODEL_DIR is an environment variable created during deployment. 15 | # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) 16 | # Please provide your model's folder name if there is one 17 | model_path = os.path.join( 18 | os.getenv("AZUREML_MODEL_DIR"), "model/sklearn_regression_model.pkl" 19 | ) 20 | # deserialize the model file back into a sklearn model 21 | model = joblib.load(model_path) 22 | logging.info("Init complete") 23 | 24 | 25 | def run(raw_data): 26 | """ 27 | This function is called for every invocation of the endpoint to perform the actual scoring/prediction. 
28 | In the example we extract the data from the json input and call the scikit-learn model's predict() 29 | method and return the result back 30 | """ 31 | logging.info("model 1: request received") 32 | data = json.loads(raw_data)["data"] 33 | data = numpy.array(data) 34 | result = model.predict(data) 35 | logging.info("Request processed") 36 | return result.tolist() -------------------------------------------------------------------------------- /setup.ps1: -------------------------------------------------------------------------------- 1 | # Create The Service Principal 2 | # WARNING: DO NOT DELETE OUTPUT 3 | 4 | $SubscriptionId=( az account show --query id -o tsv ) 5 | 6 | $main_sp_name="main_sp_"+$(Get-Random -Minimum 1000 -Maximum 9999) 7 | 8 | # use --sdk-auth flag if using GitHub Action Azure Authenticator 9 | $DBX_CREDENTIALS=( az ad sp create-for-rbac -n $main_sp_name --role Owner --scopes /subscriptions/$SubscriptionId --query "{ARM_TENANT_ID:tenant, ARM_CLIENT_ID:appId, ARM_CLIENT_SECRET:password}") 10 | 11 | 12 | # Service Principal Credentials 13 | $DBX_CREDENTIALS=( $DBX_CREDENTIALS | convertfrom-json ) 14 | #echo $DBX_CREDENTIALS 15 | $Client_ID=( $DBX_CREDENTIALS.ARM_CLIENT_ID ) 16 | 17 | 18 | # Retrieve Object IDs 19 | $main_sp_name_obj_id=( az ad sp show --id $Client_ID --query "{roleBeneficiaryObjID:id}" -o tsv ) 20 | 21 | echo "Back Stop Command For Older Azure CLI Command" 22 | 23 | if ($main_sp_name_obj_id -eq "None" ) { $main_sp_name_obj_id=( az ad sp show --id $Client_ID --query "{roleBeneficiaryObjID:objectId}" -o tsv ) } 24 | 25 | 26 | 27 | $User_ObjID=( az ad signed-in-user show --query "{roleBeneficiaryObjID:id}" -o tsv ) 28 | 29 | echo "Back Stop Command For Older Azure CLI Command" 30 | 31 | if ($User_ObjID -eq "None" ) { $User_ObjID=( az ad signed-in-user show --query "{roleBeneficiaryObjID: objectId}" -o tsv ) } 32 | 33 | 34 | 35 | 36 | echo "Update The Variable Files" 37 | $environments = @('sandbox', 'development', 'uat', 'production') 38 | foreach ($environment in $environments) 39 | { 40 | $JsonData = Get-Content infrastructure/databricks/databricks_configs/$environment/repos.json -raw | ConvertFrom-Json 41 | foreach ($Obj in $JsonData.Git_Configuration) 42 | { 43 | ($Obj.git_username = $Git_Configuration ) 44 | } 45 | foreach ($Obj in $JsonData.Repo_Configuration) 46 | { 47 | ($Obj.url = $Repo_ConfigurationURL ) 48 | } 49 | $JsonData | ConvertTo-Json -Depth 4 | set-content infrastructure/databricks/databricks_configs/$environment/repos.json -NoNewline 50 | } 51 | 52 | foreach ($environment in $environments) 53 | { 54 | $JsonData = Get-Content infrastructure/databricks/databricks_configs/$environment/rbac.json -raw | ConvertFrom-Json 55 | $JsonData.RBAC_Assignments | % {if($_.Description -eq 'Your Object ID'){$_.roleBeneficiaryObjID=$User_ObjID}} 56 | $JsonData.RBAC_Assignments | % {if($_.Description -eq 'Databricks SPN'){$_.roleBeneficiaryObjID=$main_sp_name_obj_id}} 57 | $JsonData | ConvertTo-Json -Depth 4 | set-content infrastructure/databricks/databricks_configs/$environment/rbac.json -NoNewline 58 | } 59 | 60 | git add . 61 | git commit . -m 'Demo Deployment Commit' 62 | 63 | git config core.autocrlf false 64 | git rm --cached -r . 
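# The hard reset below re-checks out the working tree, so the core.autocrlf change made above
# takes effect on tracked files before the subsequent pull and push.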
65 | git reset --hard 66 | git pull 67 | git push 68 | 69 | # Secret Configuration 70 | 71 | echo "Credentials Used In Later Step - Do Not Delete" 72 | echo $DBX_CREDENTIALS -------------------------------------------------------------------------------- /src/pkg/dbx_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/src/pkg/dbx_utils/__init__.py -------------------------------------------------------------------------------- /src/pkg/dbx_utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | 4 | def get_databricks_request_headers() -> Dict[str, str]: 5 | """ 6 | Gets the Databricks headers required for API calls using 7 | the Databricks AAD token, Databricks Management token and 8 | Databricks Workspace ID from the environment variables. 9 | 10 | :return: databricks_req_headers 11 | :type: dict 12 | """ 13 | workspace_id = os.environ.get("WORKSPACE_ID") 14 | databricks_aad_token = os.environ.get("DATABRICKS_AAD_TOKEN") 15 | databricks_mgmt_token = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 16 | 17 | databricks_req_headers = { 18 | 'Authorization': f'Bearer {databricks_aad_token}', 19 | 'X-Databricks-Azure-SP-Management-Token': f'{databricks_mgmt_token}', 20 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{workspace_id}', 21 | 'Content-Type': 'application/json' 22 | } 23 | return databricks_req_headers -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_azure_login.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is used to login to Azure using a service principal 3 | """ 4 | import os 5 | import subprocess 6 | 7 | 8 | ARM_CLIENT_ID = os.environ['ARM_CLIENT_ID'] 9 | ARM_CLIENT_SECRET = os.environ['ARM_CLIENT_SECRET'] 10 | ARM_TENANT_ID = os.environ['ARM_TENANT_ID'] 11 | 12 | 13 | def run_cmd(cmd): 14 | """ 15 | Run a command and return the output as a list of lines 16 | shell=false for devops pipelines 17 | """ 18 | process = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, shell=False) 19 | output = process.stdout.decode().split('\n') 20 | #print(output) 21 | output = [ 22 | line.strip('\n').strip('\r').strip('"') for line in output 23 | if line.strip('\n').strip('\r') 24 | ] 25 | #import pdb; pdb.set_trace() 26 | #print(f"Return Code: {process.returncode}"). 
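# Note: subprocess.run() is called with check=True above, so a non-zero exit code already raises
# CalledProcessError before reaching this point; the explicit check below is a defensive fallback.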
27 | if process.returncode != 0: 28 | raise RuntimeError('\n'.join(output)) 29 | return output, process.returncode 30 | 31 | 32 | def start_azure_login(): 33 | """ 34 | Login to Azure using the service principal 35 | """ 36 | az_login_cmd = ["az", "login", "--service-principal", 37 | "-u", ARM_CLIENT_ID, 38 | "-p", ARM_CLIENT_SECRET, 39 | "--tenant", ARM_TENANT_ID 40 | ] 41 | print("Logging In To Azure") 42 | #_, returncode = run_cmd(az_login_cmd) 43 | output, returncode = run_cmd(az_login_cmd) 44 | return returncode 45 | 46 | 47 | if __name__ == '__main__': 48 | returncode = start_azure_login() 49 | print(f"Return Code: {returncode}") 50 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_azure_login.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | az upgrade 4 | echo $ARM_CLIENT_ID 5 | echo $ARM_CLIENT_SECRET 6 | echo $ARM_TENANT_ID 7 | echo $AuthenticationType 8 | 9 | az config set extension.use_dynamic_install=yes_without_prompt 10 | 11 | 12 | echo "Service Principal Authentication" 13 | az login --service-principal -u $ARM_CLIENT_ID -p $ARM_CLIENT_SECRET --tenant $ARM_TENANT_ID 14 | az account list -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_aad_tokens.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | 4 | def createManagementToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL): 5 | """ 6 | Uses Our Service Principal Credentials To Generate Azure Active Directory Tokens 7 | """ 8 | 9 | tokenRequestBody['resource'] = 'https://management.core.windows.net/' 10 | 11 | response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody) 12 | 13 | if response.status_code == 200: 14 | print(response.status_code) 15 | 16 | else: 17 | raise Exception(response.text) 18 | 19 | return response.json()['access_token'] 20 | 21 | def createBearerToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL): 22 | """ 23 | Uses Our Service Principal Credentials To Generate Azure Active Directory Tokens 24 | """ 25 | 26 | tokenRequestBody['resource'] = '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d' 27 | 28 | response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody) 29 | 30 | if response.status_code == 200: 31 | print(response.status_code) 32 | 33 | else: 34 | raise Exception(response.text) 35 | 36 | return response.json()['access_token'] 37 | 38 | 39 | if __name__ == "__main__": 40 | tokenRequestBody = { 41 | 'grant_type': 'client_credentials', 42 | 'client_id': os.environ['ARM_CLIENT_ID'], 43 | 'client_secret': os.environ['ARM_CLIENT_SECRET'] 44 | } 45 | tokenRequestHeaders = {'Content-Type': 'application/x-www-form-urlencoded'} 46 | tokenBaseURL = 'https://login.microsoftonline.com/' + os.environ['ARM_TENANT_ID'] + '/oauth2/token' 47 | 48 | bearerToken = createBearerToken(tokenRequestBody=tokenRequestBody, 49 | tokenRequestHeaders=tokenRequestHeaders, 50 | tokenBaseURL=tokenBaseURL 51 | ) 52 | 53 | managementToken = createManagementToken(tokenRequestBody=tokenRequestBody, 54 | tokenRequestHeaders=tokenRequestHeaders, 55 | tokenBaseURL=tokenBaseURL 56 | ) 57 | 58 | os.environ['DATABRICKS_AAD_TOKEN'] = bearerToken 59 | os.environ['DATABRICKS_MANAGEMENT_TOKEN'] = managementToken 60 | 61 | print("DATABRICKS_AAD_TOKEN",os.environ['DATABRICKS_AAD_TOKEN']) 62 | 
print("DATABRICKS_MANAGEMENT_TOKEN",os.environ['DATABRICKS_MANAGEMENT_TOKEN']) 63 | 64 | with open(os.getenv('GITHUB_ENV'), 'a') as env: 65 | print(f'DATABRICKS_AAD_TOKEN={bearerToken}', file=env) 66 | print(f'DATABRICKS_MANAGEMENT_TOKEN={managementToken}', file=env) 67 | 68 | 69 | 70 | #print("##vso[task.setvariable variable=DATABRICKS_AAD_TOKEN;isOutput=true;]{b}".format(b=bearerToken)) 71 | #print("##vso[task.setvariable variable=DATABRICKS_MANAGEMENT_TOKEN;isOutput=true;]{b}".format(b=managementToken)) -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_aad_tokens.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | DATABRICKS_AAD_TOKEN=$( \ 4 | az account get-access-token \ 5 | --resource $DBX_RESOURCE_ID \ 6 | --query "accessToken" \ 7 | --output tsv \ 8 | ) 9 | 10 | DATABRICKS_MANAGEMENT_TOKEN=$( \ 11 | az account get-access-token \ 12 | --resource "https://management.core.windows.net/" \ 13 | --query "accessToken" \ 14 | --output tsv \ 15 | ) 16 | 17 | echo $DATABRICKS_AAD_TOKEN 18 | echo $DATABRICKS_MANAGEMENT_TOKEN 19 | 20 | echo "##vso[task.setvariable variable="DATABRICKS_MANAGEMENT_TOKEN";isOutput=true;]$DATABRICKS_MANAGEMENT_TOKEN" 21 | echo "##vso[task.setvariable variable="DATABRICKS_AAD_TOKEN";isOutput=true;]$DATABRICKS_AAD_TOKEN" -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_azure_resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import json 4 | 5 | __here__ = os.path.dirname(__file__) 6 | 7 | ENVIRONMENT = os.environ['ENVIRONMENT'] 8 | 9 | 10 | class LoadJson(): 11 | def __init__(self): 12 | self.json_file = 'infrastructure/bicep/params/' + ENVIRONMENT + '/bicep.parameters.json' 13 | 14 | def load_json(self): 15 | with open(self.json_file, 'r') as f: 16 | repos_config = json.load(f) 17 | return repos_config 18 | 19 | def get_param_file_path(self): 20 | return self.load_json()['parameters']['TemplateParamFilePath']['value'] 21 | 22 | def get_template_file_path(self): 23 | return self.load_json()['parameters']['TemplateFilePath']['value'] 24 | 25 | def get_location(self): 26 | return self.load_json()['parameters']['location']['value'] 27 | 28 | 29 | def run_cmd(cmd): 30 | #May Need To Rmove shell=True 31 | process = subprocess.run(cmd, stdout=subprocess.PIPE) 32 | #print(process) 33 | output = process.stdout.decode().split('\n') 34 | #print(output) 35 | output = [line.strip('\n').strip('\r') for line in output] 36 | #print(output) 37 | if process.returncode != 0: 38 | raise RuntimeError('\n'.join(output)) 39 | return output 40 | 41 | 42 | def deploy_azure_resources(): 43 | json_obj = LoadJson() 44 | template_param_file_path = json_obj.get_param_file_path() 45 | template_file_path = json_obj.get_template_file_path() 46 | location = json_obj.get_location() 47 | 48 | az_deploy_cmd = ["az", "deployment", "sub", "create", 49 | "--location", location, 50 | "--template-file", template_file_path, 51 | "--parameters", template_param_file_path, 52 | "--name", ENVIRONMENT, 53 | "--only-show-errors" ] 54 | 55 | 56 | print("Deploying Azure Resources... 
This Make Take A Few Minutes") 57 | run_cmd(az_deploy_cmd) 58 | 59 | 60 | if __name__ == "__main__": 61 | deploy_azure_resources() 62 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_azure_resources.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $ENVIRONMENT 4 | echo "Ingest JSON File" 5 | JSON=$( jq '.' infrastructure/bicep/params/$ENVIRONMENT/bicep.parameters.json) 6 | 7 | TemplateParamFilePath=$( jq -r '.parameters.TemplateParamFilePath.value' <<< "$JSON") 8 | echo "Parm File Path: $TemplateParamFilePath" 9 | 10 | 11 | TemplateFilePath=$( jq -r '.parameters.TemplateFilePath.value' <<< "$JSON") 12 | echo "File Path: $TemplateFilePath" 13 | 14 | location=$( jq -r '.parameters.location.value' <<< "$JSON") 15 | echo "Location: $location" 16 | 17 | 18 | echo "environment variable: $TemplateParamFilePath" 19 | echo "environment variable: $location" 20 | echo "environment variable: $TemplateFilePath" 21 | # Important to define unique deployment names as conflicts will occur 22 | echo "Create Azure DBX Resource Environments...." 23 | 24 | az deployment sub create \ 25 | --location $location \ 26 | --template-file $TemplateFilePath \ 27 | --parameters $TemplateParamFilePath \ 28 | --name "$ENVIRONMENT" \ 29 | --only-show-errors -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_cluster.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from urllib.error import HTTPError 5 | 6 | import requests 7 | 8 | from dbx_utils.common import get_databricks_request_headers 9 | 10 | 11 | def ingest_cluster_param_file(filename): 12 | 13 | """ 14 | loads the json parameter file containing 15 | databricks cluster configs to build 16 | 17 | :param: filename: param file location 18 | :type: str 19 | 20 | :return: cluster_param_file: dbx cluster definitions 21 | :type: str 22 | """ 23 | with open(filename, 'r', encoding="utf-8") as file: 24 | cluster_param_file = json.load(file) 25 | cluster_param_file = cluster_param_file['Clusters'] 26 | 27 | return cluster_param_file 28 | 29 | 30 | def create_clusters(): 31 | """ 32 | Main script which calls sub functions to create 33 | databricks clusters degined in params file 34 | """ 35 | 36 | environment = os.environ.get("ENVIRONMENT") 37 | 38 | cluster_param_file = ingest_cluster_param_file( 39 | 'infrastructure/databricks/databricks_configs/' 40 | + environment + 41 | '/clusters.json' 42 | ) 43 | 44 | existing_clusters, _ = list_existing_clusters() 45 | 46 | existing_clusters_name_arr = get_cluster_names(existing_clusters) 47 | 48 | for cluster in cluster_param_file: 49 | # Cluster Does Not Exist - Deploy 50 | if cluster['cluster_name'] not in existing_clusters_name_arr: 51 | cluster_status, cluster_id = create_cluster(cluster) 52 | print(f"Cluster Status: {cluster_status}") 53 | manage_cluster_state(cluster_id) 54 | else: 55 | print( 56 | f"Cluster {cluster['cluster_name']} already exists - Skipping." 57 | ) 58 | 59 | 60 | def list_existing_clusters(): 61 | """ 62 | Returns a Json object containing a list 63 | of existing Databricks Clusters. 
64 | 65 | :return: response_content: dbx api response with clusters 66 | :type: str 67 | 68 | :return: status_code: api status code 69 | :type: int 70 | """ 71 | 72 | databricks_req_headers = get_databricks_request_headers() 73 | databricks_instance = os.environ.get("DATABRICKS_INSTANCE") 74 | response = requests.get( 75 | 'https://' + databricks_instance + '/api/2.0/clusters/list', 76 | headers=databricks_req_headers, 77 | timeout=10 78 | ) 79 | 80 | if response.ok: 81 | return response.json(), response.status_code 82 | 83 | raise HTTPError( 84 | response.url, code=response.status_code, msg="Failure", 85 | hdrs=response.headers, fp=response 86 | ) 87 | 88 | 89 | def get_cluster_names(existing_clusters): 90 | """ 91 | Parses JSON object with existing databricks clusters 92 | and returns an array with cluster names 93 | 94 | :param: cluster: json object of existing dbx clusters 95 | :type: str 96 | 97 | :return: existing_clusters_name_arr: array of cluster names 98 | :type: array 99 | """ 100 | existing_clusters_name_arr = [] 101 | 102 | if existing_clusters: # If clusters exist 103 | for existing_cluster in existing_clusters['clusters']: 104 | existing_clusters_name_arr.append(existing_cluster['cluster_name']) 105 | return existing_clusters_name_arr 106 | # If No Clusters Exist, Return Empty Array 107 | return existing_clusters_name_arr 108 | 109 | 110 | def create_cluster(cluster): 111 | """ 112 | Takes json definitions for clusters to create, 113 | and invokes the Databricks Cluster API. 114 | 115 | :param: cluster: cluster definition 116 | :type: str 117 | 118 | :return: status code: response status for api call 119 | :type: int 120 | 121 | :return: cluster_id: cluster id for newly created databricks cluster 122 | :type: str 123 | """ 124 | databricks_req_headers = get_databricks_request_headers() 125 | databricks_instance = os.environ.get("DATABRICKS_INSTANCE") 126 | response = requests.post( 127 | 'https://' + databricks_instance + '/api/2.0/clusters/create', 128 | headers=databricks_req_headers, 129 | json=cluster, 130 | timeout=10 131 | ) 132 | 133 | if response.ok: 134 | cluster_id = response.json()["cluster_id"] 135 | return response.status_code, cluster_id 136 | 137 | raise HTTPError( 138 | response.text, 139 | code=response.status_code, 140 | msg="Failure", 141 | hdrs=response.headers, 142 | fp=response 143 | ) 144 | 145 | 146 | def manage_cluster_state(cluster_id): 147 | """ 148 | Function contuninally checks cluster state until 149 | cluster is Running, or Fails to Start 150 | 151 | :param: cluster_id: clusterid for the Databricks Cluster 152 | :type: str 153 | """ 154 | 155 | await_cluster = True 156 | start_time = time.time() 157 | loop_time = 1200 # 20 Minutes 158 | while await_cluster: 159 | current_time = time.time() 160 | elapsed_time = current_time - start_time 161 | if elapsed_time > loop_time: 162 | raise Exception(f'Error: Loop took over {loop_time} seconds to run.') 163 | if get_databricks_cluster_info(cluster_id)['state'] == 'TERMINATED': 164 | print('Starting Terminated Cluster') 165 | raise ValueError("Failed to create cluster, cluster teminated") 166 | if get_databricks_cluster_info(cluster_id)['state'] == 'RESTARTING': 167 | print('Cluster is Restarting') 168 | time.sleep(60) 169 | elif get_databricks_cluster_info(cluster_id)['state'] == 'PENDING': 170 | print('Cluster is Pending Start') 171 | time.sleep(60) 172 | else: 173 | print('Cluster is Running') 174 | await_cluster = False 175 | 176 | 177 | def get_databricks_cluster_info(cluster_id): 178 | """ 179 | 
Returns an object revealing the Databricks Cluster State 180 | "Terminated", "Restarting", or "Pending" 181 | 182 | :param: cluster_id: clusterid for the Databricks Cluster 183 | :type: str 184 | 185 | :return: api response object 186 | :type: str 187 | """ 188 | databricks_req_headers = get_databricks_request_headers() 189 | databricks_instance = os.environ.get("DATABRICKS_INSTANCE") 190 | databricks_cluster_id = {'cluster_id': cluster_id} 191 | 192 | response = requests.get( 193 | 'https://' + databricks_instance + '/api/2.0/clusters/get', 194 | headers=databricks_req_headers, 195 | params=databricks_cluster_id, 196 | timeout=10 197 | ) 198 | 199 | if response.ok: 200 | return response.json() 201 | 202 | raise HTTPError( 203 | response.text, 204 | code=response.status_code, 205 | msg="Failure", 206 | hdrs=response.headers, 207 | fp=response 208 | ) 209 | 210 | 211 | def main(): 212 | """ 213 | Main function to invoke cluster creation 214 | """ 215 | 216 | create_clusters() 217 | 218 | 219 | if __name__ == "__main__": 220 | main() 221 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_databricks_token.sh: -------------------------------------------------------------------------------- 1 | SECRET_NAME="dbkstoken" 2 | # Check if secret exists 3 | 4 | az upgrade 5 | 6 | az account set --subscription $SUBSCRIPTION_ID 7 | 8 | 9 | SECRET_EXISTS=$(az keyvault secret list \ 10 | --vault-name $AZ_KEYVAULT_NAME \ 11 | --query "contains([].id, \ 12 | 'https://$AZ_KEYVAULT_NAME.vault.azure.net/secrets/$SECRET_NAME')") 13 | 14 | echo "secret exists: $SECRET_EXISTS" 15 | 16 | if [ $SECRET_EXISTS == true ]; then 17 | echo "Secret '$SECRET_NAME' exists! fetching..." 18 | DATABRICKS_TOKEN=$(az keyvault secret show \ 19 | --name $SECRET_NAME \ 20 | --vault-name $AZ_KEYVAULT_NAME \ 21 | --query "value" \ 22 | -o tsv ) 23 | 24 | #echo "Secret Value: $DATABRICKS_TOKEN" 25 | 26 | # if [[ $DevOps_Agent == "GitHub" ]]; then 27 | # echo "DATABRICKS_TOKEN=$DATABRICKS_TOKEN" >> $GITHUB_ENV 28 | # else 29 | # echo "##vso[task.setvariable variable="DATABRICKS_TOKEN";isOutput=true;]$DATABRICKS_TOKEN" 30 | # fi 31 | 32 | 33 | else 34 | echo "Secret '$SECRET_NAME' Do Not exist! Creating PAT Token & Store In Key Vault..." 35 | # Must Assign SP Minimum Contributor Permissions. Must also give the SP Key Vault Administrator Privileges (Need to Set these in YAML) 36 | 37 | PAT_TOKEN_RESPONSE=$(curl -X POST \ 38 | -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 39 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 40 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" -d \ 41 | '{ 42 | "lifetime_seconds": "30000000", 43 | "comment": "Token For Databricks" 44 | }' https://$DATABRICKS_INSTANCE/api/2.0/token/create ) 45 | 46 | echo "PAT Token Creation Response...." 47 | #echo $PAT_TOKEN_RESPONSE 48 | 49 | DATABRICKS_PAT_TOKEN=$(jq .token_value -r <<< "$PAT_TOKEN_RESPONSE") 50 | echo "PAT Token Creation...." 51 | #echo $DATABRICKS_PAT_TOKEN 52 | 53 | echo "Store PAT In Key Vault...." 54 | az keyvault secret set \ 55 | --vault-name $AZ_KEYVAULT_NAME \ 56 | --name $SECRET_NAME \ 57 | --value $DATABRICKS_PAT_TOKEN 58 | 59 | #echo "Databricks Token As Environment Variable..." 
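# Descriptive note: the PAT above is minted by POSTing to the Databricks /api/2.0/token/create endpoint,
# authenticating with the AAD bearer token plus the management-endpoint token headers, and is then stored
# in Key Vault under the "dbkstoken" secret so an existing token is reused (not recreated) on later runs.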
60 | 61 | #if [[ $DevOps_Agent == "GitHub" ]]; then 62 | # echo "DATABRICKS_AAD_TOKEN=$DATABRICKS_AAD_TOKEN" >> $GITHUB_ENV 63 | #else 64 | # echo "##vso[task.setvariable variable="DATABRICKS_AAD_TOKEN";isOutput=true;]$DATABRICKS_AAD_TOKEN" 65 | #fi 66 | fi 67 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_key_vault_secrets.sh: -------------------------------------------------------------------------------- 1 | 2 | ###################################################################################################################################################################// 3 | ## Create Key Vault Secrets 4 | ###################################################################################################################################################################// 5 | SECRET_VALUE=$ARM_CLIENT_ID 6 | SECRET_NAME="ARMCLIENTID" 7 | 8 | 9 | 10 | SECRET_EXISTS=$(az keyvault secret list \ 11 | --vault-name $AZ_KEYVAULT_NAME \ 12 | --query "contains([].id, \ 13 | 'https://$AZ_KEYVAULT_NAME.vault.azure.net/secrets/$SECRET_NAME')") 14 | 15 | echo "secret exists: $SECRET_EXISTS" 16 | 17 | if [ $SECRET_EXISTS == true ]; then 18 | echo "Secret '$SECRET_NAME' exists! fetching..." 19 | SECRET_VALUE=$(az keyvault secret show \ 20 | --name $SECRET_NAME \ 21 | --vault-name $AZ_KEYVAULT_NAME \ 22 | --query "value") 23 | 24 | echo "Secret Value: $SECRET_VALUE" 25 | 26 | else 27 | echo "Secret '$SECRET_NAME' Do Not exist! Creating PAT Token & Store In Key Vault..." 28 | # Must Assign SP Minimum Contributor Permissions. Must also give the SP Key Vault Administrator Privileges (Need to Set these in YAML) 29 | echo "Store Secret In Key Vault...." 30 | az keyvault secret set \ 31 | --vault-name $AZ_KEYVAULT_NAME \ 32 | --name $SECRET_NAME \ 33 | --value $SECRET_VALUE 34 | fi 35 | 36 | 37 | ###################################################################################################################################################################// 38 | ## ARM_TENANT 39 | ###################################################################################################################################################################// 40 | 41 | 42 | SECRET_VALUE=$ARM_TENANT_ID 43 | SECRET_NAME="ARMTENANTID" 44 | # Check if secret exists 45 | SECRET_EXISTS=$(az keyvault secret list \ 46 | --vault-name $AZ_KEYVAULT_NAME \ 47 | --query "contains([].id, \ 48 | 'https://$AZ_KEYVAULT_NAME.vault.azure.net/secrets/$SECRET_NAME')") 49 | 50 | echo "secret exists: $SECRET_EXISTS" 51 | 52 | if [ $SECRET_EXISTS == true ]; then 53 | echo "Secret '$SECRET_NAME' exists! fetching..." 54 | SECRET_VALUE=$(az keyvault secret show \ 55 | --name $SECRET_NAME \ 56 | --vault-name $AZ_KEYVAULT_NAME \ 57 | --query "value") 58 | 59 | echo "Secret Value: $SECRET_VALUE" 60 | 61 | else 62 | echo "Secret '$SECRET_NAME' Do Not exist! Creating PAT Token & Store In Key Vault..." 63 | # Must Assign SP Minimum Contributor Permissions. Must also give the SP Key Vault Administrator Privileges (Need to Set these in YAML) 64 | echo "Store Secret In Key Vault...." 
65 | az keyvault secret set \ 66 | --vault-name $AZ_KEYVAULT_NAME \ 67 | --name $SECRET_NAME \ 68 | --value $SECRET_VALUE 69 | fi 70 | 71 | 72 | ###################################################################################################################################################################// 73 | ## ARM_Client_Secret 74 | ###################################################################################################################################################################// 75 | 76 | 77 | SECRET_VALUE=$ARM_CLIENT_SECRET 78 | SECRET_NAME="ARMCLIENTSECRET" 79 | # Check if secret exists 80 | SECRET_EXISTS=$(az keyvault secret list \ 81 | --vault-name $AZ_KEYVAULT_NAME \ 82 | --query "contains([].id, \ 83 | 'https://$AZ_KEYVAULT_NAME.vault.azure.net/secrets/$SECRET_NAME')") 84 | 85 | echo "secret exists: $SECRET_EXISTS" 86 | 87 | if [ $SECRET_EXISTS == true ]; then 88 | echo "Secret '$SECRET_NAME' exists! fetching..." 89 | SECRET_VALUE=$(az keyvault secret show \ 90 | --name $SECRET_NAME \ 91 | --vault-name $AZ_KEYVAULT_NAME \ 92 | --query "value") 93 | 94 | echo "Secret Value: $SECRET_VALUE" 95 | 96 | else 97 | echo "Secret '$SECRET_NAME' Do Not exist! Creating PAT Token & Store In Key Vault..." 98 | # Must Assign SP Minimum Contributor Permissions. Must also give the SP Key Vault Administrator Privileges (Need to Set these in YAML) 99 | echo "Store Secret In Key Vault...." 100 | az keyvault secret set \ 101 | --vault-name $AZ_KEYVAULT_NAME \ 102 | --name $SECRET_NAME \ 103 | --value $SECRET_VALUE 104 | fi -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_repo_folder.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import os 4 | import json 5 | from dotenv import load_dotenv 6 | 7 | 8 | load_dotenv(".env") # load environment variables 9 | 10 | def _ingest_repo_param_file(filename): 11 | """ 12 | Ingests the Json Parameters File for Databricks Repo Creation 13 | """ 14 | with open(filename, 'r') as file: 15 | 16 | repo_param_file = json.load(file)['Repo_Configuration'] 17 | 18 | return repo_param_file 19 | 20 | def create_databricks_repos(postjson): 21 | """ 22 | Takes Json object for cluster creation, and invokes the Databricks API. 
23 | """ 24 | 25 | ARM_CLIENT_ID = os.environ.get("ARM_CLIENT_ID") 26 | WORKSPACE_ID = os.environ.get("WORKSPACE_ID") 27 | DATABRICKS_INSTANCE = os.environ.get("DATABRICKS_INSTANCE") 28 | DATABRICKS_AAD_TOKEN = os.environ.get("DATABRICKS_AAD_TOKEN") 29 | DATABRICKS_MANAGEMENT_TOKEN = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 30 | 31 | 32 | DBRKS_REQ_HEADERS = { 33 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 34 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 35 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 36 | 'Content-Type': 'application/json' 37 | } 38 | 39 | path = postjson['path'] 40 | #import pdb; pdb.set_trace() 41 | 42 | newData = { 43 | "path": "/Repos/"+ ARM_CLIENT_ID + "/" + path 44 | } 45 | 46 | postjson.update(newData) 47 | 48 | print("Updated Repo Json String") 49 | print(postjson) 50 | 51 | response = requests.post( 52 | 'https://' + DATABRICKS_INSTANCE + '/api/2.0/repos', headers=DBRKS_REQ_HEADERS, json=postjson 53 | ) 54 | 55 | #400: Already Exists 56 | print(f"Response: {response.content}") 57 | 58 | if response.status_code == 200 or response.status_code == 400: 59 | print(f"Status Code: {response.status_code}") 60 | else: 61 | raise Exception(response.status_code) 62 | 63 | 64 | def main(): 65 | 66 | ENVIRONMENT = os.environ.get("ENVIRONMENT") 67 | 68 | file_name = 'infrastructure/databricks/databricks_configs/' + ENVIRONMENT + '/repos.json' 69 | repo_param_file = _ingest_repo_param_file(file_name) 70 | 71 | # Extract array from Json object 72 | 73 | print(f"Repos To Connect {repo_param_file}") 74 | 75 | for repo in repo_param_file: 76 | print(f"Repo {repo}") 77 | create_databricks_repos(repo) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_repo_folder.sh: -------------------------------------------------------------------------------- 1 | echo "Ingest JSON File" 2 | JSON=$( jq '.' infrastructure/databricks/databricks_configs/$ENVIRONMENT/repos.json) 3 | 4 | #echo "${JSON}" | jq 5 | 6 | 7 | echo "User Folders In Databricks Repos Will Be Described Using An Email Address... 
e.g Ciaranh@Microsoft.com " 8 | echo "The DevOps Agent SP Which Is Also A User, However Its Databricks Repo User Folder is Named After The AppID: $ARM_CLIENT_ID" 9 | echo "All Folders Defined In The JSON Parameters Folder Will Be Appended To /Repos//" 10 | 11 | for row in $(echo "${JSON}" | jq -r '.Repo_Configuration[] | @base64'); do 12 | _jq() { 13 | echo ${row} | base64 --decode | jq -r ${1} 14 | } 15 | 16 | JSON_STRING=$( jq -n -c \ 17 | --arg url "$(_jq '.url')" \ 18 | --arg pr "$(_jq '.provider')" \ 19 | --arg pa "/Repos/$ARM_CLIENT_ID/$(_jq '.path')" \ 20 | '{url: $url, 21 | provider: $pr, 22 | path: $pa}' ) 23 | 24 | #echo "JSON -D String " 25 | #echo $JSON_STRING 26 | 27 | echo $DATABRICKS_AAD_TOKEN 28 | echo $DATABRICKS_MANAGEMENT_TOKEN 29 | 30 | CREATE_REPO_RESPONSE=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 31 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 32 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 33 | -H 'Content-Type: application/json' \ 34 | -d $JSON_STRING \ 35 | https://$DATABRICKS_INSTANCE/api/2.0/repos ) 36 | 37 | echo "Repo Response" 38 | echo $CREATE_REPO_RESPONSE 39 | done -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_role_based_access.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | #echo "Resource Group Name: $RESOURCE_GROUP_NAME" 5 | echo "ENVIRONMENT: $ENVIRONMENT" 6 | RESOURCE_GROUP_ID=$( az group show -n $RESOURCE_GROUP_NAME --query id -o tsv ) 7 | 8 | echo "Ingest JSON File" 9 | JSON=$( jq '.' infrastructure/databricks/databricks_configs/$ENVIRONMENT/rbac.json) 10 | #echo "${JSON}" | jq 11 | 12 | for row in $(echo "${JSON}" | jq -r '.RBAC_Assignments[] | @base64'); do 13 | _jq() { 14 | echo ${row} | base64 --decode | jq -r ${1} 15 | } 16 | ROLES_ARRAY="$(_jq '.roles')" 17 | #echo $ROLES_ARRAY 18 | 19 | # Before: [ "Contributor", "DBX_Custom_Role", "Key Vault Administrator" ] 20 | # xargs trims whitespace on either side. -n removes newline characters. 
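# Illustrative (assumed) shape of an entry in rbac.json's RBAC_Assignments array, inferred from the jq
# lookups used in this loop -- the field values below are placeholders, not real IDs:
#   { "roles": ["Contributor"], "roleBeneficiaryObjID": "<AAD object id>", "principalType": "ServicePrincipal", "scope": "<optional resource scope>" }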
21 | ROLES_ARRAY_PARSED=$( echo $ROLES_ARRAY | jq -r | tr -d "[]" | tr -d \'\" | xargs echo -n ) 22 | # After: Contributor, DBX_Custom_Role, Key Vault Administrator 23 | #echo $ROLES_ARRAY_PARSED 24 | Field_Separator=$IFS 25 | IFS=, 26 | for ROLE in $ROLES_ARRAY_PARSED; do 27 | ROLE=$( echo $ROLE | xargs ) 28 | 29 | az role assignment create \ 30 | --role "$ROLE" \ 31 | --assignee-object-id $(_jq '.roleBeneficiaryObjID') \ 32 | --assignee-principal-type "$(_jq '.principalType')" \ 33 | --scope "$RESOURCE_GROUP_ID" \ 34 | -o none 35 | #--scope "$(_jq '.scope')" 36 | 37 | done 38 | IFS=$Field_Separator 39 | done -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_secret_scopes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import requests 4 | 5 | __here__ = os.path.dirname(__file__) 6 | 7 | RESOURCE_GROUP_NAME = os.environ['RESOURCE_GROUP_NAME'] 8 | DATABRICKS_INSTANCE = os.environ['DATABRICKS_INSTANCE'] 9 | WORKSPACE_ID = os.environ['WORKSPACE_ID'] 10 | SUBSCRIPTION_ID = os.environ['SUBSCRIPTION_ID'] 11 | DATABRICKS_AAD_TOKEN = os.environ['DATABRICKS_AAD_TOKEN'] 12 | DATABRICKS_MANAGEMENT_TOKEN = os.environ['DATABRICKS_MANAGEMENT_TOKEN'] 13 | ARM_CLIENT_ID = os.environ['ARM_CLIENT_ID'] 14 | ARM_CLIENT_SECRET = os.environ['ARM_CLIENT_SECRET'] 15 | ARM_TENANT_ID = os.environ['ARM_TENANT_ID'] 16 | AML_WS_NAME = os.environ['AML_WS_NAME'] 17 | 18 | 19 | DBRKS_REQ_HEADERS = { 20 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 21 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 22 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 23 | 'Content-Type': 'application/json' 24 | } 25 | 26 | 27 | 28 | def run_cmd(cmd): 29 | #May Need To Rmove shell=True 30 | process = subprocess.run(cmd, stdout=subprocess.PIPE) 31 | output = process.stdout.decode().split('\n') 32 | #print(output) 33 | output = [line.strip('\n').strip('\r') for line in output] 34 | 35 | 36 | #print(f"Return Code: {process.returncode}") 37 | if process.returncode != 0: 38 | raise RuntimeError('\n'.join(output)) 39 | return output 40 | 41 | def get_app_insight_name(): 42 | cmd = ["az", "resource", "list", "-g", RESOURCE_GROUP_NAME, "--resource-type", "microsoft.insights/components", "--query", "[].name", "-o", "tsv"] 43 | name = run_cmd(cmd) 44 | return name 45 | 46 | 47 | def get_app_insight_key(name): 48 | cmd = ["az", "monitor", "app-insights", "component", "show", "-g", RESOURCE_GROUP_NAME, "-a", name, "--query", "connectionString", "-o", "tsv"] 49 | key = run_cmd(cmd) 50 | return key 51 | 52 | 53 | def create_secret_scopes(scope_name=str, initial_manage_principal=str): 54 | """ 55 | Takes Json object for cluster creation, and invokes the Databricks API. 56 | """ 57 | postjson = { 58 | "scope": scope_name, 59 | "initial_manage_principal": initial_manage_principal 60 | } 61 | 62 | response = requests.post( 63 | 'https://' + DATABRICKS_INSTANCE + '/api/2.0/secrets/scopes/create', headers=DBRKS_REQ_HEADERS, json=postjson 64 | ) 65 | 66 | #print(response.status_code) 67 | #if response.status_code != 200: 68 | # raise Exception(response.text) 69 | 70 | #print(response.json()) 71 | 72 | def insert_secret(secret_value=str, scope_name=str, key=str): 73 | """ 74 | Takes Json object for cluster creation, and invokes the Databricks API. 
75 |     """ 76 |     postjson = { 77 |         "scope": scope_name, 78 |         "key": key, 79 |         "string_value": secret_value 80 |     } 81 | 82 |     response = requests.post( 83 |         'https://' + DATABRICKS_INSTANCE + '/api/2.0/secrets/put', headers=DBRKS_REQ_HEADERS, json=postjson 84 |     ) 85 |     #print(response.status_code) 86 |     if response.status_code != 200: 87 |         raise Exception(response.text) 88 | 89 |     #print(response.json()) 90 | 91 | 92 | if __name__ == '__main__': 93 |     app_insight_name = get_app_insight_name()[0] 94 |     #print(app_insight_name) 95 |     app_insight_key = get_app_insight_key(app_insight_name)[0] 96 |     #print(app_insight_key) 97 | 98 | 99 |     # Create Secret Scopes 100 |     create_secret_scopes(scope_name="DBX_SP_Credentials", initial_manage_principal="users") 101 |     create_secret_scopes(scope_name="AzureResourceSecrets", initial_manage_principal="users") 102 | 103 |     # Insert Secrets into Secret Scope "DBX_SP_Credentials" 104 |     insert_secret(secret_value=ARM_CLIENT_ID, scope_name="DBX_SP_Credentials", key="DBX_SP_Client_ID") 105 |     insert_secret(secret_value=ARM_CLIENT_SECRET, scope_name="DBX_SP_Credentials", key="DBX_SP_Client_Secret") 106 |     insert_secret(secret_value=ARM_TENANT_ID, scope_name="DBX_SP_Credentials", key="DBX_SP_Tenant_ID") 107 |     insert_secret(secret_value=SUBSCRIPTION_ID, scope_name="DBX_SP_Credentials", key="SUBSCRIPTION_ID") 108 | 109 |     # Insert Secrets into Secret Scope "AzureResourceSecrets" 110 |     insert_secret(secret_value=app_insight_key, scope_name="AzureResourceSecrets", key="AppInsightsKey") 111 |     insert_secret(secret_value=RESOURCE_GROUP_NAME, scope_name="AzureResourceSecrets", key="RESOURCE_GROUP_NAME") 112 |     insert_secret(secret_value=AML_WS_NAME, scope_name="AzureResourceSecrets", key="AML_WS_NAME") 113 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_create_secret_scopes.sh: -------------------------------------------------------------------------------- 1 | az config set extension.use_dynamic_install=yes_without_prompt 2 | az extension add --name application-insights 3 | 4 | echo $RESOURCE_GROUP_NAME 5 | echo $DATABRICKS_INSTANCE 6 | echo $WORKSPACE_ID 7 | echo $SUBSCRIPTION_ID 8 | 9 | APP_INSIGHT_NAME=$(az resource list \ 10 |     -g $RESOURCE_GROUP_NAME \ 11 |     --resource-type 'microsoft.insights/components' \ 12 |     --query [].name \ 13 |     -o tsv ) 14 | 15 | APP_INSIGHT_INSTRUMENT_KEY=$( az monitor app-insights component show \ 16 |     -g $RESOURCE_GROUP_NAME \ 17 |     -a $APP_INSIGHT_NAME \ 18 |     --query connectionString ) 19 | 20 | echo "Test" 21 | 22 | echo $APP_INSIGHT_NAME 23 | echo $APP_INSIGHT_INSTRUMENT_KEY 24 | echo $SUBSCRIPTION_ID 25 | 26 | echo "Creating Secret Scopes...." 27 | 28 | echo "Create DBX_SP_Credentials Scope...." 29 | 30 | Create_Secret_Scope=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 31 |     -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 32 |     -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 33 |     -H 'Content-Type: application/json' -d \ 34 |     '{ 35 |         "scope": "DBX_SP_Credentials", 36 |         "initial_manage_principal": "users" 37 |     }' https://$DATABRICKS_INSTANCE/api/2.0/secrets/scopes/create ) 38 | 39 | echo "Inserting Service Principal + Other Secrets Into Scope....
" 40 | 41 | 42 | 43 | 44 | JSON_STRING=$( jq -n -c \ 45 | --arg scope "DBX_SP_Credentials" \ 46 | --arg key "DBX_SP_Client_Secret" \ 47 | --arg value "$ARM_CLIENT_SECRET" \ 48 | '{ 49 | scope: $scope, 50 | key: $key, 51 | string_value: $value 52 | }' ) 53 | 54 | echo $JSON_STRING 55 | 56 | Create_DBX_Client_Secret=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 57 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 58 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 59 | -H 'Content-Type: application/json' \ 60 | -d $JSON_STRING \ 61 | https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 62 | 63 | 64 | 65 | JSON_STRING=$( jq -n -c \ 66 | --arg scope "DBX_SP_Credentials" \ 67 | --arg key "DBX_SP_ClientID" \ 68 | --arg value "$ARM_CLIENT_ID" \ 69 | '{ 70 | scope: $scope, 71 | key: $key, 72 | string_value: $value 73 | }' ) 74 | echo $JSON_STRING 75 | 76 | Create_DBX_ClientID_Secret=$(curl -X POST \ 77 | -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 78 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 79 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 80 | -H 'Content-Type: application/json' \ 81 | -d $JSON_STRING \ 82 | https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 83 | 84 | 85 | 86 | JSON_STRING=$( jq -n -c --arg scope "DBX_SP_Credentials" --arg key "DBX_SP_TenantID" --arg value "$ARM_TENANT_ID" \ 87 | '{ 88 | scope: $scope, 89 | key: $key, 90 | string_value: $value 91 | }' ) 92 | 93 | echo $JSON_STRING 94 | 95 | Create_DBX_TenantID_Secret=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 96 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 97 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 98 | -H 'Content-Type: application/json' \ 99 | -d $JSON_STRING \ 100 | https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 101 | 102 | 103 | 104 | JSON_STRING=$( jq -n -c --arg scope "DBX_SP_Credentials" --arg key "SUBSCRIPTION_ID" --arg value "$SUBSCRIPTION_ID" \ 105 | '{ 106 | scope: $scope, 107 | key: $key, 108 | string_value: $value 109 | }' ) 110 | 111 | echo $JSON_STRING 112 | 113 | CREATE_SUBSCRIPTIONID=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 114 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 115 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 116 | -H 'Content-Type: application/json' \ 117 | -d $JSON_STRING \ 118 | https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 119 | 120 | 121 | 122 | 123 | 124 | echo "Create Azure Resources Secrets Scope...." 125 | 126 | Create_Secret_Scope=$(curl -X POST -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 127 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 128 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 129 | -H 'Content-Type: application/json' -d \ 130 | '{ 131 | "scope": "AzureResourceSecrets", 132 | "initial_manage_principal": "users" 133 | }' https://$DATABRICKS_INSTANCE/api/2.0/secrets/scopes/create ) 134 | 135 | #There can be encoding problems passing some variables directly into the api request. 
Use JSON_STRING below with jq to solve this issue 136 | JSON_STRING=$( jq -n -c --arg scope "AzureResourceSecrets" --arg key "appi_ik" --arg value "$APP_INSIGHT_INSTRUMENT_KEY" \ 137 |     '{ 138 |         scope: $scope, 139 |         key: $key, 140 |         string_value: $value 141 |     }' ) 142 | 143 | Create_APP_INSIGHT_INSTRUMENT_KEY_Secret=$(curl -X POST \ 144 |     -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 145 |     -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 146 |     -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 147 |     -H 'Content-Type: application/json' \ 148 |     -d $JSON_STRING \ 149 |     https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 150 | 151 | 152 | JSON_STRING=$( jq -n -c --arg scope "AzureResourceSecrets" --arg key "RESOURCE_GROUP_NAME" --arg value "$RESOURCE_GROUP_NAME" \ 153 |     '{ 154 |         scope: $scope, 155 |         key: $key, 156 |         string_value: $value 157 |     }' ) 158 | 159 | CREATE_RESOURCE_GROUP_NAME_SECRET=$(curl -X POST \ 160 |     -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 161 |     -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 162 |     -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 163 |     -H 'Content-Type: application/json' \ 164 |     -d $JSON_STRING \ 165 |     https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 166 | 167 | 168 | JSON_STRING=$( jq -n -c --arg scope "AzureResourceSecrets" --arg key "AML_WS_NAME" --arg value "$AML_WS_NAME" \ 169 |     '{ 170 |         scope: $scope, 171 |         key: $key, 172 |         string_value: $value 173 |     }' ) 174 | 175 | CREATE_AML_WS_NAME_SECRET=$(curl -X POST \ 176 |     -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 177 |     -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 178 |     -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 179 |     -H 'Content-Type: application/json' \ 180 |     -d $JSON_STRING \ 181 |     https://$DATABRICKS_INSTANCE/api/2.0/secrets/put ) 182 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_git_configuration.py: -------------------------------------------------------------------------------- 1 | 2 | # If You Want To Run A Job Which Is Linked To A Git Repo, The Service Principal Will Run The Job As It Will Be The Owner... 3 | # ... The Service Principal, Without Receiving Git Authentication, Will Not Be Able To Access The Repo Files Which... 4 | # ... The Job Needs.
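# Illustrative (assumed) shape of a Git_Configuration entry in repos.json; the script below appends the
# "personal_access_token" field before calling the Databricks /api/2.0/git-credentials endpoint:
#   {"git_provider": "gitHub", "git_username": "<github-user>"}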
5 | 6 | 7 | import requests 8 | import os 9 | import json 10 | 11 | 12 | def configureGit(gitConfig, workspaceId, databricksInstance, bearerToken, managementToken, githubToken, environment): 13 | 14 | DBRKS_REQ_HEADERS = { 15 | 'Authorization': f'Bearer {bearerToken}', 16 | 'X-Databricks-Azure-SP-Management-Token': f'{managementToken}', 17 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{workspaceId}', 18 | 'Content-Type': 'application/json' 19 | } 20 | 21 | newData = { 22 | "personal_access_token": githubToken 23 | } 24 | 25 | gitConfig.update(newData) 26 | print(gitConfig) 27 | print(DBRKS_REQ_HEADERS) 28 | 29 | response = requests.post('https://' + databricksInstance + '/api/2.0/git-credentials', headers=DBRKS_REQ_HEADERS, json=gitConfig) 30 | print(response) 31 | print(response.json()) 32 | 33 | if response.status_code != 200: 34 | 35 | response = requests.get('https://' + databricksInstance + '/api/2.0/git-credentials', headers=DBRKS_REQ_HEADERS) 36 | print(response.json()) 37 | credential = response.json()["credentials"][0]["credential_id"] 38 | print(f"Credential is {credential}") 39 | response = requests.patch('https://' + databricksInstance + '/api/2.0/git-credentials/'+ str(credential), headers=DBRKS_REQ_HEADERS, json=gitConfig) 40 | 41 | print(response.json()) 42 | 43 | if __name__ == "__main__": 44 | 45 | with open('infrastructure/databricks/databricks_configs/' + os.environ['ENVIRONMENT'] +'/repos.json', 'r') as f: 46 | json = json.load(f) 47 | 48 | gitConfigs = json['Git_Configuration'] 49 | #print(gitConfigs) 50 | 51 | #print(os.environ['WORKSPACE_ID']) 52 | #print(os.environ['DATABRICKS_INSTANCE']) 53 | #print(os.environ['DATABRICKS_AAD_TOKEN']) 54 | #print(os.environ['DATABRICKS_MANAGEMENT_TOKEN']) 55 | #print(os.environ['PAT_GITHUB']) 56 | #print(os.environ['ENVIRONMENT']) 57 | for gitConfig in gitConfigs: 58 | response = configureGit( 59 | gitConfig=gitConfig, 60 | workspaceId=os.environ['WORKSPACE_ID'], 61 | databricksInstance=os.environ['DATABRICKS_INSTANCE'], 62 | bearerToken=os.environ['DATABRICKS_AAD_TOKEN'], 63 | managementToken=os.environ['DATABRICKS_MANAGEMENT_TOKEN'], 64 | githubToken=os.environ['PAT_GITHUB'], 65 | environment=os.environ['ENVIRONMENT']) -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_repo_pull.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import json 4 | from dotenv import load_dotenv 5 | 6 | 7 | load_dotenv(".env") # load environment variables 8 | 9 | 10 | def _ingest_repo_param_file(filename): 11 | """ 12 | Ingests the Json Parameters File for Repo Pull 13 | """ 14 | with open(filename, 'r') as file: 15 | 16 | repo_param_file = json.load(file)['Repo_Configuration'] 17 | 18 | return repo_param_file 19 | 20 | 21 | def get_repos_with_management_permissions(): 22 | """ 23 | Invokes Databricks API to get all repos with management permissions 24 | """ 25 | 26 | WORKSPACE_ID = os.environ.get("WORKSPACE_ID") 27 | DATABRICKS_INSTANCE = os.environ.get("DATABRICKS_INSTANCE") 28 | DATABRICKS_AAD_TOKEN = os.environ.get("DATABRICKS_AAD_TOKEN") 29 | DATABRICKS_MANAGEMENT_TOKEN = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 30 | 31 | 32 | DBRKS_REQ_HEADERS = { 33 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 34 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 35 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 36 | 'Content-Type': 'application/json' 37 | } 38 | 39 | response 
= requests.get( 40 | 'https://' + DATABRICKS_INSTANCE + '/api/2.0/repos', headers=DBRKS_REQ_HEADERS 41 | ) 42 | 43 | status_code = response.status_code 44 | repos_with_management_permissions = response.json() 45 | 46 | if response.status_code != 200: 47 | raise Exception(response.status_code) 48 | else: 49 | repos_with_management_permissions = repos_with_management_permissions['repos'] 50 | return repos_with_management_permissions, status_code 51 | 52 | 53 | def update_repo(repo_id, update_branch): 54 | """ 55 | Invoked Databricks API to update repo 56 | """ 57 | 58 | repo_id = str(repo_id) 59 | 60 | WORKSPACE_ID = os.environ.get("WORKSPACE_ID") 61 | DATABRICKS_INSTANCE = os.environ.get("DATABRICKS_INSTANCE") 62 | DATABRICKS_AAD_TOKEN = os.environ.get("DATABRICKS_AAD_TOKEN") 63 | DATABRICKS_MANAGEMENT_TOKEN = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 64 | DATABRICKS_MANAGEMENT_TOKEN = os.environ.get("DATABRICKS_MANAGEMENT_TOKEN") 65 | 66 | DBRKS_REQ_HEADERS = { 67 | 'Authorization': f'Bearer {DATABRICKS_AAD_TOKEN}', 68 | 'X-Databricks-Azure-SP-Management-Token': f'{DATABRICKS_MANAGEMENT_TOKEN}', 69 | 'X-Databricks-Azure-Workspace-Resource-Id': f'{WORKSPACE_ID}', 70 | 'Content-Type': 'application/json' 71 | } 72 | 73 | postjson = { 74 | "branch": str(update_branch) 75 | } 76 | 77 | print("Updated Repo Json String") 78 | print(postjson) 79 | 80 | response = requests.patch( 81 | 'https://' + DATABRICKS_INSTANCE + '/api/2.0/repos/'+ repo_id, headers=DBRKS_REQ_HEADERS, json=postjson 82 | ) 83 | 84 | if response.status_code != 200: 85 | raise Exception(response.content) 86 | else: 87 | #print(f"Status Code: {response.status_code}") 88 | #print(response.json()) 89 | return response.status_code 90 | 91 | 92 | def main(): 93 | 94 | ENVIRONMENT = os.environ.get("ENVIRONMENT") 95 | 96 | file_name = 'infrastructure/databricks/databricks_configs/' + ENVIRONMENT + '/repos.json' 97 | repo_param_file = _ingest_repo_param_file(file_name) 98 | 99 | print(f"Repos To Connect {repo_param_file}") 100 | 101 | repos_with_management_permissions, status_code = get_repos_with_management_permissions() 102 | 103 | for repo in repo_param_file: 104 | 105 | 106 | update_folder = repo['path'] 107 | update_branch = repo['branch'] 108 | 109 | for item in repos_with_management_permissions: 110 | print(f" The Update Folder is {update_folder} and path is {item['path']}") 111 | 112 | if update_folder in item['path']: 113 | print(f" The Update Folder {update_folder} is Contained within the Path {item['path']}") 114 | print("Retrieve the Repo ID") 115 | 116 | repo_id = str(item['id']) 117 | 118 | #Update repo 119 | #import pdb; pdb.set_trace() 120 | status_code = update_repo(repo_id, update_branch) 121 | 122 | return status_code 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_repo_pull.sh: -------------------------------------------------------------------------------- 1 | REPOS_WITH_MANAGEMENT_PERMISSIONS=$(curl -X GET \ 2 | -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 3 | -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 4 | -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 5 | -H 'Content-Type: application/json' \ 6 | https://$DATABRICKS_INSTANCE/api/2.0/repos ) 7 | 8 | 9 | echo "Ingest JSON File" 10 | JSON=$( jq '.' 
infrastructure/databricks/databricks_configs/$ENVIRONMENT/repos.json) 11 | for row in $(echo "${JSON}" | jq -r '.Repo_Configuration[] | @base64'); do 12 |     _jq() { 13 |         echo ${row} | base64 --decode | jq -r ${1} 14 |     } 15 | 16 |     echo "PULL_BRANCH: $PULL_BRANCH" 17 |     UPDATE_FOLDER=$(_jq '.path') 18 |     echo "UPDATE FOLDER: $UPDATE_FOLDER" 19 | 20 |     if [ -z "$PULL_BRANCH" ]; 21 |     then 22 |         PULL_BRANCH=$DBX_REPO_BRANCH 23 |         echo "Use Release Branch: $PULL_BRANCH" 24 |     fi 25 | 26 |     echo "Display Repos In DBX With Manage Permissions...." 27 |     echo $REPOS_WITH_MANAGEMENT_PERMISSIONS 28 | 29 |     echo "Retrieve Repo ID For ..." 30 |     REPO_ID=$( jq -r --arg UPDATE_FOLDER "$UPDATE_FOLDER" ' .repos[] | select( .path | contains($UPDATE_FOLDER)) | .id ' <<< "$REPOS_WITH_MANAGEMENT_PERMISSIONS") 31 | 32 |     echo "Repo ID: $REPO_ID" 33 | 34 |     echo "Git Pull on DBX Repo $UPDATE_FOLDER With $PULL_BRANCH Branch " 35 | 36 |     JSON_STRING=$( jq -n -c --arg tb "$PULL_BRANCH" \ 37 |         '{branch: $tb}' ) 38 | 39 | 40 |     GIT_PULL_RESPONSE=$(curl -X PATCH \ 41 |         -H "Authorization: Bearer $DATABRICKS_AAD_TOKEN" \ 42 |         -H "X-Databricks-Azure-SP-Management-Token: $DATABRICKS_MANAGEMENT_TOKEN" \ 43 |         -H "X-Databricks-Azure-Workspace-Resource-Id: $WORKSPACE_ID" \ 44 |         -H 'Content-Type: application/json' \ 45 |         -d $JSON_STRING \ 46 |         https://$DATABRICKS_INSTANCE/api/2.0/repos/$REPO_ID ) 47 | 48 |     echo "Git Pull Response..." 49 |     echo $GIT_PULL_RESPONSE 50 | done 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/pkg/dbx_utils/utils_set_env_vars.sh: -------------------------------------------------------------------------------- 1 | 2 | ### Let's Retrieve Important Variables For Later Steps 3 | 4 | echo $ENVIRONMENT 5 | 6 | echo "Ingest JSON File" 7 | JSON=$( jq '.' infrastructure/bicep/params/$ENVIRONMENT/bicep.parameters.json) 8 | 9 | 10 | RESOURCE_GROUP_NAME=$( jq -r '.parameters.resourceGroupName.value' <<< "$JSON") 11 | echo "Resource Group Name: $RESOURCE_GROUP_NAME" 12 | 13 | DATABRICKS_WS_NAME=$( az databricks workspace list -g $RESOURCE_GROUP_NAME --query [].name -o tsv ) 14 | AML_WS_NAME=$(az ml workspace list -g $RESOURCE_GROUP_NAME --query [].workspaceName -o tsv) 15 | DATABRICKS_ORDGID=$(az databricks workspace list -g $RESOURCE_GROUP_NAME --query "[].workspaceId" -o tsv) 16 | DATABRICKS_INSTANCE="$(az databricks workspace list -g $RESOURCE_GROUP_NAME --query "[].workspaceUrl" -o tsv)" 17 | WORKSPACE_ID=$(az databricks workspace list -g $RESOURCE_GROUP_NAME --query "[].id" -o tsv) 18 | AZ_KEYVAULT_NAME=$(az keyvault list -g $RESOURCE_GROUP_NAME --query "[].name" -o tsv) 19 | SUBSCRIPTION_ID=$( az account show --query id -o tsv ) 20 | 21 | echo $SUBSCRIPTION_ID 22 | echo $DATABRICKS_ORDGID 23 | echo $WORKSPACE_ID 24 | echo $AZ_KEYVAULT_NAME 25 | echo $SUBSCRIPTION_ID 26 | echo $AML_WS_NAME 27 | echo $DATABRICKS_WS_NAME 28 | #DATABRICKS_TOKEN=$(az keyvault secret show --name "dbkstoken" --vault-name $AZ_KEYVAULT_NAME --query "value" -o tsv) 29 | 30 | 31 | if [[ $DevOps_Agent == "GitHub" ]]; then 32 |     # Creation Of Important Environment Variables For Later Steps. 33 |     echo "Set Environment Variables For Later Stages..." 34 | 35 |     echo "Set Environment Name As Environment Variable..." 36 |     echo "ENVIRONMENT=$ENVIRONMENT" >> $GITHUB_ENV 37 | 38 |     echo "Set Resource Group Name As Environment Variable..."
39 | echo "RESOURCE_GROUP_NAME=$RESOURCE_GROUP_NAME" >> $GITHUB_ENV 40 | 41 | echo "Set Key Vault Name As Environment Variable..." 42 | echo "AZ_KEYVAULT_NAME=$AZ_KEYVAULT_NAME" >> $GITHUB_ENV 43 | 44 | echo "Set Databricks OrgID As Environment Variable..." 45 | echo "DATABRICKS_ORDGID=$DATABRICKS_ORDGID" >> $GITHUB_ENV 46 | 47 | echo "Set Workspace ID As Environment Variable..." 48 | echo "WORKSPACE_ID=$WORKSPACE_ID" >> $GITHUB_ENV 49 | 50 | echo "Set Datbricks Instance As Environment Variable..." 51 | echo "DATABRICKS_INSTANCE=$DATABRICKS_INSTANCE" >> $GITHUB_ENV 52 | 53 | echo "Set Databricks Host As Environment Variable..." 54 | echo "DATABRICKS_HOST=https://$DATABRICKS_INSTANCE" >> $GITHUB_ENV 55 | 56 | #echo "Set Databricks Token ID As Environment Variable..." 57 | #echo "DATABRICKS_TOKEN=$DATABRICKS_TOKEN" >> $GITHUB_ENV 58 | 59 | echo "Set SUBSCRIPTION_ID As Environment Variable..." 60 | echo "SUBSCRIPTION_ID=$SUBSCRIPTION_ID" >> $GITHUB_ENV 61 | 62 | echo "Set AML_WS_NAME As Environment Variable..." 63 | echo "AML_WS_NAME=$AML_WS_NAME" >> $GITHUB_ENV 64 | 65 | echo "Set DATABRICKS_WS_NAME As Environment Variable..." 66 | echo "DATABRICKS_WS_NAME=$DATABRICKS_WS_NAME" >> $GITHUB_ENV 67 | 68 | else 69 | 70 | # Creation Of Important Environment Variables For Later Steps. 71 | echo "Set Environment Variables For Later Stages..." 72 | 73 | 74 | echo "ENVIRONMENT Name As Environment Variable..." 75 | echo "##vso[task.setvariable variable="ENVIRONMENT";isOutput=true;]$ENVIRONMENT" 76 | 77 | 78 | echo "Resource Group Name As Environment Variable..." 79 | echo "##vso[task.setvariable variable="RESOURCE_GROUP_NAME";isOutput=true;]$RESOURCE_GROUP_NAME" 80 | 81 | echo "Set Key Vault Name As Environment Variable..." 82 | echo "##vso[task.setvariable variable="AZ_KEYVAULT_NAME";isOutput=true;]$AZ_KEYVAULT_NAME" 83 | 84 | echo "Set Databricks OrgID As Environment Variable..." 85 | echo "##vso[task.setvariable variable="DATABRICKS_ORDGID";isOutput=true;]$DATABRICKS_ORDGID" 86 | 87 | echo "Set Workspace ID As Environment Variable..." 88 | echo "##vso[task.setvariable variable="WORKSPACE_ID";isOutput=true;]$WORKSPACE_ID" 89 | 90 | 91 | echo "Set Datbricks Instance As Environment Variable..." 92 | echo "##vso[task.setvariable variable="DATABRICKS_INSTANCE";isOutput=true;]$DATABRICKS_INSTANCE" 93 | 94 | echo "Set Databricks Host As Environment Variable..." 95 | echo "##vso[task.setvariable variable="DATABRICKS_HOST";isOutput=true;]https://$DATABRICKS_INSTANCE" 96 | 97 | echo "Set Databricks Host As Environment Variable..." 98 | echo "##vso[task.setvariable variable="SUBSCRIPTION_ID";isOutput=true;]$SUBSCRIPTION_ID" 99 | 100 | echo "Set AML_WS_NAME As Environment Variable..." 101 | echo "##vso[task.setvariable variable="AML_WS_NAME";isOutput=true;]$AML_WS_NAME" 102 | 103 | echo "Set DATABRICKS_WS_NAME As Environment Variable..." 
104 | echo "##vso[task.setvariable variable="DATABRICKS_WS_NAME";isOutput=true;]$DATABRICKS_WS_NAME" 105 | fi 106 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/build/lib/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from mlflow.tracking import MlflowClient 4 | import math 5 | from datetime import timedelta 6 | from pytz import timezone 7 | from pyspark.sql.types import FloatType, IntegerType, StringType 8 | import mlflow 9 | #from databricks import feature_store 10 | from pyspark.sql.functions import * 11 | from pyspark.sql.types import FloatType, IntegerType, StringType 12 | from pytz import timezone 13 | import mlflow 14 | 15 | 16 | 17 | 18 | 19 | def utils_test_function(): 20 | a = 8 21 | b = 10 22 | 23 | c = a + b 24 | return c 25 | 26 | @udf(returnType=IntegerType()) 27 | def is_weekend(dt): 28 | tz = "America/New_York" 29 | return int(dt.astimezone(timezone(tz)).weekday() >= 5) # 5 = Saturday, 6 = Sunday 30 | 31 | @udf(returnType=StringType()) 32 | def partition_id(dt): 33 | # datetime -> "YYYY-MM" 34 | return f"{dt.year:04d}-{dt.month:02d}" 35 | 36 | 37 | def filter_df_by_ts(df, ts_column, start_date, end_date): 38 | if ts_column and start_date: 39 | df = df.filter(col(ts_column) >= start_date) 40 | if ts_column and end_date: 41 | df = df.filter(col(ts_column) < end_date) 42 | return df 43 | 44 | 45 | def rounded_unix_timestamp(dt, num_minutes=15): 46 | """ 47 | Ceilings datetime dt to interval num_minutes, then returns the unix timestamp. 48 | """ 49 | nsecs = dt.minute * 60 + dt.second + dt.microsecond * 1e-6 50 | delta = math.ceil(nsecs / (60 * num_minutes)) * (60 * num_minutes) - nsecs 51 | return int((dt + timedelta(seconds=delta)).timestamp()) 52 | 53 | 54 | rounded_unix_timestamp_udf = udf(rounded_unix_timestamp, IntegerType()) 55 | 56 | 57 | def rounded_taxi_data( 58 | spark, 59 | taxi_data_df 60 | ): 61 | # Round the taxi data timestamp to 15 and 30 minute intervals so we can join with the pickup and dropoff features 62 | # respectively 63 | taxi_data_df = ( 64 | taxi_data_df.withColumn( 65 | "rounded_pickup_datetime", 66 | rounded_unix_timestamp_udf(taxi_data_df["tpep_pickup_datetime"], lit(15)), 67 | ) 68 | .withColumn( 69 | "rounded_dropoff_datetime", 70 | rounded_unix_timestamp_udf(taxi_data_df["tpep_dropoff_datetime"], lit(30)), 71 | ) 72 | .drop("tpep_pickup_datetime") 73 | .drop("tpep_dropoff_datetime") 74 | ) 75 | taxi_data_df.createOrReplaceTempView("taxi_data") 76 | return taxi_data_df 77 | 78 | def get_latest_model_version(model_name): 79 | latest_version = 1 80 | 81 | mlflow_client = MlflowClient() 82 | #mlflow.set_experiment() 83 | for mv in mlflow_client.search_model_versions(f"name='{model_name}'"): 84 | version_int = int(mv.version) 85 | if version_int > latest_version: 86 | latest_version = version_int 87 | return latest_version 88 | 89 | 90 | class fareClassifier(mlflow.pyfunc.PythonModel): 91 | def __init__(self, trained_model): 92 | self.model = trained_model 93 | 94 | def preprocess_result(self, model_input): 95 | return model_input 96 | 97 | def postprocess_result(self, results): 98 | '''Return post-processed results. 
99 | Creates a set of fare ranges 100 | and returns the predicted range.''' 101 | 102 | return ["$0 - $9.99" if result < 10 else "$10 - $19.99" if result < 20 else " > $20" for result in results] 103 | 104 | def predict(self, context, model_input): 105 | processed_df = self.preprocess_result(model_input.copy()) 106 | results = self.model.predict(processed_df) 107 | return self.postprocess_result(results) -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/build/lib/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # TO DO -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/build/lib/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | # TO DO -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/build/lib/registration/__init__.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from databricks.sdk.runtime import * 4 | from databricks import feature_store 5 | from pyspark.sql.types import * 6 | from pyspark.sql.functions import * 7 | from pyspark.sql.types import FloatType, IntegerType, StringType 8 | import mlflow 9 | from mlflow.tracking import MlflowClient 10 | from databricks import feature_store 11 | from common import * 12 | 13 | # COMMAND ---------- 14 | 15 | def wait_until_ready(model_name, model_version, client): 16 | for _ in range(10): 17 | model_version_details = client.get_model_version( 18 | name=model_name, 19 | version=model_version, 20 | ) 21 | status = ModelVersionStatus.from_string(model_version_details.status) 22 | print("Model status: %s" % ModelVersionStatus.to_string(status)) 23 | if status == ModelVersionStatus.READY: 24 | break 25 | time.sleep(1) 26 | 27 | 28 | def get_model_uri( 29 | fs, 30 | model_name, 31 | model_stage 32 | ): 33 | 34 | fs = feature_store.FeatureStoreClient() 35 | 36 | model_uri_production = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage) 37 | 38 | return model_uri_production 39 | 40 | def get_data(feature_table_name): 41 | 42 | 43 | taxi_data = spark.read.table("feature_store_taxi_example.nyc_yellow_taxi_with_zips") 44 | taxi_data = rounded_taxi_data(spark, taxi_data_df = taxi_data) 45 | 46 | return taxi_data 47 | 48 | def predict( 49 | fs, 50 | model_uri, 51 | taxi_data 52 | ): 53 | 54 | with_predictions = fs.score_batch( 55 | model_uri, 56 | taxi_data 57 | ) 58 | 59 | expected_y = with_predictions.select('fare_amount').toPandas() 60 | predicted_y = with_predictions.select('prediction').toPandas() 61 | 62 | from sklearn import metrics 63 | r2 = metrics.r2_score( 64 | expected_y, 65 | predicted_y 66 | ) 67 | 68 | display(expected_y) 69 | display(with_predictions) 70 | 71 | print(f"R2: {r2}") 72 | 73 | # Display Data For Demo Purposes 74 | 75 | import pyspark.sql.functions as func 76 | cols = ['prediction', 'fare_amount', 'trip_distance', 'pickup_zip', 'dropoff_zip', 77 | 'rounded_pickup_datetime', 'rounded_dropoff_datetime', 'mean_fare_window_1h_pickup_zip', 78 | 'count_trips_window_1h_pickup_zip', 'count_trips_window_30m_dropoff_zip', 'dropoff_is_weekend'] 79 | 80 | with_predictions_reordered = ( 81 | with_predictions.select( 82 | cols, 83 | ) 84 | .withColumnRenamed( 85 | "prediction", 86 | "predicted_fare_amount", 87 | ) 88 | .withColumn( 89 | "predicted_fare_amount", 90 | 
func.round("predicted_fare_amount", 2), 91 | ) 92 | ) 93 | display(with_predictions_reordered) 94 | 95 | return r2 96 | 97 | def evaluation( 98 | score_latest_model, 99 | score_production_model 100 | ): 101 | model_name = "taxi_example_fare_packaged" 102 | 103 | if score_latest_model > score_production_model: 104 | print("Latest Model Is Better Than Production Model") 105 | 106 | # Demote Production 107 | production_stage = 'production' 108 | 109 | # Get the latest model version in the production stage 110 | 111 | mlflow_client = MlflowClient() 112 | 113 | latest_production_version = mlflow_client.get_latest_versions( 114 | name=model_name, 115 | stages=[production_stage])[0].version 116 | 117 | #print(latest_production_version[0].version) 118 | #print(type(latest_production_version)) 119 | 120 | 121 | # Promote Latest Model To Production 122 | latest_model_version = get_latest_model_version(model_name) 123 | mlflow_client.transition_model_version_stage( 124 | name=model_name, 125 | version=latest_model_version, 126 | stage="production", 127 | archive_existing_versions = True 128 | ) 129 | 130 | 131 | def run_registration(model_name): 132 | fs = feature_store.FeatureStoreClient() 133 | 134 | latest_model_version = get_latest_model_version(model_name) 135 | 136 | taxi_data = get_data( 137 | feature_table_name = "feature_store_taxi_example.nyc_yellow_taxi_with_zips" 138 | ) 139 | 140 | model_uri_latest = get_model_uri( 141 | fs=fs, 142 | model_name = model_name, 143 | model_stage = "latest" 144 | 145 | ) 146 | 147 | print(model_uri_latest) 148 | 149 | model_uri_production = get_model_uri( 150 | fs=fs, 151 | model_name = model_name, 152 | model_stage = "production" 153 | ) 154 | 155 | print(model_uri_production) 156 | 157 | new = production_model_exists( 158 | model_name = model_name, 159 | model_stage = "production" 160 | ) 161 | 162 | print(new) 163 | 164 | if production_model_exists( 165 | model_name = model_name, 166 | model_stage = "Production" 167 | ): 168 | 169 | score_latest_model = predict( 170 | fs = fs, 171 | model_uri = model_uri_latest, 172 | taxi_data = taxi_data 173 | ) 174 | 175 | score_production_model = predict( 176 | fs = fs, 177 | model_uri = model_uri_production, 178 | taxi_data = taxi_data 179 | ) 180 | 181 | evaluation( 182 | score_latest_model = score_latest_model, 183 | score_production_model = score_production_model 184 | ) 185 | else: 186 | print("No production model found. 
Promoting latest model to production") 187 | mlflow_client = MlflowClient() 188 | mlflow_client.transition_model_version_stage( 189 | name="taxi_example_fare_packaged", 190 | version=latest_model_version, 191 | stage="production", 192 | archive_existing_versions = True 193 | ) 194 | 195 | 196 | def production_model_exists( 197 | model_name, 198 | model_stage 199 | ): 200 | 201 | 202 | mlflow_client = MlflowClient() 203 | for mv in mlflow_client.search_model_versions("name = '%s'" % model_name): 204 | if mv.current_stage == model_stage: 205 | return True 206 | 207 | # COMMAND ---------- 208 | 209 | if __name__ == "__main__": 210 | run_registration( 211 | model_name = "taxi_example_fare_packaged" 212 | ) 213 | 214 | #latest_model_version = get_latest_model_version(model_name) 215 | #mlflow_client = MlflowClient() 216 | #mlflow_client.get_latest_versions(name=model_name, stages=[model_stage], order_by=['creation_time desc'], max_results=1) 217 | #experiment = mlflow_client.get_experiment_by_name("/Shared/ciaran_experiment_nyc_taxi") 218 | #experiment_id = experiment.experiment_id 219 | 220 | # INCREDIBLY IMPORTANT - "runs" IS GIVING US EVERYTHING, INCLUDING R2 AND PARAMETERS - USE THIS FOR POWERBI 221 | #runs = mlflow.search_runs( 222 | # experiment_ids=experiment_id 223 | #) 224 | #display(runs) 225 | #runs_2 = mlflow.search_runs( 226 | # experiment_ids=experiment_id, 227 | # filter_string=f"tags.model_version = '{latest_model_version}'") 228 | #display(runs_2) 229 | #model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage) 230 | #model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage="latest") -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from mlflow.tracking import MlflowClient 4 | import math 5 | from datetime import timedelta 6 | from pytz import timezone 7 | from pyspark.sql.types import FloatType, IntegerType, StringType 8 | import mlflow 9 | #from databricks import feature_store 10 | from pyspark.sql.functions import * 11 | from pyspark.sql.types import FloatType, IntegerType, StringType 12 | from pytz import timezone 13 | import mlflow 14 | 15 | 16 | 17 | 18 | 19 | def utils_test_function(): 20 | a = 8 21 | b = 10 22 | 23 | c = a + b 24 | return c 25 | 26 | @udf(returnType=IntegerType()) 27 | def is_weekend(dt): 28 | tz = "America/New_York" 29 | return int(dt.astimezone(timezone(tz)).weekday() >= 5) # 5 = Saturday, 6 = Sunday 30 | 31 | @udf(returnType=StringType()) 32 | def partition_id(dt): 33 | # datetime -> "YYYY-MM" 34 | return f"{dt.year:04d}-{dt.month:02d}" 35 | 36 | 37 | def filter_df_by_ts(df, ts_column, start_date, end_date): 38 | if ts_column and start_date: 39 | df = df.filter(col(ts_column) >= start_date) 40 | if ts_column and end_date: 41 | df = df.filter(col(ts_column) < end_date) 42 | return df 43 | 44 | 45 | def rounded_unix_timestamp(dt, num_minutes=15): 46 | """ 47 | Ceilings datetime dt to interval num_minutes, then returns the unix timestamp. 
48 | """ 49 | nsecs = dt.minute * 60 + dt.second + dt.microsecond * 1e-6 50 | delta = math.ceil(nsecs / (60 * num_minutes)) * (60 * num_minutes) - nsecs 51 | return int((dt + timedelta(seconds=delta)).timestamp()) 52 | 53 | 54 | rounded_unix_timestamp_udf = udf(rounded_unix_timestamp, IntegerType()) 55 | 56 | 57 | def rounded_taxi_data( 58 | spark, 59 | taxi_data_df 60 | ): 61 | # Round the taxi data timestamp to 15 and 30 minute intervals so we can join with the pickup and dropoff features 62 | # respectively 63 | taxi_data_df = ( 64 | taxi_data_df.withColumn( 65 | "rounded_pickup_datetime", 66 | rounded_unix_timestamp_udf(taxi_data_df["tpep_pickup_datetime"], lit(15)), 67 | ) 68 | .withColumn( 69 | "rounded_dropoff_datetime", 70 | rounded_unix_timestamp_udf(taxi_data_df["tpep_dropoff_datetime"], lit(30)), 71 | ) 72 | .drop("tpep_pickup_datetime") 73 | .drop("tpep_dropoff_datetime") 74 | ) 75 | taxi_data_df.createOrReplaceTempView("taxi_data") 76 | return taxi_data_df 77 | 78 | def get_latest_model_version(model_name): 79 | latest_version = 1 80 | 81 | mlflow_client = MlflowClient() 82 | #mlflow.set_experiment() 83 | for mv in mlflow_client.search_model_versions(f"name='{model_name}'"): 84 | version_int = int(mv.version) 85 | if version_int > latest_version: 86 | latest_version = version_int 87 | return latest_version 88 | 89 | 90 | class fareClassifier(mlflow.pyfunc.PythonModel): 91 | def __init__(self, trained_model): 92 | self.model = trained_model 93 | 94 | def preprocess_result(self, model_input): 95 | return model_input 96 | 97 | def postprocess_result(self, results): 98 | '''Return post-processed results. 99 | Creates a set of fare ranges 100 | and returns the predicted range.''' 101 | 102 | return ["$0 - $9.99" if result < 10 else "$10 - $19.99" if result < 20 else " > $20" for result in results] 103 | 104 | def predict(self, context, model_input): 105 | processed_df = self.preprocess_result(model_input.copy()) 106 | results = self.model.predict(processed_df) 107 | return self.postprocess_result(results) -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-mlops-databricks/74c0fbbfab54de04078e0d45f267cd80f9537f11/src/pkg/nyc_taxi/dist/src_nyc_taxi-0.0.1.tar.gz -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/entrypoint.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from featurization import run_feature_store_refresh 4 | run_feature_store_refresh() 5 | 6 | # COMMAND ---------- 7 | from training import run_training 8 | 9 | run_training( 10 | experiment_name = "nyc_e2e_mlops", 11 | model_name = "taxi_example_fare_packaged", 12 | model_params = { 13 | "objective": "regression", 14 | "metric": "rmse", 15 | "num_leaves": 25, 16 | "learning_rate": 0.2, 17 | "bagging_fraction": 0.9, 18 | "feature_fraction": 0.9, 19 | "bagging_seed": 42, 20 | "verbosity": -1, 21 | 
"seed": 42, 22 | "num_rounds": 100 23 | } 24 | ) 25 | from registration import run_registration 26 | run_registration( 27 | model_name = "taxi_example_fare_packaged" 28 | ) -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # TO DO -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | # TO DO -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools" 4 | ] 5 | build-backend = "setuptools.build_meta" 6 | 7 | [tool.distutils.bdist_wheel] 8 | universal = true 9 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/registration/__init__.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | from databricks.sdk.runtime import * 4 | from databricks import feature_store 5 | from pyspark.sql.types import * 6 | from pyspark.sql.functions import * 7 | from pyspark.sql.types import FloatType, IntegerType, StringType 8 | import mlflow 9 | from mlflow.tracking import MlflowClient 10 | from databricks import feature_store 11 | from common import * 12 | 13 | # COMMAND ---------- 14 | 15 | def wait_until_ready(model_name, model_version, client): 16 | for _ in range(10): 17 | model_version_details = client.get_model_version( 18 | name=model_name, 19 | version=model_version, 20 | ) 21 | status = ModelVersionStatus.from_string(model_version_details.status) 22 | print("Model status: %s" % ModelVersionStatus.to_string(status)) 23 | if status == ModelVersionStatus.READY: 24 | break 25 | time.sleep(1) 26 | 27 | 28 | def get_model_uri( 29 | fs, 30 | model_name, 31 | model_stage 32 | ): 33 | 34 | fs = feature_store.FeatureStoreClient() 35 | 36 | model_uri_production = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage) 37 | 38 | return model_uri_production 39 | 40 | def get_data(feature_table_name): 41 | 42 | 43 | taxi_data = spark.read.table("feature_store_taxi_example.nyc_yellow_taxi_with_zips") 44 | taxi_data = rounded_taxi_data(spark, taxi_data_df = taxi_data) 45 | 46 | return taxi_data 47 | 48 | def predict( 49 | fs, 50 | model_uri, 51 | taxi_data 52 | ): 53 | 54 | with_predictions = fs.score_batch( 55 | model_uri, 56 | taxi_data 57 | ) 58 | 59 | expected_y = with_predictions.select('fare_amount').toPandas() 60 | predicted_y = with_predictions.select('prediction').toPandas() 61 | 62 | from sklearn import metrics 63 | r2 = metrics.r2_score( 64 | expected_y, 65 | predicted_y 66 | ) 67 | 68 | display(expected_y) 69 | display(with_predictions) 70 | 71 | print(f"R2: {r2}") 72 | 73 | # Display Data For Demo Purposes 74 | 75 | import pyspark.sql.functions as func 76 | cols = ['prediction', 'fare_amount', 'trip_distance', 'pickup_zip', 'dropoff_zip', 77 | 'rounded_pickup_datetime', 'rounded_dropoff_datetime', 'mean_fare_window_1h_pickup_zip', 78 | 'count_trips_window_1h_pickup_zip', 'count_trips_window_30m_dropoff_zip', 'dropoff_is_weekend'] 79 | 80 | with_predictions_reordered = ( 81 | with_predictions.select( 82 | cols, 83 | ) 84 | .withColumnRenamed( 85 | 
"prediction", 86 | "predicted_fare_amount", 87 | ) 88 | .withColumn( 89 | "predicted_fare_amount", 90 | func.round("predicted_fare_amount", 2), 91 | ) 92 | ) 93 | display(with_predictions_reordered) 94 | 95 | return r2 96 | 97 | def evaluation( 98 | score_latest_model, 99 | score_production_model 100 | ): 101 | model_name = "taxi_example_fare_packaged" 102 | 103 | if score_latest_model > score_production_model: 104 | print("Latest Model Is Better Than Production Model") 105 | 106 | # Demote Production 107 | production_stage = 'production' 108 | 109 | # Get the latest model version in the production stage 110 | 111 | mlflow_client = MlflowClient() 112 | 113 | latest_production_version = mlflow_client.get_latest_versions( 114 | name=model_name, 115 | stages=[production_stage])[0].version 116 | 117 | #print(latest_production_version[0].version) 118 | #print(type(latest_production_version)) 119 | 120 | 121 | # Promote Latest Model To Production 122 | latest_model_version = get_latest_model_version(model_name) 123 | mlflow_client.transition_model_version_stage( 124 | name=model_name, 125 | version=latest_model_version, 126 | stage="production", 127 | archive_existing_versions = True 128 | ) 129 | 130 | 131 | def run_registration(model_name): 132 | fs = feature_store.FeatureStoreClient() 133 | 134 | latest_model_version = get_latest_model_version(model_name) 135 | 136 | taxi_data = get_data( 137 | feature_table_name = "feature_store_taxi_example.nyc_yellow_taxi_with_zips" 138 | ) 139 | 140 | model_uri_latest = get_model_uri( 141 | fs=fs, 142 | model_name = model_name, 143 | model_stage = "latest" 144 | 145 | ) 146 | 147 | print(model_uri_latest) 148 | 149 | model_uri_production = get_model_uri( 150 | fs=fs, 151 | model_name = model_name, 152 | model_stage = "production" 153 | ) 154 | 155 | print(model_uri_production) 156 | 157 | new = production_model_exists( 158 | model_name = model_name, 159 | model_stage = "production" 160 | ) 161 | 162 | print(new) 163 | 164 | if production_model_exists( 165 | model_name = model_name, 166 | model_stage = "Production" 167 | ): 168 | 169 | score_latest_model = predict( 170 | fs = fs, 171 | model_uri = model_uri_latest, 172 | taxi_data = taxi_data 173 | ) 174 | 175 | score_production_model = predict( 176 | fs = fs, 177 | model_uri = model_uri_production, 178 | taxi_data = taxi_data 179 | ) 180 | 181 | evaluation( 182 | score_latest_model = score_latest_model, 183 | score_production_model = score_production_model 184 | ) 185 | else: 186 | print("No production model found. 
Promoting latest model to production") 187 | mlflow_client = MlflowClient() 188 | mlflow_client.transition_model_version_stage( 189 | name="taxi_example_fare_packaged", 190 | version=latest_model_version, 191 | stage="production", 192 | archive_existing_versions = True 193 | ) 194 | 195 | 196 | def production_model_exists( 197 | model_name, 198 | model_stage 199 | ): 200 | 201 | 202 | mlflow_client = MlflowClient() 203 | for mv in mlflow_client.search_model_versions("name = '%s'" % model_name): 204 | if mv.current_stage == model_stage: 205 | return True 206 | 207 | # COMMAND ---------- 208 | 209 | if __name__ == "__main__": 210 | run_registration( 211 | model_name = "taxi_example_fare_packaged" 212 | ) 213 | 214 | #latest_model_version = get_latest_model_version(model_name) 215 | #mlflow_client = MlflowClient() 216 | #mlflow_client.get_latest_versions(name=model_name, stages=[model_stage], order_by=['creation_time desc'], max_results=1) 217 | #experiment = mlflow_client.get_experiment_by_name("/Shared/ciaran_experiment_nyc_taxi") 218 | #experiment_id = experiment.experiment_id 219 | 220 | # INCREDIBLY IMPORTANT - "runs" IS GIVING US EVERYTHING, INCLUDING R2 AND PARAMETERS - USE THIS FOR POWERBI 221 | #runs = mlflow.search_runs( 222 | # experiment_ids=experiment_id 223 | #) 224 | #display(runs) 225 | #runs_2 = mlflow.search_runs( 226 | # experiment_ids=experiment_id, 227 | # filter_string=f"tags.model_version = '{latest_model_version}'") 228 | #display(runs_2) 229 | #model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage=model_stage) 230 | #model_uri = "models:/{model_name}/{model_stage}".format(model_name=model_name, model_stage="latest") -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = src_nyc_taxi 3 | version = 0.0.1 4 | description = NYC Taxi Data Source 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown; charset=UTF-8 7 | author = Ciaran Hamill Diamond 8 | 9 | [options] 10 | package_dir = 11 | src_nyc_taxi = src_nyc_taxi 12 | packages = find: 13 | platforms = any 14 | include_package_data = True 15 | python_requires = ==3.10.* 16 | install_requires = 17 | packaging==21.* 18 | azure-identity 19 | azure-keyvault-secrets 20 | azure-keyvault-keys 21 | datetime 22 | argparse 23 | pathlib 24 | argon2-cffi==20.1.0 25 | astor==0.8.1 26 | astunparse==1.6.3 27 | async-generator==1.10 28 | attrs==21.2.0 29 | azure-core 30 | backcall==0.2.0 31 | backports.entry-points-selectable==1.1.1 32 | bcrypt==4.0.0 33 | black==22.3.0 34 | bleach==4.0.0 35 | blis==0.7.8 36 | boto3==1.21.18 37 | botocore==1.24.18 38 | cachetools==5.2.0 39 | certifi==2021.10.8 40 | cffi==1.14.6 41 | chardet==4.0.0 42 | charset-normalizer==2.0.4 43 | click==8.0.3 44 | databricks-automl-runtime==0.2.11 45 | databricks-cli==0.17.3 46 | Flask==1.1.2 47 | importlib-metadata==4.8.1 48 | ipykernel==6.12.1 49 | ipython==7.32.0 50 | ipython-genutils==0.2.0 51 | ipywidgets==7.7.0 52 | Jinja2==2.11.3 53 | jupyter-client==6.1.12 54 | jupyter-core==4.8.1 55 | jupyterlab-pygments==0.1.2 56 | jupyterlab-widgets==1.0.0 57 | mlflow-databricks-artifacts==2.0.0 58 | mlflow-skinny==1.29.0 59 | pip==23.1.2 60 | pydantic==1.9.2 61 | pytz==2021.3 62 | PyYAML==6.0 63 | pyzmq==22.2.1 64 | regex==2021.8.3 65 | requests==2.28.1 66 | requests-oauthlib==1.3.1 67 | requests-unixsocket==0.2.0 68 | urllib3==1.26.7 69 | 
virtualenv==20.8.0 70 | visions==0.7.4 71 | wasabi==0.10.1 72 | wcwidth==0.2.5 73 | webencodings==0.5.1 74 | websocket-client==1.3.1 75 | Werkzeug==2.0.2 76 | wheel==0.37.0 77 | widgetsnbextension==3.6.0 78 | wrapt==1.12.1 79 | xgboost==1.6.2 80 | zipp==3.6.0 81 | azureml-mlflow==1.50.0 82 | azureml-core==1.50.0 83 | azure-ai-ml==1.4.0 84 | sklearn_pandas==2.2.0 85 | azureml-sdk==1.50.0 86 | uszipcode 87 | lightgbm 88 | azureml-sdk[databricks]==1.50.0 89 | python-dotenv 90 | databricks-feature-store==0.10.* 91 | azure-cosmos==4.3.1 92 | 93 | [options.extras_require] 94 | test = 95 | bandit==1.7.4 96 | freezegun==1.2.2 97 | pydocstyle==6.1.1 98 | pylint==2.15.0 99 | pylint_junit==0.3.2 100 | pytest==7.1.2 101 | pytest-cov==3.0.0 102 | 103 | build = 104 | databricks-cli==0.17.0 105 | wheel==0.37.1 106 | 107 | 108 | [pylint] 109 | disable = 110 | missing-class-docstring, 111 | missing-function-docstring, 112 | too-few-public-methods 113 | jobs = 4 114 | output-format = colorized 115 | 116 | # Maximum number of locals for function / method body 117 | max-locals = 20 118 | 119 | # Maximum number of arguments for function / method 120 | max-args = 10 121 | 122 | good-names = df 123 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | if __name__ == '__main__': 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: src-nyc-taxi 3 | Version: 0.0.1 4 | Summary: NYC Taxi Data Source 5 | Author: Ciaran Hamill Diamond 6 | Requires-Python: ==3.10.* 7 | Description-Content-Type: text/markdown; charset=UTF-8 8 | Provides-Extra: test 9 | Provides-Extra: build 10 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | pyproject.toml 2 | setup.cfg 3 | setup.py 4 | common/__init__.py 5 | evaluation/__init__.py 6 | featurization/__init__.py 7 | prediction/__init__.py 8 | registration/__init__.py 9 | src_nyc_taxi.egg-info/PKG-INFO 10 | src_nyc_taxi.egg-info/SOURCES.txt 11 | src_nyc_taxi.egg-info/dependency_links.txt 12 | src_nyc_taxi.egg-info/requires.txt 13 | src_nyc_taxi.egg-info/top_level.txt 14 | training/__init__.py -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | packaging==21.* 2 | azure-identity 3 | azure-keyvault-secrets 4 | azure-keyvault-keys 5 | datetime 6 | argparse 7 | pathlib 8 | argon2-cffi==20.1.0 9 | astor==0.8.1 10 | astunparse==1.6.3 11 | async-generator==1.10 12 | attrs==21.2.0 13 | azure-core 14 | backcall==0.2.0 15 | backports.entry-points-selectable==1.1.1 16 | bcrypt==4.0.0 17 | black==22.3.0 18 | bleach==4.0.0 19 | blis==0.7.8 20 | boto3==1.21.18 21 | botocore==1.24.18 22 | cachetools==5.2.0 23 | certifi==2021.10.8 24 | cffi==1.14.6 25 | 
chardet==4.0.0 26 | charset-normalizer==2.0.4 27 | click==8.0.3 28 | databricks-automl-runtime==0.2.11 29 | databricks-cli==0.17.3 30 | Flask==1.1.2 31 | importlib-metadata==4.8.1 32 | ipykernel==6.12.1 33 | ipython==7.32.0 34 | ipython-genutils==0.2.0 35 | ipywidgets==7.7.0 36 | Jinja2==2.11.3 37 | jupyter-client==6.1.12 38 | jupyter-core==4.8.1 39 | jupyterlab-pygments==0.1.2 40 | jupyterlab-widgets==1.0.0 41 | mlflow-databricks-artifacts==2.0.0 42 | mlflow-skinny==1.29.0 43 | pip==23.1.2 44 | pydantic==1.9.2 45 | pytz==2021.3 46 | PyYAML==6.0 47 | pyzmq==22.2.1 48 | regex==2021.8.3 49 | requests==2.28.1 50 | requests-oauthlib==1.3.1 51 | requests-unixsocket==0.2.0 52 | urllib3==1.26.7 53 | virtualenv==20.8.0 54 | visions==0.7.4 55 | wasabi==0.10.1 56 | wcwidth==0.2.5 57 | webencodings==0.5.1 58 | websocket-client==1.3.1 59 | Werkzeug==2.0.2 60 | wheel==0.37.0 61 | widgetsnbextension==3.6.0 62 | wrapt==1.12.1 63 | xgboost==1.6.2 64 | zipp==3.6.0 65 | azureml-mlflow==1.50.0 66 | azureml-core==1.50.0 67 | azure-ai-ml==1.4.0 68 | sklearn_pandas==2.2.0 69 | azureml-sdk==1.50.0 70 | uszipcode 71 | lightgbm 72 | azureml-sdk[databricks]==1.50.0 73 | python-dotenv 74 | databricks-feature-store==0.10.* 75 | azure-cosmos==4.3.1 76 | 77 | [build] 78 | databricks-cli==0.17.0 79 | wheel==0.37.1 80 | 81 | [test] 82 | bandit==1.7.4 83 | freezegun==1.2.2 84 | pydocstyle==6.1.1 85 | pylint==2.15.0 86 | pylint_junit==0.3.2 87 | pytest==7.1.2 88 | pytest-cov==3.0.0 89 | -------------------------------------------------------------------------------- /src/pkg/nyc_taxi/src_nyc_taxi.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | common 2 | evaluation 3 | featurization 4 | prediction 5 | registration 6 | training 7 | -------------------------------------------------------------------------------- /test/entrypoint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | 5 | if __name__ == '__main__': 6 | print(sys.argv[1:]) 7 | pytest.main(sys.argv[1:]) 8 | -------------------------------------------------------------------------------- /test/test_dbx_utils_pkg/test_utils_azure_login.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import unittest 4 | from unittest.mock import patch 5 | 6 | from dbx_utils.utils_azure_login import start_azure_login, run_cmd 7 | 8 | 9 | class TestRunCmd(unittest.TestCase): 10 | def test_run_cmd(self): 11 | test_cmd = ['echo', 'hello, world'] 12 | output, return_code = run_cmd(test_cmd) 13 | self.assertEqual(return_code, 0) 14 | self.assertEqual(output, ['hello, world']) 15 | 16 | def test_run_cmd_failure(self): 17 | test_cmd = ['12345'] 18 | with self.assertRaises(subprocess.CalledProcessError): 19 | run_cmd(test_cmd) 20 | 21 | 22 | class TestAzureLogin(unittest.TestCase): 23 | 24 | @patch('python.utils_azure_login.ARM_TENANT_ID', 'test_tenant_id') 25 | @patch('python.utils_azure_login.ARM_CLIENT_SECRET', 'test_client_secret') 26 | @patch('python.utils_azure_login.ARM_CLIENT_ID', 'test_client_id') 27 | @patch('python.utils_azure_login.run_cmd') 28 | def test_start_azure_login(self, mock_run_cmd): 29 | mock_run_cmd.return_value = ('', 0) 30 | return_code = start_azure_login() 31 | self.assertEqual(return_code, 0) # 0 is the expected return code for a successful login 32 | 33 | # This must be assessing the correct parameters are being passed to the run_cmd function 34 | # If 
someone changes the code, then the tests will fail 35 | mock_run_cmd.assert_called_once_with( 36 | [ 37 | 'az', 'login', 38 | '--service-principal', 39 | '-u', 'test_client_id', 40 | '-p', 'test_client_secret', 41 | '--tenant', 'test_tenant_id' 42 | ] 43 | ) 44 | -------------------------------------------------------------------------------- /test/test_dbx_utils_pkg/test_utils_create_azure_resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import subprocess 4 | import unittest 5 | import pytest 6 | from unittest.mock import patch, mock_open 7 | from dbx_utils.utils_create_azure_resources import deploy_azure_resources, run_cmd, LoadJson 8 | 9 | 10 | 11 | test_json = {"parameters": { 12 | "TemplateFilePath": { 13 | "value": "mlOps/devOps/infra/master_templates/main.bicep" 14 | }, 15 | "TemplateParamFilePath": { 16 | "value": "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json" 17 | }, 18 | "location": { 19 | "value": "eastus" 20 | } 21 | } 22 | } 23 | 24 | class TestLoadJson: 25 | @patch("builtins.open", new_callable=mock_open, read_data=test_json) 26 | def test_load_json(self, mock_file): 27 | load_json = LoadJson() 28 | result = load_json.load_json() 29 | expected_result = { 30 | "parameters": { 31 | "TemplateFilePath": { 32 | "value": "mlOps/devOps/infra/master_templates/main.bicep" 33 | }, 34 | "TemplateParamFilePath": { 35 | "value": "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json" 36 | }, 37 | "location": { 38 | "value": "eastus" 39 | } 40 | } 41 | } 42 | assert result == expected_result 43 | 44 | 45 | @patch.object(LoadJson, "load_json", return_value=test_json) 46 | def test_get_param_file_path(self, mock_load_json): 47 | load_json = LoadJson() 48 | result = load_json.get_param_file_path() 49 | expected_result = "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json" 50 | assert result == expected_result 51 | 52 | 53 | @patch.object(LoadJson, "load_json", return_value=test_json) 54 | def test_get_template_file_path(self, mock_load_json): 55 | load_json = LoadJson() 56 | result = load_json.get_template_file_path() 57 | expected_result = "mlOps/devOps/infra/master_templates/main.bicep" 58 | assert result == expected_result 59 | 60 | 61 | @patch.object(LoadJson, "load_json", return_value=test_json) 62 | def test_get_location(self, mock_load_json): 63 | load_json = LoadJson() 64 | result = load_json.get_location() 65 | expected_result = "eastus" 66 | assert result == expected_result 67 | 68 | 69 | class TestRunCmd: 70 | def test_run_cmd_success(self): 71 | cmd = ["echo", "Hello, world!"] 72 | result = run_cmd(cmd) 73 | expected_result = ["Hello, world!"] 74 | assert result == expected_result 75 | 76 | def test_run_cmd_error(self): 77 | cmd = ["nonexistentcommand"] 78 | with pytest.raises(RuntimeError): 79 | run_cmd(cmd) 80 | 81 | 82 | class TestDeployAzureResources: 83 | @patch("python.utils_create_azure_resources.run_cmd") 84 | @patch.object(LoadJson, "get_param_file_path", return_value="mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json") 85 | @patch.object(LoadJson, "get_template_file_path", return_value="mlOps/devOps/infra/master_templates/main.bicep") 86 | @patch.object(LoadJson, "get_location", return_value="eastus") 87 | def test_deploy_azure_resources_success(self, mock_location, mock_template_file_path, mock_param_file_path, mock_run_cmd): 88 | mock_run_cmd.return_value = ('', 0) 89 | 
        deploy_azure_resources()
90 |         mock_run_cmd.assert_called_with(
91 |             [
92 |                 "az", "deployment", "sub", "create",
93 |                 "--location", "eastus",
94 |                 "--template-file", "mlOps/devOps/infra/master_templates/main.bicep",
95 |                 "--parameters", "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json",
96 |                 "--name", "test_environment",
97 |             ]
98 |         )
99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | def test_load_json():
127 |     # Mock the `open` function to return a mocked file object
128 |     mocked_open = mock_open(read_data='''{
129 |         "parameters": {
130 |             "TemplateFilePath": {
131 |                 "value": "mlOps/devOps/infra/master_templates/main.bicep"
132 |             },
133 |             "TemplateParamFilePath": {
134 |                 "value": "mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json"
135 |             },
136 |             "location": {
137 |                 "value": "eastus"
138 |             }
139 |         }
140 |     }''')
141 | 
142 |     with patch('builtins.open', mocked_open):
143 |         # Create an instance of the LoadJson class
144 |         load_json_obj = LoadJson()
145 | 
146 |         # Call the load_json method and assert that it returns the expected dictionary
147 |         assert load_json_obj.load_json() == {
148 |             'parameters': {
149 |                 'TemplateFilePath': {
150 |                     'value': 'mlOps/devOps/infra/master_templates/main.bicep'
151 |                 },
152 |                 'TemplateParamFilePath': {
153 |                     'value': 'mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json'
154 |                 },
155 |                 'location': {
156 |                     'value': 'eastus'
157 |                 }
158 |             }
159 |         }
160 | 
161 |         # Call the get_template_file_path method and assert that it returns the expected value
162 |         assert load_json_obj.get_template_file_path() == 'mlOps/devOps/infra/master_templates/main.bicep'
163 | 
164 |         # Call the get_param_file_path method and assert that it returns the expected value
165 |         assert load_json_obj.get_param_file_path() == 'mlOps/devOps/infra/master_templates/params/development/bicep.parameters.json'
166 | 
167 |         # Call the get_location method and assert that it returns the expected value
168 |         assert load_json_obj.get_location() == 'eastus'
169 | 
170 | 
171 | class TestRunCmd(unittest.TestCase):
172 |     def test_run_cmd(self):
173 |         test_cmd = ['echo', 'hello, world']
174 |         output, return_code = run_cmd(test_cmd)
175 |         self.assertEqual(return_code, 0)
176 |         self.assertEqual(output, ['hello, world'])
177 | 
178 |     def test_run_cmd_failure(self):
179 |         test_cmd = ['12345']
180 |         with self.assertRaises(subprocess.CalledProcessError):
181 |             run_cmd(test_cmd)
182 | 
183 | 
184 | class TestCreateAzureResources(unittest.TestCase):
185 | 
186 |     @patch('dbx_utils.utils_azure_login.ARM_TENANT_ID', 'test_tenant_id')
187 |     @patch('dbx_utils.utils_azure_login.ARM_CLIENT_SECRET', 'test_client_secret')
188 |     @patch('dbx_utils.utils_azure_login.ARM_CLIENT_ID', 'test_client_id')
189 |     @patch('dbx_utils.utils_create_azure_resources.run_cmd')
190 |     def test_start_azure_login(self, mock_run_cmd):
191 |         mock_run_cmd.return_value = ('', 0)
192 |         return_code = deploy_azure_resources()
193 |         self.assertEqual(return_code, 0)  # 0 is the expected return code for a successful deployment
194 | 
195 |         # This asserts that the correct parameters are being passed to the run_cmd function
196 |         # If someone changes the code, then the tests will fail.
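        # The expected command below mirrors what deploy_azure_resources() is meant to run:
        # an "az deployment sub create" at subscription scope, pointed at the Bicep template
        # and parameter file resolved through LoadJson and named after the target environment.
        # Note: location, template_file_path, template_param_file_path and ENVIRONMENT are
        # presumably module-level values in dbx_utils.utils_create_azure_resources; they are
        # not defined locally in this test body.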
197 | 198 | mock_run_cmd.assert_called_once_with( 199 | [ 200 | "az", "deployment", "sub", "create", 201 | "--location", location, 202 | "--template-file", template_file_path, 203 | "--parameters", template_param_file_path, 204 | "--name", ENVIRONMENT, 205 | "--only-show-errors" ] 206 | ) 207 | 208 | -------------------------------------------------------------------------------- /test/test_dbx_utils_pkg/test_utils_create_repo_folder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, MagicMock, mock_open 3 | from unittest import mock 4 | import pytest 5 | from _pytest.monkeypatch import MonkeyPatch 6 | import json 7 | import requests 8 | 9 | from dbx_utils.utils_create_repo_folder import _ingest_repo_param_file, create_databricks_repos 10 | 11 | 12 | class TestCreateRepoFolder(unittest.TestCase): 13 | 14 | @patch('requests.post') 15 | def test_create_databricks_repos_success(self, mock_post): 16 | monkeypatch = MonkeyPatch() 17 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 18 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 19 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 20 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 21 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 22 | 23 | 24 | mock_post.return_value.status_code = 200 25 | 26 | mock_repo_json = { 27 | "url": "test_url", 28 | "provider": "test_provider", 29 | "path": "test_folder" 30 | } 31 | 32 | status_code = create_databricks_repos(mock_repo_json) 33 | 34 | assert status_code == 200 35 | expected_dbkrs_req_headers = { 36 | 'Authorization': 'Bearer test_databricks_aad_token', 37 | 'X-Databricks-Azure-SP-Management-Token': 'test_databricks_management_token', 38 | 'X-Databricks-Azure-Workspace-Resource-Id': 'test_workspace_id', 39 | 'Content-Type': 'application/json'} 40 | 41 | mock_post.assert_called_once_with( 42 | 'https://test_databricks_instance/api/2.0/repos', 43 | headers=expected_dbkrs_req_headers, 44 | json=mock_repo_json) 45 | 46 | @patch('requests.post') 47 | def test_create_databricks_repos_failure(self, mock_post): 48 | monkeypatch = MonkeyPatch() 49 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 50 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 51 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 52 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 53 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 54 | 55 | mock_post.return_value.status_code = 400 56 | 57 | mock_repo_json = { 58 | "url": "test_url", 59 | "provider": "test_provider", 60 | "path": "test_folder" 61 | } 62 | 63 | with pytest.raises(Exception) as e: 64 | status_code = create_databricks_repos(mock_repo_json) 65 | assert status_code == 400 66 | 67 | 68 | 69 | class TestIngestRepoParamFile(unittest.TestCase): 70 | 71 | test_repo_json = { 72 | "Git_Configuration": [ 73 | { 74 | "git_username": "test_username", 75 | "git_provider": "test_provider", 76 | } 77 | ], 78 | "Repo_Configuration": [ 79 | { 80 | "url": "test_url", 81 | "provider": "test_provider", 82 | "path": "test_folder" 83 | } 84 | ] 85 | } 86 | 87 | test_repo_json = json.dumps(test_repo_json) 88 | 89 | 90 | @patch("builtins.open", new_callable=mock_open, read_data=test_repo_json) 91 | def test_load_json(self, mock_open): 92 | monkeypatch = MonkeyPatch() 93 | 
monkeypatch.setenv('ENVIRONMENT', 'test_environment') 94 | #cluster = Cluster() 95 | 96 | result = _ingest_repo_param_file( "test_cluster_param_file.json") 97 | 98 | # Expected result is an array and not an object 99 | expected_result = [ 100 | { 101 | "url": "test_url", 102 | "provider": "test_provider", 103 | "path": "test_folder" 104 | } 105 | ] 106 | assert result == expected_result -------------------------------------------------------------------------------- /test/test_dbx_utils_pkg/test_utils_repo_pull.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, MagicMock, mock_open 3 | from unittest import mock 4 | import pytest 5 | from _pytest.monkeypatch import MonkeyPatch 6 | import json 7 | import requests 8 | 9 | from dbx_utils.utils_repo_pull import _ingest_repo_param_file, get_repos_with_management_permissions, update_repo, main 10 | 11 | 12 | class TestIngestRepoParamFile(unittest.TestCase): 13 | 14 | test_repo_json = { 15 | "Git_Configuration": [ 16 | { 17 | "git_username": "test_username", 18 | "git_provider": "test_provider", 19 | } 20 | ], 21 | "Repo_Configuration": [ 22 | { 23 | "url": "test_url", 24 | "provider": "test_provider", 25 | "path": "test_folder" 26 | } 27 | ] 28 | } 29 | 30 | test_repo_json = json.dumps(test_repo_json) 31 | 32 | 33 | @patch("builtins.open", new_callable=mock_open, read_data=test_repo_json) 34 | def test_load_json(self, mock_open): 35 | monkeypatch = MonkeyPatch() 36 | monkeypatch.setenv('ENVIRONMENT', 'test_environment') 37 | #cluster = Cluster() 38 | 39 | result = _ingest_repo_param_file( "test_cluster_param_file.json") 40 | 41 | # Expected result is an array and not an object 42 | expected_result = [ 43 | { 44 | "url": "test_url", 45 | "provider": "test_provider", 46 | "path": "test_folder" 47 | } 48 | ] 49 | assert result == expected_result 50 | 51 | #get_repos_with_management_permissions 52 | class GetReposWithManagementPermissions(unittest.TestCase): 53 | 54 | @patch('requests.get') 55 | def test_get_repos_with_management_permissions_success(self, mock_get): 56 | monkeypatch = MonkeyPatch() 57 | 58 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 59 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 60 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 61 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 62 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 63 | 64 | mock_get.return_value.status_code = 200 65 | 66 | mock_return = { 67 | "repos":[ 68 | { 69 | "id":61449681029719, 70 | "path":"/Repos/***/test_dbx_repo_folder_one", 71 | "url":"https://github.com/test_repo_profile/test_repo_one", 72 | "provider":"gitHub", 73 | "branch":"main", 74 | "head_commit_id":"test_commit_id" 75 | } 76 | ] 77 | } 78 | 79 | mock_get.return_value.json.return_value = mock_return 80 | 81 | repos_with_management_permissions, status_code = get_repos_with_management_permissions() 82 | 83 | assert repos_with_management_permissions == mock_return["repos"] 84 | assert status_code == 200 85 | 86 | 87 | expected_dbkrs_req_headers = { 88 | 'Authorization': 'Bearer test_databricks_aad_token', 89 | 'X-Databricks-Azure-SP-Management-Token': 'test_databricks_management_token', 90 | 'X-Databricks-Azure-Workspace-Resource-Id': 'test_workspace_id', 91 | 'Content-Type': 'application/json'} 92 | 93 | 94 | mock_get.assert_called_once_with( 95 | 
'https://test_databricks_instance/api/2.0/repos', 96 | headers=expected_dbkrs_req_headers 97 | ) 98 | 99 | 100 | @patch('requests.get') 101 | def test_get_repos_with_management_permissions_failure(mock_get): 102 | monkeypatch = MonkeyPatch() 103 | 104 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 105 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 106 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 107 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 108 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 109 | 110 | mock_get.return_value.status_code = 500 111 | 112 | with pytest.raises(Exception) as e: 113 | status_code = get_repos_with_management_permissions() 114 | assert status_code == 500 115 | 116 | 117 | # update_repo 118 | 119 | class UpdateRepo(unittest.TestCase): 120 | 121 | @patch('requests.patch') 122 | def test_update_repo_success(self, mock_patch): 123 | monkeypatch = MonkeyPatch() 124 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 125 | monkeypatch.setenv('WORKSPACE_ID', 'test_workspace_id') 126 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 127 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 128 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 129 | 130 | mock_repo_id = 123456789 131 | mock_update_branch = "test_main_branch" 132 | 133 | mock_patch.return_value.status_code = 200 134 | 135 | status_code = update_repo(mock_repo_id, mock_update_branch ) 136 | 137 | assert status_code == 200 138 | 139 | 140 | expected_dbkrs_req_headers = { 141 | 'Authorization': 'Bearer test_databricks_aad_token', 142 | 'X-Databricks-Azure-SP-Management-Token': 'test_databricks_management_token', 143 | 'X-Databricks-Azure-Workspace-Resource-Id': 'test_workspace_id', 144 | 'Content-Type': 'application/json'} 145 | 146 | mock_patch.assert_called_once_with( 147 | "https://test_databricks_instance/api/2.0/repos/" + str(mock_repo_id), 148 | headers=expected_dbkrs_req_headers, 149 | json={ 150 | "branch": mock_update_branch 151 | } 152 | ) 153 | 154 | @patch('requests.post') 155 | def test_update_repo_failure(self, mock_post): 156 | 157 | mock_repo_id = 123456789 158 | mock_update_branch = "test_main_branch" 159 | 160 | mock_post.return_value.status_code = 500 161 | 162 | with pytest.raises(Exception) as e: 163 | status_code = update_repo(mock_repo_id, mock_update_branch ) 164 | assert status_code == 500 165 | 166 | 167 | class Main(unittest.TestCase): 168 | 169 | test_repo_json = { 170 | "Git_Configuration": [ 171 | { 172 | "git_username": "test_username", 173 | "git_provider": "test_provider", 174 | } 175 | ], 176 | "Repo_Configuration": [ 177 | { 178 | "url": "test_url", 179 | "provider": "test_provider", 180 | "path": "test_folder" 181 | } 182 | ] 183 | } 184 | test_repo_json = json.dumps(test_repo_json) 185 | 186 | 187 | @patch('python.utils_repo_pull.update_repo') 188 | @patch('python.utils_repo_pull.get_repos_with_management_permissions') 189 | @patch('python.utils_repo_pull._ingest_repo_param_file') 190 | def test_main_success(self, mock_ingest_repo_param_file, mock_get_repos_with_management_permissions, mock_update_repo): 191 | 192 | # monkey patch environment variables 193 | monkeypatch = MonkeyPatch() 194 | monkeypatch.setenv('ENVIRONMENT', 'test_environment') 195 | monkeypatch.setenv('ARM_CLIENT_ID', 'test_arm_client_id') 196 | monkeypatch.setenv('WORKSPACE_ID', 
'test_workspace_id') 197 | monkeypatch.setenv('DATABRICKS_MANAGEMENT_TOKEN', 'test_databricks_management_token') 198 | monkeypatch.setenv('DATABRICKS_AAD_TOKEN', 'test_databricks_aad_token') 199 | monkeypatch.setenv('DATABRICKS_INSTANCE', 'test_databricks_instance') 200 | 201 | 202 | mock_ingest_repo_param_file_json_return = [{ 203 | "url": "test_url", 204 | "provider": "test_provider", 205 | "path": "test_folder", 206 | "branch": "test_branch" 207 | }] 208 | 209 | mock_ingest_repo_param_file.return_value = mock_ingest_repo_param_file_json_return 210 | 211 | # Should be doing a mock open instead !!! 212 | #mock_ingest_repo_param_file.return_value = mock_ingest_repo_param_file_json_return 213 | 214 | # mock return value from get repos with management permissions 215 | mock_get_repos_with_management_permissions_json_return = [ 216 | { 217 | "id":61449681029719, 218 | "path":"/Repos/***/test_folder", 219 | "url":"https://github.com/test_repo_profile/test_repo_one", 220 | "provider":"gitHub", 221 | "branch":"main", 222 | "head_commit_id":"test_commit_id" 223 | } 224 | ] 225 | 226 | mock_get_repos_with_management_permissions.return_value = (mock_get_repos_with_management_permissions_json_return, 200) 227 | 228 | # mock return value from update repo 229 | mock_update_repo.return_value = 200 230 | 231 | # call main function 232 | status_code = main() 233 | 234 | # assert main function returns 200 235 | assert status_code == 200 236 | 237 | 238 | # assert mock functions were called using correct arguments 239 | mock_ingest_repo_param_file.assert_called_once_with('mlOps/devOps/params/test_environment/repos.json') 240 | mock_update_repo.assert_called_once_with("61449681029719", "test_branch") 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | --------------------------------------------------------------------------------
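A minimal sketch of the control flow that test_main_success implies for main() in dbx_utils/utils_repo_pull.py. That module is not included in this listing, so anything beyond what the tests pin down (the param-file path, the path matching, the id being passed as a string) is an assumption rather than the actual implementation.

# Hypothetical sketch only: _ingest_repo_param_file, get_repos_with_management_permissions
# and update_repo are the real helpers imported at the top of test_utils_repo_pull.py;
# the body below is inferred from the mocked calls and may differ from the real module.
import os

def main():
    # Load the Repo_Configuration entries for the current environment's repos.json
    environment = os.environ["ENVIRONMENT"]
    repo_configuration = _ingest_repo_param_file(
        f"mlOps/devOps/params/{environment}/repos.json"
    )

    # List the repos the service principal can manage via the Databricks Repos API
    managed_repos, status_code = get_repos_with_management_permissions()

    # Match each configured repo folder to a workspace repo and pull the configured branch
    for repo in repo_configuration:
        for managed_repo in managed_repos:
            if managed_repo["path"].endswith(repo["path"]):
                status_code = update_repo(str(managed_repo["id"]), repo["branch"])

    return status_code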