├── .github
│   └── workflows
│       └── cicd.yml
├── .gitignore
├── Code1.py
├── Code2.py
├── LICENSE
├── README.md
├── __init__.py
├── azure-pipelines.yml
├── images
│   ├── cicd-workflow.png
│   ├── create-personal-project.png
│   ├── create-project-in-staging.png
│   ├── create-staging-folder.png
│   ├── release-pipeline-devops.png
│   ├── release-pipeline-github-actions.png
│   ├── release-pipeline.png
│   └── release-tasks-devops.png
├── my_package
│   ├── __init__.py
│   ├── code1.py
│   └── code2.py
├── terraform
│   └── azuredevops
│       ├── README.md
│       ├── cluster.tf
│       ├── devops.tf
│       ├── main.tf
│       ├── repo.tf
│       └── variables.tf
└── unit-tests
    ├── test_with_arbitrary_files.py
    └── test_with_percent_run.py

/.github/workflows/cicd.yml:
--------------------------------------------------------------------------------
name: CICD
on:
  push:
    paths-ignore:
      - README.md
      - LICENSE
      - images
      - terraform
      - azure-pipelines.yml

env:
  DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
  DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
  REPO_DIRECTORY: ${{ vars.REPO_DIRECTORY }}
  CLUSTER_ID: ${{ vars.CLUSTER_ID }}

permissions:
  checks: write

jobs:
  OnPush:
    if: github.ref_name == 'dev'
    environment: dev
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.8

      - name: Install libraries
        run: |
          pip install --upgrade pip nutter

      - name: install-databricks-cli
        uses: microsoft/install-databricks-cli@v1.0.0

      - name: Update databricks repo
        run: |
          databricks repos update --path $REPO_DIRECTORY --branch "${{ github.ref_name }}"

      - name: Run Tests
        run: |
          nutter run "$REPO_DIRECTORY/unit-tests/" --cluster_id $CLUSTER_ID --recursive --junit_report --timeout 500

      - name: Publish Test Report
        uses: mikepenz/action-junit-report@v3
        if: success() || failure()    # always run, even if the previous step fails
        with:
          report_paths: 'test-*.xml'

  OnRelease:
    if: startsWith(github.ref_name, 'releases')
    environment: staging
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.8

      - name: Install libraries
        run: |
          pip install --upgrade pip nutter

      - name: install-databricks-cli
        uses: microsoft/install-databricks-cli@v1.0.0

      - name: Update databricks repo
        run: |
          databricks repos update --path $REPO_DIRECTORY --branch "${{ github.ref_name }}"

      - name: Run Tests
        run: |
          pwd
          nutter run "$REPO_DIRECTORY/unit-tests/" --cluster_id $CLUSTER_ID --recursive --junit_report --timeout 500

      - name: Publish Test Report
        uses: mikepenz/action-junit-report@v3
        if: success() || failure()    # always run, even if the previous step fails
        with:
          report_paths: 'test-*.xml'

  DeployProduction:
    environment: prod
    runs-on: ubuntu-latest
    needs: [OnRelease]
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.8

      - name: Install libraries
        run: |
          pip install --upgrade pip nutter

      - name: install-databricks-cli
        uses: microsoft/install-databricks-cli@v1.0.0

      - name: Update databricks repo
        run: |
          databricks repos update --path $REPO_DIRECTORY --branch "${{ github.ref_name }}"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*~
/Release using Projects.json
/test-*.xml
/.DS_Store
/images/.DS_Store
/out
.terraform/
.terraform.lock.hcl
terraform.tfvars
terraform.tfstate
terraform.tfstate.backup
--------------------------------------------------------------------------------
/Code1.py:
--------------------------------------------------------------------------------
# Databricks notebook source
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import DataFrame, SparkSession

# COMMAND ----------

def generate_data1(n=1000, name='my_cool_data'):
    df = SparkSession.getActiveSession().range(0, n)
    df.createOrReplaceTempView(name)

# COMMAND ----------

def upper_columns(df: DataFrame, cols: list) -> DataFrame:
    new_cols = []
    for field in df.schema.fields:
        if field.dataType == T.StringType() and field.name in cols:
            new_cols.append(F.upper(F.col(field.name)).alias(field.name))
        else:
            new_cols.append(F.col(field.name))

    return df.select(*new_cols)

# COMMAND ----------

# add the comment
--------------------------------------------------------------------------------
/Code2.py:
--------------------------------------------------------------------------------
# Databricks notebook source
from pyspark.sql import SparkSession


def generate_data2(table_name="my_data"):
    df = SparkSession.getActiveSession().range(0, 10)
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)

# COMMAND ----------

# just add code....
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2021 Alex Ott

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This repository contains notebooks & instructions for setting up a demo of a development workflow & CI/CD (on Azure DevOps) using Databricks notebooks and the [Repos feature](https://docs.databricks.com/repos.html). Testing of notebooks is done using the [Nutter library](https://github.com/microsoft/nutter) developed by Microsoft.

Two approaches are demonstrated:
1. Using notebooks & including the code with `%run` ([doc](https://docs.databricks.com/notebooks/notebooks-use.html#run)) - the "main" code is in the notebooks `Code1.py` and `Code2.py`, and the testing code is in `unit-tests/test_with_percent_run.py`.
1. Using a notebook for the test itself, but including the main code as a Python package via the [arbitrary files in Repos](https://docs.databricks.com/repos.html#work-with-non-notebook-files-in-a-databricks-repo) functionality (DBR 9.1+). The main code is in the `my_package/code1.py` and `my_package/code2.py` files, and the test is in `unit-tests/test_with_arbitrary_files.py`.

This demo shows how you can use Repos to work on your own copy of the notebooks, test them after each commit in a "staging" environment, and promote them to "production" after successful testing of the `releases` branch.

* [The workflow](#the-workflow)
* [Setup on Databricks side](#setup-on-databricks-side)
* [Setup Azure DevOps pipelines](#setup-azure-devops-pipelines)
  * [Create variables group to keep common configuration](#create-variables-group-to-keep-common-configuration)
  * [Create a build pipeline](#create-a-build-pipeline)
  * [Create a release pipeline](#create-a-release-pipeline)
* [FAQ & Troubleshooting](#faq--troubleshooting)
  * [I'm getting "Can’t find repo ID for /Repos/..." when trying to update a repo](#im-getting-cant-find-repo-id-for-repos-when-trying-to-update-a-repo)
  * [I'm getting "Error fetching repo ID for ... Unauthorized access to Org..."](#im-getting-error-fetching-repo-id-for--unauthorized-access-to-org)
  * [How can I perform Repos operations using the service principal?](#how-can-i-perform-repos-operations-using-the-service-principal)

This demo can also be set up automatically with Terraform. Look into the [terraform](terraform) folder for existing implementations.


# The workflow

The development workflow is organized as shown in the following image:

![Development workflow](images/cicd-workflow.png)

1. A developer works on the code in a separate environment (personal checkout on Databricks, etc.). When code changes are done, they are committed into some branch.
1. The CI/CD implementation (Azure DevOps here) picks up the changes and tests them in a staging environment by executing the "build pipeline". This consists of several steps (see [azure-pipelines.yml](azure-pipelines.yml) for technical details, and the command sketch after this list):
   * Update the repository checkout in the "Staging" folder
   * Execute the tests with the updated code
   * Publish the test results
1. In the current setup there are different jobs for "normal" branches and for the "release" branch (`releases` in this setup); this allows running different sets of tests when preparing a release.
1. If a commit is made to the "release" branch and there are no test failures, the "release pipeline" is triggered, and it updates the production environment by updating the repository checkout in the "Production" folder.
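
For orientation, the staging part of the build pipeline essentially boils down to two invocations - updating the checkout with the `databricks-cli` and running the tests with Nutter. The sketch below uses the variable names from [azure-pipelines.yml](azure-pipelines.yml); `BRANCH_NAME` stands in for the pipeline's `branchName` variable, and `DATABRICKS_HOST`/`DATABRICKS_TOKEN` are assumed to be set in the environment:

```sh
# Point the staging checkout at the branch that triggered the build
databricks repos update --path "$STAGING_DIRECTORY" --branch "$BRANCH_NAME"

# Run all Nutter tests from the unit-tests folder on an existing cluster,
# producing a JUnit report that the pipeline publishes afterwards
nutter run "$STAGING_DIRECTORY/unit-tests/" --cluster_id "$CLUSTER_ID" \
    --recursive --junit_report --timeout 500
```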


# Setup on Databricks side

Your Databricks workspace needs to have the Repos functionality enabled. If it's enabled, you should see the "Repos" icon in the navigation panel.

* Fork the repository into your environment - GitHub, or Azure DevOps (follow the Databricks documentation on using it)
* In the Repos, click "Create Repo" and link it to the Git repository that you've forked - this will be your personal copy of the code that will be used for work:

![Create a personal repo](images/create-personal-project.png)

* Create the staging & production checkouts
  * In the Repos, in the top-level part, click on the "ᐯ" near the "Repos" header, select "Create" and select "Folder" (see image). Give it the name "Staging":

![Create a staging folder](images/create-staging-folder.png)

  * Click on the "ᐯ" near the "Staging" folder, click "Create" and select "Repo":

![Create a staging repository](images/create-project-in-staging.png)

  * Link it to the Git repository, similarly to how you did it for your personal checkout
  * Create the "Production" folder with a repository inside, repeating the two previous steps
* Create a new cluster that will be used for execution of the tests; you will need to pass the [cluster ID](https://docs.databricks.com/workspace/workspace-details.html#cluster-url-and-id) to Nutter to execute the tests
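
The cluster ID can also be looked up from the command line with the same `databricks-cli` that the pipelines use - a small sketch, assuming the CLI authenticates via the usual environment variables (the host below is just the example workspace URL used later in this README):

```sh
export DATABRICKS_HOST="https://adb-1568830229861029.9.azuredatabricks.net"   # example workspace URL
export DATABRICKS_TOKEN="<personal access token>"
# Prints cluster IDs, names and states; pick the ID of the cluster created for the tests
databricks clusters list
```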


# Setup Azure DevOps pipelines

The Azure DevOps setup consists of several steps, described in the next sections. It's assumed that a project in Azure DevOps already exists.

We need to create a [personal access token (PAT)](https://docs.databricks.com/administration-guide/access-control/tokens.html) that will be used for execution of the tests & updating the repository. This token is used to authenticate to the Databricks workspace, which then uses its configured Git token to authenticate to the Git provider. We also need to connect the Databricks workspace to the Git provider - usually this is done using provider-specific access tokens - see the [documentation](https://docs.databricks.com/repos.html#configure-your-git-integration-with-databricks) for details on setting up the integration with a specific Git provider (**note that when the repository is on Azure DevOps, you still need to generate an Azure DevOps token to make the API work**, and also provide the user name in the Git settings).

> :warning: the previous instructions on using Repos + Azure DevOps with service principals weren't correct, so they were removed!

### Create variables group to keep common configuration

Because we have several pipelines, it makes sense to define a [variable group](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups) to store the data that is necessary for execution of tests & deployment of the code. We need the following configuration properties for execution of our pipelines:

* `databricks_host` - the [URL of your workspace](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) where tests will be executed (host name with `https://`, without `?o=`, and **without a trailing slash character**. For example: `https://adb-1568830229861029.9.azuredatabricks.net`).
* `databricks_token` - personal access token for executing commands against the workspace. Mark this variable as private!
* `cluster_id` - the ID of the cluster where tests will be executed. DBR 9.1+ should be used to support arbitrary files.
* `staging_directory` - the directory of the staging checkout that we created above. For example, `/Repos/Staging/databricks-nutter-repos-demo`.

The name of the variable group is used in [azure-pipelines.yml](azure-pipelines.yml). By default its name is "Nutter Testing". Change [azure-pipelines.yml](azure-pipelines.yml) if you use another name for the variable group.

### Create a build pipeline

Azure DevOps can work with GitHub repositories as well - see the [documentation](https://docs.microsoft.com/en-us/azure/devops/pipelines/repos/github) for more details on how to link DevOps with GitHub.

* In Azure DevOps, in the Pipelines section, select Pipelines, and click "New pipeline"
* Select GitHub and the repository
* In the "Configure" step select "Existing Azure Pipelines YAML file" and specify the name of the existing file: [azure-pipelines.yml](azure-pipelines.yml)
* Save the pipeline


### Create a release pipeline

* In Azure DevOps, in the Pipelines section, select Releases, and click "New release pipeline"
* Select "Empty Job" in the dialog
* In the Stage dialog enter some meaningful name for it
* In the "Variables" tab, link the variable group that was created previously
* Configure the job & task:
  * Configure the agent - in the "Agent Specification" select "ubuntu-18.04"
  * Click on "+" and find the "Command line" task
  * Enter the following code that will connect to the production environment & update the checkout of the repository (via the [Repos REST API](https://docs.databricks.com/dev-tools/api/latest/repos.html)):

```sh
python -m pip install --upgrade databricks-cli
databricks repos update --path /Repos/Production/databricks-nutter-repos-demo --branch releases
```

  * Below the code, add the environment variable `DATABRICKS_TOKEN` with the value `$(DATABRICKS_TOKEN)` - this will pull it from the variable group into the script's execution context
  * Save the task & job
* We also need to configure an artifact:
  * Click on "Add artifact", select the project and the source (the name of the build pipeline). Also, for "Default version" select "Latest from specific branch with tags" and select the "releases" branch. Click on "Add" to add the artifact into the pipeline
  * Click on the "⚡" icon to configure continuous deployment (by default, a release is triggered manually). Add a branch filter and select the `releases` branch
* Your release pipeline should look as follows:

![Release pipeline](images/release-pipeline.png)

* Save the pipeline

After all of this is done, the release pipeline will be automatically executed on every successful build in the `releases` branch.
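
The release task above uses the `databricks-cli`; the same update can also be performed by calling the [Repos REST API](https://docs.databricks.com/dev-tools/api/latest/repos.html) directly. A hedged sketch - the numeric repo ID below is a placeholder that you first have to look up:

```sh
# Look up the repo ID of the production checkout
curl -s -H "Authorization: Bearer $DATABRICKS_TOKEN" \
  "$DATABRICKS_HOST/api/2.0/repos?path_prefix=/Repos/Production/databricks-nutter-repos-demo"

# Switch that checkout to the releases branch (replace 123456789 with the real repo ID)
curl -s -X PATCH -H "Authorization: Bearer $DATABRICKS_TOKEN" \
  "$DATABRICKS_HOST/api/2.0/repos/123456789" \
  -d '{"branch": "releases"}'
```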

## Github Actions Workflow

* We need to create a [personal access token (PAT)](https://docs.databricks.com/administration-guide/access-control/tokens.html) that will be used for execution of the tests & updating the repository. This token is used to authenticate to the Databricks workspace, which then uses its configured Git token to authenticate to the Git provider. We also need to connect the Databricks workspace to the Git provider - usually this is done using provider-specific access tokens - see the [documentation](https://docs.databricks.com/repos.html#configure-your-git-integration-with-databricks) for details on setting up the integration with a specific Git provider.

* Create dev, staging and prod [Environments](https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment) in the GitHub settings. With environments it is easy to use the same variable names and secret names across different environments.

Create the following properties within each environment:

* `databricks_host` - the [URL of your workspace](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) where tests will be executed (host name with `https://`, without `?o=`, and **without a trailing slash character**. For example: `https://adb-1568830229861029.9.azuredatabricks.net`).
* `databricks_token` - personal access token for executing commands against the workspace. Create this as a secret variable!
* `cluster_id` - the ID of the cluster where tests will be executed. DBR 9.1+ should be used to support arbitrary files.
* `repo_directory` - the directory of the checkout for the specific environment. For example, `/Repos/Staging/databricks-nutter-repos-demo`.

The workflow is the same as above and the pipeline looks as follows:

![Release pipeline](images/release-pipeline-github-actions.png)

# FAQ & Troubleshooting

## I'm getting "Can’t find repo ID for /Repos/..." when trying to update a repo

This often happens when you're trying to use `databricks repos update` against a workspace that has IP Access Lists enabled. The error message is misleading, and will be fixed by [this pull request](https://github.com/databricks/databricks-cli/pull/428).

## I'm getting "Error fetching repo ID for ... Unauthorized access to Org..."

This usually happens when you're trying to run a CI/CD pipeline against a Databricks workspace with IP Access Lists enabled, and the CI/CD server is not in the allow list.

## How can I perform Repos operations using the service principal?

To perform operations on Repos (update, etc.) we need to associate a Git token with the identity that performs that operation. Please see the following documentation:
* [CICD with SPNs](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/ci-cd/ci-cd-sp)
* [Git Credentials REST API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/gitcredentials)
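
For illustration, the key step is a single call to the [Git Credentials REST API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/gitcredentials) performed *as the service principal* (i.e. `DATABRICKS_TOKEN` below must hold a token that belongs to the SPN, not to your own user); the user name and the Azure DevOps PAT are placeholders for whatever is configured for that principal:

```sh
# Registers a Git credential for the identity that owns DATABRICKS_TOKEN,
# so that subsequent Repos operations by that identity can reach the Git provider
curl -s -X POST -H "Authorization: Bearer $DATABRICKS_TOKEN" \
  "$DATABRICKS_HOST/api/2.0/git-credentials" \
  -d '{
        "git_provider": "azureDevOpsServices",
        "git_username": "<user name configured for the service principal>",
        "personal_access_token": "<Azure DevOps PAT for the service principal>"
      }'
```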
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/__init__.py
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
# Grab variables from the specific variable group and
# determine sourceBranchName (avoids SourceBranchName=merge
# for PRs)
variables:
  - group: 'Nutter Testing'
  - name: 'branchName'
    ${{ if startsWith(variables['Build.SourceBranch'], 'refs/heads/') }}:
      value: $[ replace(variables['Build.SourceBranch'], 'refs/heads/', '') ]
    ${{ if startsWith(variables['Build.SourceBranch'], 'refs/pull/') }}:
      value: $[ replace(variables['System.PullRequest.SourceBranch'], 'refs/heads/', '') ]

trigger:
  batch: true
  branches:
    include:
      - '*'
  paths:
    exclude:
      - README.md
      - LICENSE
      - images
      - terraform
      - .github

#  tags:
#    include:
#      - v*.*
#      - prod

# This needs additional debugging
# pr:
#   branches:
#     include:
#       - master
#       - releases
#   paths:
#     exclude:
#       - README.md
#       - images

stages:
  - stage: onPush
    condition: |
      and(
        ne(variables['Build.SourceBranch'], 'refs/heads/releases'),
        not(startsWith(variables['Build.SourceBranch'], 'refs/tags/v'))
      )
    jobs:
      - job: onPushJob
        pool:
          vmImage: 'ubuntu-20.04'

        steps:
          - script: env | sort
            displayName: 'Environment / Context'

          - task: UsePythonVersion@0
            displayName: 'Use Python 3.8'
            inputs:
              versionSpec: 3.8

          - checkout: self
            displayName: 'Checkout & Build.Reason: $(Build.Reason) & Build.SourceBranchName: $(Build.SourceBranchName)'

          - script: |
              python -m pip install pip nutter
              # this is because of the old dependency inside Nutter
              python -m pip install --upgrade databricks-cli
            displayName: 'Install dependencies'

          # https://docs.databricks.com/dev-tools/api/latest/repos.html
          # this is a simplification, and won't work with concurrent commits. Ideally it should be a
          # separate repo for each commit
          - script: |
              echo "Checking out the $(branchName) branch"
              databricks repos update --path $(STAGING_DIRECTORY) --branch "$(branchName)"
            env:
              DATABRICKS_HOST: $(DATABRICKS_HOST)
              DATABRICKS_TOKEN: $(DATABRICKS_TOKEN)
            displayName: 'Update Staging project'

          - script: |
              nutter run "$(STAGING_DIRECTORY)/unit-tests/" --cluster_id $(CLUSTER_ID) --recursive --junit_report --timeout 500
            env:
              DATABRICKS_HOST: $(DATABRICKS_HOST)
              DATABRICKS_TOKEN: $(DATABRICKS_TOKEN)
            displayName: 'Execute Nutter tests'

          - task: PublishTestResults@2
            condition: succeededOrFailed()
            inputs:
              testResultsFormat: 'JUnit'
              testResultsFiles: '**/test-*.xml'
              failTaskOnFailedTests: true

  - stage: onRelease
    condition: |
      eq(variables['Build.SourceBranch'], 'refs/heads/releases')
    jobs:
      - job: onReleaseJob
        pool:
          vmImage: 'ubuntu-20.04'

        steps:
          - script: env | sort
            displayName: 'Environment / Context'

          - task: UsePythonVersion@0
            displayName: 'Use Python 3.8'
            inputs:
              versionSpec: 3.8

          - checkout: self
            persistCredentials: true
            clean: true
            displayName: 'Checkout & Build.Reason: $(Build.Reason) & Build.SourceBranchName: $(Build.SourceBranchName)'

          - script: |
              python -m pip install --upgrade pip nutter
              # this is because of the old dependency inside Nutter
              python -m pip install --upgrade databricks-cli
            displayName: 'Install dependencies'

          - script: |
              echo "Checking out the releases branch"
              databricks repos update --path $(STAGING_DIRECTORY) --branch "$(Build.SourceBranchName)"
            env:
              DATABRICKS_HOST: $(DATABRICKS_HOST)
              DATABRICKS_TOKEN: $(DATABRICKS_TOKEN)
            displayName: 'Update Staging repository'

          # We can do a separate set of the tests for release branches
          - script: |
              nutter run "$(STAGING_DIRECTORY)/unit-tests/" --cluster_id $(CLUSTER_ID) --recursive --junit_report --timeout 500
            env:
              DATABRICKS_HOST: $(DATABRICKS_HOST)
              DATABRICKS_TOKEN: $(DATABRICKS_TOKEN)
            displayName: 'Execute Nutter tests on release'

          - task: PublishTestResults@2
            condition: succeededOrFailed()
            inputs:
              testResultsFormat: 'JUnit'
              testResultsFiles: '**/test-*.xml'
              failTaskOnFailedTests: true
--------------------------------------------------------------------------------
/images/cicd-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/cicd-workflow.png
--------------------------------------------------------------------------------
/images/create-personal-project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/create-personal-project.png
--------------------------------------------------------------------------------
/images/create-project-in-staging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/create-project-in-staging.png
--------------------------------------------------------------------------------
/images/create-staging-folder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/create-staging-folder.png
--------------------------------------------------------------------------------
/images/release-pipeline-devops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/release-pipeline-devops.png
--------------------------------------------------------------------------------
/images/release-pipeline-github-actions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/release-pipeline-github-actions.png
--------------------------------------------------------------------------------
/images/release-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/release-pipeline.png
--------------------------------------------------------------------------------
/images/release-tasks-devops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/release-tasks-devops.png
--------------------------------------------------------------------------------
/my_package/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/my_package/__init__.py
--------------------------------------------------------------------------------
/my_package/code1.py:
--------------------------------------------------------------------------------
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import DataFrame

def generate_data1(spark, n=1000, name='my_cool_data'):
    df = spark.range(0, n)
    df.createOrReplaceTempView(name)

def upper_columns(df: DataFrame, cols: list) -> DataFrame:
    new_cols = []
    for field in df.schema.fields:
        if field.dataType == T.StringType() and field.name in cols:
            new_cols.append(F.upper(F.col(field.name)).alias(field.name))
        else:
            new_cols.append(F.col(field.name))

    return df.select(*new_cols)

def lower_columns(df: DataFrame, cols: list) -> DataFrame:
    new_cols = []
    for field in df.schema.fields:
        if field.dataType == T.StringType() and field.name in cols:
            new_cols.append(F.lower(F.col(field.name)).alias(field.name))
        else:
            new_cols.append(F.col(field.name))

    return df.select(*new_cols)
--------------------------------------------------------------------------------
/my_package/code2.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession

def generate_data2(table_name="my_data"):
    df = SparkSession.getActiveSession().range(0, 10)
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)
--------------------------------------------------------------------------------
/terraform/azuredevops/README.md:
--------------------------------------------------------------------------------
This directory contains Terraform code to set up a Nutter demo project in Azure DevOps. To do that, you need to create a file `terraform.tfvars` with the following variables (see the sketch at the end of this README):

* `devops_org_url` - URL of your Azure DevOps instance, like `https://dev.azure.com/company_name`.
* `devops_pat` - Azure DevOps personal access token (PAT) obtained as described in the [documentation](https://registry.terraform.io/providers/microsoft/azuredevops/latest/docs/guides/authenticating_using_the_personal_access_token). This PAT will be used to create a project in your Azure DevOps organization, and will be set inside the Databricks workspace.
* `devops_user_name` - your user name inside the Azure DevOps organization.

Then execute `terraform apply` (use `terraform plan` to understand what changes will be made).

The code performs the following actions:

* Creates a new project inside the Azure DevOps organization. The default name is `NutterDemoProject`, and it can be changed by setting the `devops_project_name` variable.
* Creates a new Azure DevOps Git repository by cloning this demo.
* Sets up a Git credential in the Databricks workspace using the provided Azure DevOps PAT.
* Creates 3 Databricks checkouts in the current user's folder with the names `nutter-tf-dev`, `nutter-tf-staging`, and `nutter-tf-prod` to emulate the transition between different stages.
* Creates a Databricks cluster that will be used to execute the tests.
* Creates a temporary Databricks personal access token (PAT) that will be used to authenticate to the Databricks workspace from the build pipeline.
* Creates an Azure DevOps variable group that contains all parameters that are used by the build pipeline.
* Creates an Azure DevOps build pipeline using the `azure-pipelines.yml` file from the cloned repository.

After the code is executed, you will have fully configured repositories & a build pipeline. Follow the instructions from the [top-level README](../../README.md) to set up the release pipeline.


Limitations:

* This code doesn't set up the release pipeline, as no corresponding functionality is available.
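
For illustration, a minimal `terraform.tfvars` could be created as in the sketch below - all values are placeholders that you have to replace with your own organization URL, PAT, and user name:

```sh
# Writes a terraform.tfvars with placeholder values into the current directory
cat > terraform.tfvars <<'EOF'
devops_org_url   = "https://dev.azure.com/company_name"
devops_pat       = "<Azure DevOps personal access token>"
devops_user_name = "user@company_name.com"
EOF
```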
--------------------------------------------------------------------------------
/terraform/azuredevops/cluster.tf:
--------------------------------------------------------------------------------
data "databricks_node_type" "smallest" {
  local_disk = true
}

data "databricks_spark_version" "latest_lts" {
  long_term_support = true
}

resource "databricks_cluster" "nutter_demo" {
  cluster_name            = "Nutter demo (${data.databricks_current_user.me.alphanumeric})"
  spark_version           = data.databricks_spark_version.latest_lts.id
  node_type_id            = data.databricks_node_type.smallest.id
  autotermination_minutes = 20
  spark_conf = {
    # Single-node
    "spark.databricks.cluster.profile" : "singleNode"
    "spark.master" : "local[*]"
  }

  custom_tags = {
    "ResourceClass" = "SingleNode"
  }

  library {
    pypi {
      package = "nutter"
    }
  }
  library {
    pypi {
      package = "chispa"
    }
  }
}
--------------------------------------------------------------------------------
/terraform/azuredevops/devops.tf:
--------------------------------------------------------------------------------
resource "azuredevops_project" "project" {
  name            = var.devops_project_name
  description     = "Test Project for Nutter demo"
  visibility      = "private"
  version_control = "Git"
}

resource "azuredevops_git_repository" "repository" {
  project_id = azuredevops_project.project.id
  name       = "NutteronDevOps"
  initialization {
    init_type   = "Import"
    source_type = "Git"
    source_url  = "https://github.com/alexott/databricks-nutter-repos-demo"
  }
}

resource "azuredevops_build_definition" "build" {
  project_id = azuredevops_project.project.id
  name       = "Nutter Build Pipeline"

  ci_trigger {
    use_yaml = true
  }

  repository {
    repo_type   = "TfsGit"
    repo_id     = azuredevops_git_repository.repository.id
    branch_name = azuredevops_git_repository.repository.default_branch
    yml_path    = "azure-pipelines.yml"
  }

  variable_groups = [azuredevops_variable_group.vg.id]
}

resource "databricks_token" "pat_for_devops" {
  comment          = "Azure DevOps Nutter demo (10 days)"
  lifetime_seconds = 864000
}

resource "azuredevops_variable_group" "vg" {
  project_id   = azuredevops_project.project.id
  name         = "Nutter Testing"
  description  = "Variable group for build job"
  allow_access = true

  variable {
    name  = "databricks_host"
    value = data.databricks_current_user.me.workspace_url
  }

  variable {
    name         = "databricks_token"
    secret_value = databricks_token.pat_for_devops.token_value
    is_secret    = true
  }

  variable {
    name  = "cluster_id"
    value = databricks_cluster.nutter_demo.id
  }

  variable {
    name  = "staging_directory"
    value = local.staging_repo_path
  }

}
--------------------------------------------------------------------------------
/terraform/azuredevops/main.tf:
--------------------------------------------------------------------------------
terraform {
  required_providers {
    azuredevops = {
      source  = "microsoft/azuredevops"
      version = "0.2.1"
    }
    databricks = {
      source  = "databrickslabs/databricks"
      version = "0.5.8"
    }
  }
}

# https://registry.terraform.io/providers/microsoft/azuredevops/latest/docs

provider "azuredevops" {
  org_service_url       = var.devops_org_url
  personal_access_token = var.devops_pat
}

provider "databricks" {
}
--------------------------------------------------------------------------------
/terraform/azuredevops/repo.tf:
--------------------------------------------------------------------------------
data "databricks_current_user" "me" {
}

locals {
  staging_repo_path = "${data.databricks_current_user.me.repos}/nutter-tf-staging"
}

resource "databricks_git_credential" "global" {
  git_provider          = "azureDevOpsServices"
  git_username          = var.devops_user_name
  personal_access_token = var.devops_pat
  force                 = true
}

resource "databricks_repo" "nutter_in_user_home" {
  depends_on = [databricks_git_credential.global]
  url        = azuredevops_git_repository.repository.remote_url
  path       = "${data.databricks_current_user.me.repos}/nutter-tf-dev"
  branch     = "releases"
}

resource "databricks_repo" "nutter_in_staging" {
  depends_on = [databricks_git_credential.global]
  url        = azuredevops_git_repository.repository.remote_url
  path       = local.staging_repo_path
  branch     = "releases"
}

resource "databricks_repo" "nutter_in_prod" {
  depends_on = [databricks_git_credential.global]
  url        = azuredevops_git_repository.repository.remote_url
  path       = "${data.databricks_current_user.me.repos}/nutter-tf-prod"
  branch     = "releases"
}
--------------------------------------------------------------------------------
/terraform/azuredevops/variables.tf:
--------------------------------------------------------------------------------
variable "devops_org_url" {
  description = "DevOps URL"
  type        = string
}


variable "devops_pat" {
  description = "DevOps PAT"
  type        = string
}

variable "devops_user_name" {
  description = "DevOps User Name"
  type        = string
}

variable "devops_project_name" {
  description = "Project name in Azure DevOps"
  type        = string
  default     = "NutterDemoProject"
}
--------------------------------------------------------------------------------
/unit-tests/test_with_arbitrary_files.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %pip install -U nutter chispa

# COMMAND ----------

# MAGIC %load_ext autoreload
# MAGIC %autoreload 2

# COMMAND ----------

from my_package.code1 import *  # instead of %run ./Code1
from my_package.code2 import *  # instead of %run ./Code2

# COMMAND ----------

# https://github.com/microsoft/nutter
from runtime.nutterfixture import NutterFixture, tag
# https://github.com/MrPowers/chispa
from chispa.dataframe_comparer import *

class TestFixtureArbitraryFiles(NutterFixture):
    def __init__(self):
        self.code2_table_name = "my_data"
        self.code1_view_name = "my_cool_data"
        self.code1_num_entries = 100
        NutterFixture.__init__(self)

    def run_code1_arbitrary_files(self):
        generate_data1(spark, n = self.code1_num_entries, name = self.code1_view_name)

    def assertion_code1_arbitrary_files(self):
        df = spark.read.table(self.code1_view_name)
        assert(df.count() == self.code1_num_entries)

    def run_code2_arbitrary_files(self):
        generate_data2(table_name = self.code2_table_name)

    def assertion_code2_arbitrary_files(self):
        some_tbl = spark.sql(f'SELECT COUNT(*) AS total FROM {self.code2_table_name}')
        first_row = some_tbl.first()
        assert (first_row[0] == 10)

    def after_code2_arbitrary_files(self):
        spark.sql(f"drop table {self.code2_table_name}")

    # we're using the Chispa library here to compare the content of the processed dataframe with expected results
    def assertion_upper_columns_arbitrary_files(self):
        cols = ["col1", "col2", "col3"]
        df = spark.createDataFrame([("abc", "cef", 1)], cols)
        upper_df = upper_columns(df, cols)
        expected_df = spark.createDataFrame([("ABC", "CEF", 1)], cols)
        assert_df_equality(upper_df, expected_df)

    def assertion_lower_columns_arbitrary_files(self):
        cols = ["col1", "col2", "col3"]
        df = spark.createDataFrame([("Abc", "Cef", 1)], cols)
        lower_df = lower_columns(df, cols)
        expected_df = spark.createDataFrame([("abc", "cef", 1)], cols)
        assert_df_equality(lower_df, expected_df)

# COMMAND ----------

result = TestFixtureArbitraryFiles().execute_tests()
print(result.to_string())
is_job = dbutils.notebook.entry_point.getDbutils().notebook().getContext().currentRunId().isDefined()
if is_job:
    result.exit(dbutils)
--------------------------------------------------------------------------------
/unit-tests/test_with_percent_run.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %pip install -U nutter chispa

# COMMAND ----------

# MAGIC %run ../Code1

# COMMAND ----------

# MAGIC %run ../Code2

# COMMAND ----------

# https://github.com/microsoft/nutter
from runtime.nutterfixture import NutterFixture, tag
# https://github.com/MrPowers/chispa
from chispa.dataframe_comparer import *


class TestPercentRunFixture(NutterFixture):
    def __init__(self):
        self.code2_table_name = "my_data"
        self.code1_view_name = "my_cool_data"
        self.code1_num_entries = 100
        NutterFixture.__init__(self)

    def run_code1_percent_run(self):
        generate_data1(n = self.code1_num_entries, name = self.code1_view_name)

    def assertion_code1_percent_run(self):
        df = spark.read.table(self.code1_view_name)
        assert(df.count() == self.code1_num_entries)

    def run_code2_percent_run(self):
        generate_data2(table_name = self.code2_table_name)

    def assertion_code2_percent_run(self):
        some_tbl = spark.sql(f'SELECT COUNT(*) AS total FROM {self.code2_table_name}')
        first_row = some_tbl.first()
        assert (first_row[0] == 10)

    def after_code2_percent_run(self):
        spark.sql(f"drop table {self.code2_table_name}")

    # we're using the Chispa library here to compare the content of the processed dataframe with expected results
    def assertion_upper_columns_percent_run(self):
        cols = ["col1", "col2", "col3"]
        df = spark.createDataFrame([("abc", "cef", 1)], cols)
        upper_df = upper_columns(df, cols)
        expected_df = spark.createDataFrame([("ABC", "CEF", 1)], cols)
        assert_df_equality(upper_df, expected_df)

# COMMAND ----------

result = TestPercentRunFixture().execute_tests()
print(result.to_string())
is_job = dbutils.notebook.entry_point.getDbutils().notebook().getContext().currentRunId().isDefined()
if is_job:
    result.exit(dbutils)

# COMMAND ----------

# just add the code
--------------------------------------------------------------------------------