├── .github
│   └── workflows
│       └── cicd.yml
├── .gitignore
├── Code1.py
├── Code2.py
├── LICENSE
├── README.md
├── __init__.py
├── azure-pipelines.yml
├── images
│   ├── cicd-workflow.png
│   ├── create-personal-project.png
│   ├── create-project-in-staging.png
│   ├── create-staging-folder.png
│   ├── release-pipeline-devops.png
│   ├── release-pipeline-github-actions.png
│   ├── release-pipeline.png
│   └── release-tasks-devops.png
├── my_package
│   ├── __init__.py
│   ├── code1.py
│   └── code2.py
├── terraform
│   └── azuredevops
│       ├── README.md
│       ├── cluster.tf
│       ├── devops.tf
│       ├── main.tf
│       ├── repo.tf
│       └── variables.tf
└── unit-tests
    ├── test_with_arbitrary_files.py
    └── test_with_percent_run.py

/.github/workflows/cicd.yml:
--------------------------------------------------------------------------------
name: CICD
on:
  push:
    paths-ignore:
      - README.md
      - LICENSE
      - images
      - terraform
      - azure-pipelines.yml

env:
  DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
  DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
  REPO_DIRECTORY: ${{ vars.REPO_DIRECTORY }}
  CLUSTER_ID: ${{ vars.CLUSTER_ID }}

permissions:
  checks: write

jobs:
  OnPush:
    if: github.ref_name == 'dev'
    environment: dev
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.8

      - name: Install libraries
        run: |
          pip install --upgrade pip nutter

      - name: install-databricks-cli
        uses: microsoft/install-databricks-cli@v1.0.0

      - name: Update databricks repo
        run: |
          databricks repos update --path $REPO_DIRECTORY --branch "${{ github.ref_name }}"

      - name: Run Tests
        run: |
          nutter run "$REPO_DIRECTORY/unit-tests/" --cluster_id $CLUSTER_ID --recursive --junit_report --timeout 500

      - name: Publish Test Report
        uses: mikepenz/action-junit-report@v3
        if: success() || failure()    # always run, even if the previous step fails
        with:
          report_paths: 'test-*.xml'

  OnRelease:
    if: startsWith(github.ref_name, 'releases')
    environment: staging
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.8

      - name: Install libraries
        run: |
          pip install --upgrade pip nutter

      - name: install-databricks-cli
        uses: microsoft/install-databricks-cli@v1.0.0

      - name: Update databricks repo
        run: |
          databricks repos update --path $REPO_DIRECTORY --branch "${{ github.ref_name }}"

      - name: Run Tests
        run: |
          pwd
          nutter run "$REPO_DIRECTORY/unit-tests/" --cluster_id $CLUSTER_ID --recursive --junit_report --timeout 500

      - name: Publish Test Report
        uses: mikepenz/action-junit-report@v3
        if: success() || failure()    # always run, even if the previous step fails
        with:
          report_paths: 'test-*.xml'

  DeployProduction:
    environment: prod
    runs-on: ubuntu-latest
    needs: [OnRelease]
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.8

      - name: Install libraries
        run: |
          pip install --upgrade pip nutter

      - name: install-databricks-cli
        uses: microsoft/install-databricks-cli@v1.0.0

      - name: Update databricks repo
        run: |
          databricks repos update --path $REPO_DIRECTORY --branch "${{ github.ref_name }}"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*~
/Release using Projects.json
/test-*.xml
/.DS_Store
/images/.DS_Store
/out
.terraform/
.terraform.lock.hcl
terraform.tfvars
terraform.tfstate
terraform.tfstate.backup
--------------------------------------------------------------------------------
/Code1.py:
--------------------------------------------------------------------------------
# Databricks notebook source
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import DataFrame, SparkSession

# COMMAND ----------

def generate_data1(n=1000, name='my_cool_data'):
    df = SparkSession.getActiveSession().range(0, n)
    df.createOrReplaceTempView(name)

# COMMAND ----------

def upper_columns(df: DataFrame, cols: list) -> DataFrame:
    new_cols = []
    for field in df.schema.fields:
        if field.dataType == T.StringType() and field.name in cols:
            new_cols.append(F.upper(F.col(field.name)).alias(field.name))
        else:
            new_cols.append(F.col(field.name))

    return df.select(*new_cols)

# COMMAND ----------

# add the comment
--------------------------------------------------------------------------------
/Code2.py:
--------------------------------------------------------------------------------
# Databricks notebook source
from pyspark.sql import SparkSession


def generate_data2(table_name="my_data"):
    df = SparkSession.getActiveSession().range(0, 10)
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)

# COMMAND ----------

# just add code....
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2021 Alex Ott

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This repository contains notebooks & instructions for setting up a demo of a development workflow & CI/CD (on Azure DevOps) using Databricks notebooks and the [Repos feature](https://docs.databricks.com/repos.html). Testing of notebooks is done using the [Nutter library](https://github.com/microsoft/nutter) developed by Microsoft.

Two approaches are demonstrated:
1. Using notebooks & including the code with `%run` ([doc](https://docs.databricks.com/notebooks/notebooks-use.html#run)) - the "main" code is in the notebooks `Code1.py` and `Code2.py`, and the testing code is in `unit-tests/test_with_percent_run.py`.
1. Using a notebook for the test itself, but including the main code as a Python package via the [arbitrary files in Repos](https://docs.databricks.com/repos.html#work-with-non-notebook-files-in-a-databricks-repo) functionality (DBR 9.1+). The main code is in the `my_package/code1.py` and `my_package/code2.py` files, and the test is in `unit-tests/test_with_arbitrary_files.py`.

This demo shows how you can use Repos to work on your own copy of the notebooks, test them after each commit in a "staging" environment, and promote them to "production" after successful testing of the `releases` branch.

* [The workflow](#the-workflow)
* [Setup on Databricks side](#setup-on-databricks-side)
* [Setup Azure DevOps pipelines](#setup-azure-devops-pipelines)
  * [Create variables group to keep common configuration](#create-variables-group-to-keep-common-configuration)
  * [Create a build pipeline](#create-a-build-pipeline)
  * [Create a release pipeline](#create-a-release-pipeline)
* [FAQ & Troubleshooting](#faq--troubleshooting)
  * [I'm getting "Can’t find repo ID for /Repos/..." when trying to update a repo](#im-getting-cant-find-repo-id-for-repos-when-trying-to-update-a-repo)
  * [I'm getting "Error fetching repo ID for ... Unauthorized access to Org..."](#im-getting-error-fetching-repo-id-for--unauthorized-access-to-org)
  * [How can I perform Repos operations using the service principal?](#how-can-i-perform-repos-operations-using-the-service-principal)

This demo can also be set up automatically with Terraform. Look into the [terraform](terraform) folder for existing implementations.


# The workflow

The development workflow is organized as shown in the following image:

![Development workflow](images/cicd-workflow.png)

1. A developer works on the code in a separate environment (personal checkout on Databricks, etc.). When code changes are done, they are committed into some branch.
1. The CI/CD implementation (Azure DevOps here) picks up the changes and tests them in a staging environment by executing the "build pipeline". This consists of several steps (see [azure-pipelines.yml](azure-pipelines.yml) for technical details, and the command sketch after this list):
   * Update the repository checkout in the "Staging" folder
   * Execute the tests with the updated code
   * Publish the test results
1. In the current setup there are different jobs for "normal" branches and for the "release" branch (`releases` in this setup); this allows running different sets of tests when preparing a release.
1. If a commit is made to the "release" branch and there are no test failures, the "release pipeline" is triggered, and it updates the production environment by updating the repository checkout in the "Production" folder.
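
For orientation, the staging part of the build pipeline essentially boils down to two invocations - updating the checkout with the `databricks-cli` and running the tests with Nutter. The sketch below uses the variable names from [azure-pipelines.yml](azure-pipelines.yml); `BRANCH_NAME` stands in for the pipeline's `branchName` variable, and `DATABRICKS_HOST`/`DATABRICKS_TOKEN` are assumed to be set in the environment:

```sh
# Point the staging checkout at the branch that triggered the build
databricks repos update --path "$STAGING_DIRECTORY" --branch "$BRANCH_NAME"

# Run all Nutter tests from the unit-tests folder on an existing cluster,
# producing a JUnit report that the pipeline publishes afterwards
nutter run "$STAGING_DIRECTORY/unit-tests/" --cluster_id "$CLUSTER_ID" \
    --recursive --junit_report --timeout 500
```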


# Setup on Databricks side

Your Databricks workspace needs to have the Repos functionality enabled. If it's enabled, you should see the "Repos" icon in the navigation panel.

* Fork the repository into your environment - GitHub, or Azure DevOps (follow the Databricks documentation on using it)
* In the Repos, click "Create Repo" and link it to the Git repository that you've forked - this will be your personal copy of the code that will be used for work:

![Create a personal repo](images/create-personal-project.png)

* Create the staging & production checkouts
  * In the Repos, in the top-level part, click on the "ᐯ" near the "Repos" header, select "Create" and select "Folder" (see image). Give it the name "Staging":

![Create a staging folder](images/create-staging-folder.png)

  * Click on the "ᐯ" near the "Staging" folder, click "Create" and select "Repo":

![Create a staging repository](images/create-project-in-staging.png)

  * Link it to the Git repository, similarly to how you did it for your personal checkout
  * Create the "Production" folder with a repository inside, repeating the two previous steps
* Create a new cluster that will be used for execution of the tests; you will need to pass the [cluster ID](https://docs.databricks.com/workspace/workspace-details.html#cluster-url-and-id) to Nutter to execute the tests
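
The cluster ID can also be looked up from the command line with the same `databricks-cli` that the pipelines use - a small sketch, assuming the CLI authenticates via the usual environment variables (the host below is just the example workspace URL used later in this README):

```sh
export DATABRICKS_HOST="https://adb-1568830229861029.9.azuredatabricks.net"   # example workspace URL
export DATABRICKS_TOKEN="<personal access token>"
# Prints cluster IDs, names and states; pick the ID of the cluster created for the tests
databricks clusters list
```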


# Setup Azure DevOps pipelines

The Azure DevOps setup consists of several steps, described in the next sections. It's assumed that a project in Azure DevOps already exists.

We need to create a [personal access token (PAT)](https://docs.databricks.com/administration-guide/access-control/tokens.html) that will be used for execution of the tests & updating the repository. This token is used to authenticate to the Databricks workspace, which then uses its configured Git token to authenticate to the Git provider. We also need to connect the Databricks workspace to the Git provider - usually this is done using provider-specific access tokens - see the [documentation](https://docs.databricks.com/repos.html#configure-your-git-integration-with-databricks) for details on setting up the integration with a specific Git provider (**note that when the repository is on Azure DevOps, you still need to generate an Azure DevOps token to make the API work**, and also provide the user name in the Git settings).

> :warning: the previous instructions on using Repos + Azure DevOps with service principals weren't correct, so they were removed!

### Create variables group to keep common configuration

Because we have several pipelines, it makes sense to define a [variable group](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups) to store the data that is necessary for execution of tests & deployment of the code. We need the following configuration properties for execution of our pipelines:

* `databricks_host` - the [URL of your workspace](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) where tests will be executed (host name with `https://`, without `?o=`, and **without a trailing slash character**. For example: `https://adb-1568830229861029.9.azuredatabricks.net`).
* `databricks_token` - personal access token for executing commands against the workspace. Mark this variable as private!
* `cluster_id` - the ID of the cluster where tests will be executed. DBR 9.1+ should be used to support arbitrary files.
* `staging_directory` - the directory of the staging checkout that we created above. For example, `/Repos/Staging/databricks-nutter-repos-demo`.

The name of the variable group is used in [azure-pipelines.yml](azure-pipelines.yml). By default its name is "Nutter Testing". Change [azure-pipelines.yml](azure-pipelines.yml) if you use another name for the variable group.

### Create a build pipeline

Azure DevOps can work with GitHub repositories as well - see the [documentation](https://docs.microsoft.com/en-us/azure/devops/pipelines/repos/github) for more details on how to link DevOps with GitHub.

* In Azure DevOps, in the Pipelines section, select Pipelines, and click "New pipeline"
* Select GitHub and the repository
* In the "Configure" step select "Existing Azure Pipelines YAML file" and specify the name of the existing file: [azure-pipelines.yml](azure-pipelines.yml)
* Save the pipeline


### Create a release pipeline

* In Azure DevOps, in the Pipelines section, select Releases, and click "New release pipeline"
* Select "Empty Job" in the dialog
* In the Stage dialog enter some meaningful name for it
* In the "Variables" tab, link the variable group that was created previously
* Configure the job & task:
  * Configure the agent - in the "Agent Specification" select "ubuntu-18.04"
  * Click on "+" and find the "Command line" task
  * Enter the following code that will connect to the production environment & update the checkout of the repository (via the [Repos REST API](https://docs.databricks.com/dev-tools/api/latest/repos.html)):

```sh
python -m pip install --upgrade databricks-cli
databricks repos update --path /Repos/Production/databricks-nutter-repos-demo --branch releases
```

  * Below the code, add the environment variable `DATABRICKS_TOKEN` with the value `$(DATABRICKS_TOKEN)` - this will pull it from the variable group into the script's execution context
  * Save the task & job
* We also need to configure an artifact:
  * Click on "Add artifact", select the project and the source (the name of the build pipeline). Also, for "Default version" select "Latest from specific branch with tags" and select the "releases" branch. Click on "Add" to add the artifact into the pipeline
  * Click on the "⚡" icon to configure continuous deployment (by default, a release is triggered manually). Add a branch filter and select the `releases` branch
* Your release pipeline should look as follows:

![Release pipeline](images/release-pipeline.png)

* Save the pipeline

After all of this is done, the release pipeline will be automatically executed on every successful build in the `releases` branch.
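
The release task above uses the `databricks-cli`; the same update can also be performed by calling the [Repos REST API](https://docs.databricks.com/dev-tools/api/latest/repos.html) directly. A hedged sketch - the numeric repo ID below is a placeholder that you first have to look up:

```sh
# Look up the repo ID of the production checkout
curl -s -H "Authorization: Bearer $DATABRICKS_TOKEN" \
  "$DATABRICKS_HOST/api/2.0/repos?path_prefix=/Repos/Production/databricks-nutter-repos-demo"

# Switch that checkout to the releases branch (replace 123456789 with the real repo ID)
curl -s -X PATCH -H "Authorization: Bearer $DATABRICKS_TOKEN" \
  "$DATABRICKS_HOST/api/2.0/repos/123456789" \
  -d '{"branch": "releases"}'
```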

## Github Actions Workflow

* We need to create a [personal access token (PAT)](https://docs.databricks.com/administration-guide/access-control/tokens.html) that will be used for execution of the tests & updating the repository. This token is used to authenticate to the Databricks workspace, which then uses its configured Git token to authenticate to the Git provider. We also need to connect the Databricks workspace to the Git provider - usually this is done using provider-specific access tokens - see the [documentation](https://docs.databricks.com/repos.html#configure-your-git-integration-with-databricks) for details on setting up the integration with a specific Git provider.

* Create dev, staging and prod [Environments](https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment) in the GitHub settings. With environments it is easy to use the same variable names and secret names across different environments.

Create the following properties within each environment:

* `databricks_host` - the [URL of your workspace](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) where tests will be executed (host name with `https://`, without `?o=`, and **without a trailing slash character**. For example: `https://adb-1568830229861029.9.azuredatabricks.net`).
* `databricks_token` - personal access token for executing commands against the workspace. Create this as a secret variable!
* `cluster_id` - the ID of the cluster where tests will be executed. DBR 9.1+ should be used to support arbitrary files.
* `repo_directory` - the directory of the checkout for the specific environment. For example, `/Repos/Staging/databricks-nutter-repos-demo`.

The workflow is the same as above and the pipeline looks as follows:

![Release pipeline](images/release-pipeline-github-actions.png)

# FAQ & Troubleshooting

## I'm getting "Can’t find repo ID for /Repos/..." when trying to update a repo

This often happens when you're trying to use `databricks repos update` against a workspace that has IP Access Lists enabled. The error message is misleading, and will be fixed by [this pull request](https://github.com/databricks/databricks-cli/pull/428).

## I'm getting "Error fetching repo ID for ... Unauthorized access to Org..."

This usually happens when you're trying to run a CI/CD pipeline against a Databricks workspace with IP Access Lists enabled, and the CI/CD server is not in the allow list.

## How can I perform Repos operations using the service principal?

To perform operations on Repos (update, etc.) we need to associate a Git token with the identity that performs that operation. Please see the following documentation:
* [CICD with SPNs](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/ci-cd/ci-cd-sp)
* [Git Credentials REST API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/gitcredentials)
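
For illustration, the key step is a single call to the [Git Credentials REST API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/gitcredentials) performed *as the service principal* (i.e. `DATABRICKS_TOKEN` below must hold a token that belongs to the SPN, not to your own user); the user name and the Azure DevOps PAT are placeholders for whatever is configured for that principal:

```sh
# Registers a Git credential for the identity that owns DATABRICKS_TOKEN,
# so that subsequent Repos operations by that identity can reach the Git provider
curl -s -X POST -H "Authorization: Bearer $DATABRICKS_TOKEN" \
  "$DATABRICKS_HOST/api/2.0/git-credentials" \
  -d '{
        "git_provider": "azureDevOpsServices",
        "git_username": "<user name configured for the service principal>",
        "personal_access_token": "<Azure DevOps PAT for the service principal>"
      }'
```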
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/__init__.py
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
# Grab variables from the specific variable group and
# determine sourceBranchName (avoids SourceBranchName=merge
# for PRs)
variables:
  - group: 'Nutter Testing'
  - name: 'branchName'
    ${{ if startsWith(variables['Build.SourceBranch'], 'refs/heads/') }}:
      value: $[ replace(variables['Build.SourceBranch'], 'refs/heads/', '') ]
    ${{ if startsWith(variables['Build.SourceBranch'], 'refs/pull/') }}:
      value: $[ replace(variables['System.PullRequest.SourceBranch'], 'refs/heads/', '') ]

trigger:
  batch: true
  branches:
    include:
      - '*'
  paths:
    exclude:
      - README.md
      - LICENSE
      - images
      - terraform
      - .github

#  tags:
#    include:
#      - v*.*
#      - prod

# This needs additional debugging
# pr:
#   branches:
#     include:
#       - master
#       - releases
#   paths:
#     exclude:
#       - README.md
#       - images

stages:
  - stage: onPush
    condition: |
      and(
        ne(variables['Build.SourceBranch'], 'refs/heads/releases'),
        not(startsWith(variables['Build.SourceBranch'], 'refs/tags/v'))
      )
    jobs:
      - job: onPushJob
        pool:
          vmImage: 'ubuntu-20.04'

        steps:
          - script: env | sort
            displayName: 'Environment / Context'

          - task: UsePythonVersion@0
            displayName: 'Use Python 3.8'
            inputs:
              versionSpec: 3.8

          - checkout: self
            displayName: 'Checkout & Build.Reason: $(Build.Reason) & Build.SourceBranchName: $(Build.SourceBranchName)'

          - script: |
              python -m pip install pip nutter
              # this is because of the old dependency inside Nutter
              python -m pip install --upgrade databricks-cli
            displayName: 'Install dependencies'

          # https://docs.databricks.com/dev-tools/api/latest/repos.html
          # this is a simplification, and won't work with concurrent commits. Ideally it should be a
          # separate repo for each commit
          - script: |
              echo "Checking out the $(branchName) branch"
              databricks repos update --path $(STAGING_DIRECTORY) --branch "$(branchName)"
            env:
              DATABRICKS_HOST: $(DATABRICKS_HOST)
              DATABRICKS_TOKEN: $(DATABRICKS_TOKEN)
            displayName: 'Update Staging project'

          - script: |
              nutter run "$(STAGING_DIRECTORY)/unit-tests/" --cluster_id $(CLUSTER_ID) --recursive --junit_report --timeout 500
            env:
              DATABRICKS_HOST: $(DATABRICKS_HOST)
              DATABRICKS_TOKEN: $(DATABRICKS_TOKEN)
            displayName: 'Execute Nutter tests'

          - task: PublishTestResults@2
            condition: succeededOrFailed()
            inputs:
              testResultsFormat: 'JUnit'
              testResultsFiles: '**/test-*.xml'
              failTaskOnFailedTests: true

  - stage: onRelease
    condition: |
      eq(variables['Build.SourceBranch'], 'refs/heads/releases')
    jobs:
      - job: onReleaseJob
        pool:
          vmImage: 'ubuntu-20.04'

        steps:
          - script: env | sort
            displayName: 'Environment / Context'

          - task: UsePythonVersion@0
            displayName: 'Use Python 3.8'
            inputs:
              versionSpec: 3.8

          - checkout: self
            persistCredentials: true
            clean: true
            displayName: 'Checkout & Build.Reason: $(Build.Reason) & Build.SourceBranchName: $(Build.SourceBranchName)'

          - script: |
              python -m pip install --upgrade pip nutter
              # this is because of the old dependency inside Nutter
              python -m pip install --upgrade databricks-cli
            displayName: 'Install dependencies'

          - script: |
              echo "Checking out the releases branch"
              databricks repos update --path $(STAGING_DIRECTORY) --branch "$(Build.SourceBranchName)"
            env:
              DATABRICKS_HOST: $(DATABRICKS_HOST)
              DATABRICKS_TOKEN: $(DATABRICKS_TOKEN)
            displayName: 'Update Staging repository'

          # We can do a separate set of the tests for release branches
          - script: |
              nutter run "$(STAGING_DIRECTORY)/unit-tests/" --cluster_id $(CLUSTER_ID) --recursive --junit_report --timeout 500
            env:
              DATABRICKS_HOST: $(DATABRICKS_HOST)
              DATABRICKS_TOKEN: $(DATABRICKS_TOKEN)
            displayName: 'Execute Nutter tests on release'

          - task: PublishTestResults@2
            condition: succeededOrFailed()
            inputs:
              testResultsFormat: 'JUnit'
              testResultsFiles: '**/test-*.xml'
              failTaskOnFailedTests: true
--------------------------------------------------------------------------------
/images/cicd-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/cicd-workflow.png
--------------------------------------------------------------------------------
/images/create-personal-project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/create-personal-project.png
--------------------------------------------------------------------------------
/images/create-project-in-staging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/create-project-in-staging.png
--------------------------------------------------------------------------------
/images/create-staging-folder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/create-staging-folder.png
--------------------------------------------------------------------------------
/images/release-pipeline-devops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/release-pipeline-devops.png
--------------------------------------------------------------------------------
/images/release-pipeline-github-actions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/release-pipeline-github-actions.png
--------------------------------------------------------------------------------
/images/release-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/release-pipeline.png
--------------------------------------------------------------------------------
/images/release-tasks-devops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/images/release-tasks-devops.png
--------------------------------------------------------------------------------
/my_package/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/databricks-nutter-repos-demo/ddac5cc3183b3c336e1f77530582f1d4ec7a1642/my_package/__init__.py
--------------------------------------------------------------------------------
/my_package/code1.py:
--------------------------------------------------------------------------------
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import DataFrame

def generate_data1(spark, n=1000, name='my_cool_data'):
    df = spark.range(0, n)
    df.createOrReplaceTempView(name)

def upper_columns(df: DataFrame, cols: list) -> DataFrame:
    new_cols = []
    for field in df.schema.fields:
        if field.dataType == T.StringType() and field.name in cols:
            new_cols.append(F.upper(F.col(field.name)).alias(field.name))
        else:
            new_cols.append(F.col(field.name))

    return df.select(*new_cols)

def lower_columns(df: DataFrame, cols: list) -> DataFrame:
    new_cols = []
    for field in df.schema.fields:
        if field.dataType == T.StringType() and field.name in cols:
            new_cols.append(F.lower(F.col(field.name)).alias(field.name))
        else:
            new_cols.append(F.col(field.name))

    return df.select(*new_cols)
--------------------------------------------------------------------------------
/my_package/code2.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession

def generate_data2(table_name="my_data"):
    df = SparkSession.getActiveSession().range(0, 10)
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)
--------------------------------------------------------------------------------
/terraform/azuredevops/README.md:
--------------------------------------------------------------------------------
This directory contains Terraform code to set up a Nutter demo project in Azure DevOps. To do that, you need to create a file `terraform.tfvars` with the following variables (see the sketch at the end of this README):

* `devops_org_url` - URL of your Azure DevOps instance, like `https://dev.azure.com/company_name`.
* `devops_pat` - Azure DevOps personal access token (PAT) obtained as described in the [documentation](https://registry.terraform.io/providers/microsoft/azuredevops/latest/docs/guides/authenticating_using_the_personal_access_token). This PAT will be used to create a project in your Azure DevOps organization, and will be set inside the Databricks workspace.
* `devops_user_name` - your user name inside the Azure DevOps organization.

Then execute `terraform apply` (use `terraform plan` to understand what changes will be made).

The code performs the following actions:

* Creates a new project inside the Azure DevOps organization. The default name is `NutterDemoProject`, and it can be changed by setting the `devops_project_name` variable.
* Creates a new Azure DevOps Git repository by cloning this demo.
* Sets up a Git credential in the Databricks workspace using the provided Azure DevOps PAT.
* Creates 3 Databricks checkouts in the current user's folder with the names `nutter-tf-dev`, `nutter-tf-staging`, and `nutter-tf-prod` to emulate the transition between different stages.
* Creates a Databricks cluster that will be used to execute the tests.
* Creates a temporary Databricks personal access token (PAT) that will be used to authenticate to the Databricks workspace from the build pipeline.
* Creates an Azure DevOps variable group that contains all parameters that are used by the build pipeline.
* Creates an Azure DevOps build pipeline using the `azure-pipelines.yml` file from the cloned repository.

After the code is executed, you will have fully configured repositories & a build pipeline. Follow the instructions from the [top-level README](../../README.md) to set up the release pipeline.


Limitations:

* This code doesn't set up the release pipeline, as no corresponding functionality is available.
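
For illustration, a minimal `terraform.tfvars` could be created as in the sketch below - all values are placeholders that you have to replace with your own organization URL, PAT, and user name:

```sh
# Writes a terraform.tfvars with placeholder values into the current directory
cat > terraform.tfvars <<'EOF'
devops_org_url   = "https://dev.azure.com/company_name"
devops_pat       = "<Azure DevOps personal access token>"
devops_user_name = "user@company_name.com"
EOF
```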
--------------------------------------------------------------------------------
/terraform/azuredevops/cluster.tf:
--------------------------------------------------------------------------------
data "databricks_node_type" "smallest" {
  local_disk = true
}

data "databricks_spark_version" "latest_lts" {
  long_term_support = true
}

resource "databricks_cluster" "nutter_demo" {
  cluster_name            = "Nutter demo (${data.databricks_current_user.me.alphanumeric})"
  spark_version           = data.databricks_spark_version.latest_lts.id
  node_type_id            = data.databricks_node_type.smallest.id
  autotermination_minutes = 20
  spark_conf = {
    # Single-node
    "spark.databricks.cluster.profile" : "singleNode"
    "spark.master" : "local[*]"
  }

  custom_tags = {
    "ResourceClass" = "SingleNode"
  }

  library {
    pypi {
      package = "nutter"
    }
  }
  library {
    pypi {
      package = "chispa"
    }
  }
}
--------------------------------------------------------------------------------
/terraform/azuredevops/devops.tf:
--------------------------------------------------------------------------------
resource "azuredevops_project" "project" {
  name            = var.devops_project_name
  description     = "Test Project for Nutter demo"
  visibility      = "private"
  version_control = "Git"
}

resource "azuredevops_git_repository" "repository" {
  project_id = azuredevops_project.project.id
  name       = "NutteronDevOps"
  initialization {
    init_type   = "Import"
    source_type = "Git"
    source_url  = "https://github.com/alexott/databricks-nutter-repos-demo"
  }
}

resource "azuredevops_build_definition" "build" {
  project_id = azuredevops_project.project.id
  name       = "Nutter Build Pipeline"

  ci_trigger {
    use_yaml = true
  }

  repository {
    repo_type   = "TfsGit"
    repo_id     = azuredevops_git_repository.repository.id
    branch_name = azuredevops_git_repository.repository.default_branch
    yml_path    = "azure-pipelines.yml"
  }

  variable_groups = [azuredevops_variable_group.vg.id]
}

resource "databricks_token" "pat_for_devops" {
  comment          = "Azure DevOps Nutter demo (10 days)"
  lifetime_seconds = 864000
}

resource "azuredevops_variable_group" "vg" {
  project_id   = azuredevops_project.project.id
  name         = "Nutter Testing"
  description  = "Variable group for build job"
  allow_access = true

  variable {
    name  = "databricks_host"
    value = data.databricks_current_user.me.workspace_url
  }

  variable {
    name         = "databricks_token"
    secret_value = databricks_token.pat_for_devops.token_value
    is_secret    = true
  }

  variable {
    name  = "cluster_id"
    value = databricks_cluster.nutter_demo.id
  }

  variable {
    name  = "staging_directory"
    value = local.staging_repo_path
  }

}
--------------------------------------------------------------------------------
/terraform/azuredevops/main.tf:
--------------------------------------------------------------------------------
terraform {
  required_providers {
    azuredevops = {
      source  = "microsoft/azuredevops"
      version = "0.2.1"
    }
    databricks = {
      source  = "databrickslabs/databricks"
      version = "0.5.8"
    }
  }
}

# https://registry.terraform.io/providers/microsoft/azuredevops/latest/docs

provider "azuredevops" {
  org_service_url       = var.devops_org_url
  personal_access_token = var.devops_pat
}

provider "databricks" {
}
--------------------------------------------------------------------------------
/terraform/azuredevops/repo.tf:
--------------------------------------------------------------------------------
data "databricks_current_user" "me" {
}

locals {
  staging_repo_path = "${data.databricks_current_user.me.repos}/nutter-tf-staging"
}

resource "databricks_git_credential" "global" {
  git_provider          = "azureDevOpsServices"
  git_username          = var.devops_user_name
  personal_access_token = var.devops_pat
  force                 = true
}

resource "databricks_repo" "nutter_in_user_home" {
  depends_on = [databricks_git_credential.global]
  url        = azuredevops_git_repository.repository.remote_url
  path       = "${data.databricks_current_user.me.repos}/nutter-tf-dev"
  branch     = "releases"
}

resource "databricks_repo" "nutter_in_staging" {
  depends_on = [databricks_git_credential.global]
  url        = azuredevops_git_repository.repository.remote_url
  path       = local.staging_repo_path
  branch     = "releases"
}

resource "databricks_repo" "nutter_in_prod" {
  depends_on = [databricks_git_credential.global]
  url        = azuredevops_git_repository.repository.remote_url
  path       = "${data.databricks_current_user.me.repos}/nutter-tf-prod"
  branch     = "releases"
}
--------------------------------------------------------------------------------
/terraform/azuredevops/variables.tf:
--------------------------------------------------------------------------------
variable "devops_org_url" {
  description = "DevOps URL"
  type        = string
}


variable "devops_pat" {
  description = "DevOps PAT"
  type        = string
}

variable "devops_user_name" {
  description = "DevOps User Name"
  type        = string
}

variable "devops_project_name" {
  description = "Project name in Azure DevOps"
  type        = string
  default     = "NutterDemoProject"
}
--------------------------------------------------------------------------------
/unit-tests/test_with_arbitrary_files.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %pip install -U nutter chispa

# COMMAND ----------

# MAGIC %load_ext autoreload
# MAGIC %autoreload 2

# COMMAND ----------

from my_package.code1 import *  # instead of %run ./Code1
from my_package.code2 import *  # instead of %run ./Code2

# COMMAND ----------

# https://github.com/microsoft/nutter
from runtime.nutterfixture import NutterFixture, tag
# https://github.com/MrPowers/chispa
from chispa.dataframe_comparer import *

class TestFixtureArbitraryFiles(NutterFixture):
    def __init__(self):
        self.code2_table_name = "my_data"
        self.code1_view_name = "my_cool_data"
        self.code1_num_entries = 100
        NutterFixture.__init__(self)

    def run_code1_arbitrary_files(self):
        generate_data1(spark, n = self.code1_num_entries, name = self.code1_view_name)

    def assertion_code1_arbitrary_files(self):
        df = spark.read.table(self.code1_view_name)
        assert(df.count() == self.code1_num_entries)

    def run_code2_arbitrary_files(self):
        generate_data2(table_name = self.code2_table_name)

    def assertion_code2_arbitrary_files(self):
        some_tbl = spark.sql(f'SELECT COUNT(*) AS total FROM {self.code2_table_name}')
        first_row = some_tbl.first()
        assert (first_row[0] == 10)

    def after_code2_arbitrary_files(self):
        spark.sql(f"drop table {self.code2_table_name}")

    # we're using the Chispa library here to compare the content of the processed dataframe with expected results
    def assertion_upper_columns_arbitrary_files(self):
        cols = ["col1", "col2", "col3"]
        df = spark.createDataFrame([("abc", "cef", 1)], cols)
        upper_df = upper_columns(df, cols)
        expected_df = spark.createDataFrame([("ABC", "CEF", 1)], cols)
        assert_df_equality(upper_df, expected_df)

    def assertion_lower_columns_arbitrary_files(self):
        cols = ["col1", "col2", "col3"]
        df = spark.createDataFrame([("Abc", "Cef", 1)], cols)
        lower_df = lower_columns(df, cols)
        expected_df = spark.createDataFrame([("abc", "cef", 1)], cols)
        assert_df_equality(lower_df, expected_df)

# COMMAND ----------

result = TestFixtureArbitraryFiles().execute_tests()
print(result.to_string())
is_job = dbutils.notebook.entry_point.getDbutils().notebook().getContext().currentRunId().isDefined()
if is_job:
    result.exit(dbutils)
--------------------------------------------------------------------------------
/unit-tests/test_with_percent_run.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %pip install -U nutter chispa

# COMMAND ----------

# MAGIC %run ../Code1

# COMMAND ----------

# MAGIC %run ../Code2

# COMMAND ----------

# https://github.com/microsoft/nutter
from runtime.nutterfixture import NutterFixture, tag
# https://github.com/MrPowers/chispa
from chispa.dataframe_comparer import *


class TestPercentRunFixture(NutterFixture):
    def __init__(self):
        self.code2_table_name = "my_data"
        self.code1_view_name = "my_cool_data"
        self.code1_num_entries = 100
        NutterFixture.__init__(self)

    def run_code1_percent_run(self):
        generate_data1(n = self.code1_num_entries, name = self.code1_view_name)

    def assertion_code1_percent_run(self):
        df = spark.read.table(self.code1_view_name)
        assert(df.count() == self.code1_num_entries)

    def run_code2_percent_run(self):
        generate_data2(table_name = self.code2_table_name)

    def assertion_code2_percent_run(self):
        some_tbl = spark.sql(f'SELECT COUNT(*) AS total FROM {self.code2_table_name}')
        first_row = some_tbl.first()
        assert (first_row[0] == 10)

    def after_code2_percent_run(self):
        spark.sql(f"drop table {self.code2_table_name}")

    # we're using the Chispa library here to compare the content of the processed dataframe with expected results
    def assertion_upper_columns_percent_run(self):
        cols = ["col1", "col2", "col3"]
        df = spark.createDataFrame([("abc", "cef", 1)], cols)
        upper_df = upper_columns(df, cols)
        expected_df = spark.createDataFrame([("ABC", "CEF", 1)], cols)
        assert_df_equality(upper_df, expected_df)

# COMMAND ----------

result = TestPercentRunFixture().execute_tests()
print(result.to_string())
is_job = dbutils.notebook.entry_point.getDbutils().notebook().getContext().currentRunId().isDefined()
if is_job:
    result.exit(dbutils)

# COMMAND ----------

# just add the code
--------------------------------------------------------------------------------