├── README.md ├── setup.py ├── notpackage └── notfunctions.py ├── azure-pipelines.yml └── cicd-scripts └── installWhlLibrary.py /README.md: -------------------------------------------------------------------------------- 1 | # databricks-cicd-definitelynotademo -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='notademo', 4 | version='0.0.21', 5 | description='A sample PySpark application - 0.0.21', 6 | author='Silviu Tofan', 7 | author_email='silviu@databricks.com', 8 | url='www.databricks.com', 9 | packages=['notpackage'], 10 | zip_safe=False) -------------------------------------------------------------------------------- /notpackage/notfunctions.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | spark = SparkSession\ 3 | .builder\ 4 | .getOrCreate() 5 | 6 | def spark_f(n_rows): 7 | # The Spark code will execute on the Azure Databricks cluster... 8 | n = spark.range(n_rows).count() 9 | return n 10 | 11 | def python_f(n_rows): 12 | n = len(range(n_rows)) 13 | return n 14 | 15 | print("Hello, I'm testing my new pipeline!") 16 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Python package 2 | # Create and test a Python package on multiple Python versions. 3 | # Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: 4 | # https://docs.microsoft.com/azure/devops/pipelines/languages/python 5 | 6 | parameters: 7 | - name: dbrVersion 8 | displayName: 'DBR Version to use?' 
9 | type: string 10 | default: '6.6.*' 11 | 12 | # New code merged into the release branch initiates a build 13 | trigger: 14 | - release 15 | 16 | #Specify VM 17 | pool: 18 | vmImage: 'ubuntu-latest' 19 | 20 | # Install Python. The version must match the version on the Databricks cluster. 21 | steps: 22 | - task: UsePythonVersion@0 23 | displayName: 'Use Python 3.7' 24 | inputs: 25 | versionSpec: 3.7 26 | 27 | # Install required Python modules, including databricks-connect, required to execute a unit test 28 | # on a cluster. 29 | - script: | 30 | pip install pytest requests setuptools wheel 31 | pip install -U databricks-connect==${{ parameters.dbrVersion }} # we may not need databricks-connect 32 | pip install -U databricks-cli 33 | displayName: 'Load Python Dependencies' 34 | 35 | # Use environment variables to pass Databricks login information to the Databricks Connect 36 | # configuration function 37 | - script: | 38 | echo "y 39 | $(databricks_host) 40 | $(databricks_token) 41 | $(cluster_id) 42 | $(org_id) 43 | 15001" | databricks-connect configure 44 | displayName: 'Configure DBConnect' 45 | 46 | #Download code from designated branch to the agent 47 | - checkout: self 48 | persistCredentials: true 49 | clean: true 50 | 51 | - script: git checkout release 52 | displayName: 'Get Latest Branch' 53 | 54 | #Test goes here 55 | 56 | #Package code into a Py Wheel 57 | - script: | 58 | cd $(Build.Repository.LocalPath)/ 59 | python3 setup.py sdist bdist_wheel 60 | ls 61 | ls dist/ 62 | displayName: 'Build Python Wheel for Libs' 63 | 64 | # Use git diff to flag files added in the most recent git merge 65 | - script: | 66 | git diff --name-only --diff-filter=AMR HEAD^1 HEAD | xargs -I '{}' cp --parents -r '{}' $(Build.BinariesDirectory) 67 | mkdir -p $(Build.BinariesDirectory)/libraries/python/libs 68 | cp $(Build.Repository.LocalPath)/dist/*.* $(Build.BinariesDirectory)/libraries/python/libs 69 | mkdir -p $(Build.BinariesDirectory)/cicd-scripts 70 | cp 
$(Build.Repository.LocalPath)/cicd-scripts/*.* $(Build.BinariesDirectory)/cicd-scripts 71 | displayName: 'Get Changes' 72 | # Add the wheel file you just created 73 | # The objective is to add all files intended for the current release. 74 | 75 | # Create the deployment artifact and publish it to the artifact repository 76 | - task: ArchiveFiles@2 77 | inputs: 78 | rootFolderOrFile: '$(Build.BinariesDirectory)' 79 | includeRootFolder: false 80 | archiveType: 'zip' 81 | archiveFile: '$(Build.ArtifactStagingDirectory)/$(Build.BuildId).zip' 82 | replaceExistingArchive: true 83 | 84 | - script: | 85 | databricks fs mkdirs 'dbfs:/FileStore/artifacts/' 86 | databricks fs cp '$(Build.ArtifactStagingDirectory)/$(Build.BuildId).zip' 'dbfs:/FileStore/artifacts/' 87 | displayName: "Copy artifact to Databricks" 88 | 89 | - task: PublishBuildArtifacts@1 90 | inputs: 91 | ArtifactName: 'DatabricksBuild' 92 | -------------------------------------------------------------------------------- /cicd-scripts/installWhlLibrary.py: -------------------------------------------------------------------------------- 1 | # installWhlLibrary.py 2 | #!/usr/bin/python3 3 | import json 4 | import requests 5 | import sys 6 | import getopt 7 | import time 8 | import os 9 | 10 | def main(): 11 | shard = '' 12 | token = '' 13 | clusterid = '' 14 | libspath = '' 15 | dbfspath = '' 16 | 17 | try: 18 | opts, args = getopt.getopt(sys.argv[1:], 'hstcld', 19 | ['shard=', 'token=', 'clusterid=', 'libs=', 'dbfspath=']) 20 | except getopt.GetoptError: 21 | print( 22 | 'installWhlLibrary.py -s -t -c -l -d ') 23 | sys.exit(2) 24 | 25 | for opt, arg in opts: 26 | if opt == '-h': 27 | print( 28 | 'installWhlLibrary.py -s -t -c -l -d ') 29 | sys.exit() 30 | elif opt in ('-s', '--shard'): 31 | shard = arg 32 | elif opt in ('-t', '--token'): 33 | token = arg 34 | elif opt in ('-c', '--clusterid'): 35 | clusterid = arg 36 | elif opt in ('-l', '--libs'): 37 | libspath=arg 38 | elif opt in ('-d', '--dbfspath'): 39 | 
dbfspath=arg 40 | 41 | print('-s is ' + shard) 42 | print('-t is ' + token) 43 | print('-c is ' + clusterid) 44 | print('-l is ' + libspath) 45 | print('-d is ' + dbfspath) 46 | 47 | # Uninstall library if exists on cluster 48 | i=0 49 | 50 | # Generate array from walking local path 51 | libslist = [] 52 | for path, subdirs, files in os.walk(libspath): 53 | for name in files: 54 | 55 | name, file_extension = os.path.splitext(name) 56 | if file_extension.lower() in ['.whl']: 57 | libslist.append(name + file_extension.lower()) 58 | 59 | for lib in libslist: 60 | dbfslib = dbfspath + '/' + lib 61 | print(dbfslib + ' before:' + getLibStatus(shard, token, clusterid, dbfslib)) 62 | 63 | if (getLibStatus(shard, token, clusterid, dbfslib) != 'not found'): 64 | print(dbfslib + " exists. Uninstalling.") 65 | i = i + 1 66 | values = {'cluster_id': clusterid, 'libraries': [{'whl': dbfslib}]} 67 | 68 | resp = requests.post(shard + '/api/2.0/libraries/uninstall', data=json.dumps(values), auth=("token", token)) 69 | runjson = resp.text 70 | d = json.loads(runjson) 71 | print(dbfslib + ' after:' + getLibStatus(shard, token, clusterid, dbfslib)) 72 | 73 | # Restart if libraries uninstalled 74 | if i > 0: 75 | values = {'cluster_id': clusterid} 76 | print("Restarting cluster:" + clusterid) 77 | resp = requests.post(shard + '/api/2.0/clusters/restart', data=json.dumps(values), auth=("token", token)) 78 | restartjson = resp.text 79 | print(restartjson) 80 | 81 | p = 0 82 | waiting = True 83 | while waiting: 84 | time.sleep(30) 85 | clusterresp = requests.get(shard + '/api/2.0/clusters/get?cluster_id=' + clusterid, 86 | auth=("token", token)) 87 | clusterjson = clusterresp.text 88 | jsonout = json.loads(clusterjson) 89 | current_state = jsonout['state'] 90 | print(clusterid + " state:" + current_state) 91 | if current_state in ['TERMINATED', 'RUNNING','INTERNAL_ERROR', 'SKIPPED'] or p >= 10: 92 | break 93 | p = p + 1 94 | 95 | print("Installing " + dbfslib) 96 | values = {'cluster_id': 
clusterid, 'libraries': [{'whl': 'dbfs:' + dbfslib}]} 97 | 98 | resp = requests.post(shard + '/api/2.0/libraries/install', data=json.dumps(values), auth=("token", token)) 99 | runjson = resp.text 100 | d = json.loads(runjson) 101 | print(dbfslib + ' after:' + getLibStatus(shard, token, clusterid, dbfslib)) 102 | 103 | def getLibStatus(shard, token, clusterid, dbfslib): 104 | 105 | resp = requests.get(shard + '/api/2.0/libraries/cluster-status?cluster_id='+ clusterid, auth=("token", token)) 106 | libjson = resp.text 107 | d = json.loads(libjson) 108 | if (d.get('library_statuses')): 109 | statuses = d['library_statuses'] 110 | 111 | for status in statuses: 112 | if (status['library'].get('whl')): 113 | if (status['library']['whl'] == 'dbfs:' + dbfslib): 114 | return status['status'] 115 | else: 116 | return "not found" 117 | else: 118 | # No libraries found 119 | return "not found" 120 | 121 | if __name__ == '__main__': 122 | main() --------------------------------------------------------------------------------