├── .gitignore ├── __init__.py ├── data ├── default_jobs_cluster_aws.json ├── default_jobs_cluster_azure.json ├── azure_cluster.json ├── aws_cluster.json ├── repair_tables_for_migration.py └── workspace_migration_analysis.py ├── dbclient ├── __init__.py ├── LibraryClient.py ├── DbfsClient.py ├── SecretsClient.py ├── JobsClient.py ├── dbclient.py ├── parser.py ├── ScimClient.py ├── ClustersClient.py ├── WorkspaceClient.py └── HiveClient.py ├── setup.py ├── test_connection.py ├── METASTORE.md ├── LICENSE ├── import_db.py ├── export_db.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | venv 3 | logs 4 | user_logs 5 | azure_logs 6 | __pycache__ 7 | dbclient/*.pyc 8 | build/ 9 | databricks_migration.egg-info/ 10 | dist/ 11 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # outer __init__.py 2 | from dbclient import * 3 | from timeit import default_timer as timer 4 | from datetime import timedelta 5 | from os import makedirs, path 6 | from datetime import datetime 7 | -------------------------------------------------------------------------------- /data/default_jobs_cluster_aws.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 8, 3 | "spark_version": "7.3.x-scala2.12", 4 | "node_type_id": "i3.xlarge", 5 | "spark_env_vars": { 6 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /data/default_jobs_cluster_azure.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 8, 3 | "spark_version": "7.3.x-scala2.12", 4 | "node_type_id": "Standard_DS3_v2", 5 | "spark_env_vars": { 6 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /dbclient/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | from .dbclient import dbclient 3 | from .ClustersClient import ClustersClient 4 | from .JobsClient import JobsClient 5 | from .DbfsClient import DbfsClient 6 | from .ScimClient import ScimClient 7 | from .LibraryClient import LibraryClient 8 | from .WorkspaceClient import WorkspaceClient 9 | from .HiveClient import HiveClient 10 | from .SecretsClient import SecretsClient 11 | from .parser import * 12 | -------------------------------------------------------------------------------- /data/azure_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 1, 3 | "cluster_name": "API_Metastore_Work_Leave_Me_Alone", 4 | "spark_version": "7.3.x-scala2.12", 5 | "spark_conf": {}, 6 | "node_type_id": "Standard_D8_v3", 7 | "ssh_public_keys": [], 8 | "custom_tags": {}, 9 | "spark_env_vars": { 10 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 11 | }, 12 | "autotermination_minutes": 20, 13 | "init_scripts": [] 14 | } 15 | -------------------------------------------------------------------------------- /data/aws_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 1, 3 | "cluster_name": "API_Metastore_Work_Leave_Me_Alone", 4 | "spark_version": "7.3.x-scala2.12", 5 | "aws_attributes": { 6 | "first_on_demand": 1, 7 | 
"availability": "SPOT_WITH_FALLBACK", 8 | "zone_id": "us-west-2b", 9 | "spot_bid_price_percent": 100, 10 | "ebs_volume_count": 0 11 | }, 12 | "driver_node_type_id": "i3.xlarge", 13 | "node_type_id": "i3.xlarge", 14 | "spark_env_vars": { 15 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 16 | }, 17 | "autotermination_minutes": 20 18 | } 19 | -------------------------------------------------------------------------------- /data/repair_tables_for_migration.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | migration_log = '/dbfs/tmp/migration/repair_ddl.log' 3 | 4 | num_of_tables = 0 5 | with open(migration_log, 'r') as fp: 6 | for line in fp: 7 | # this is the db_name.tbl_name value 8 | fqdn_table = line.rstrip() 9 | fix_sql_statement = f"MSCK REPAIR TABLE {fqdn_table}" 10 | print(fix_sql_statement) 11 | df = spark.sql(fix_sql_statement) 12 | num_of_tables += 1 13 | 14 | # COMMAND ---------- 15 | 16 | print(f"Total number of tables repaired {num_of_tables}") 17 | 18 | # COMMAND ---------- 19 | 20 | dbutils.fs.rm('/tmp/migration/repair_ddl.log') 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="databricks-migration", # Replace with your own username 8 | version="0.0.2", 9 | author="Miklos C", 10 | author_email="mwc@databricks.com", 11 | description="Databricks Migration scripts", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/mrchristine/db-migration", 15 | license="http://www.apache.org/licenses/LICENSE-2.0", 16 | packages=setuptools.find_packages(), 17 | install_requires=[ 18 | 'cron-descriptor', 19 | 'requests' 20 | ], 21 | py_modules=["export_db","import_db","test_connection"], 22 | classifiers=[ 23 | "Programming Language :: Python :: 3", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | ], 27 | python_requires='>=3.6', 28 | ) 29 | -------------------------------------------------------------------------------- /test_connection.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | import sys, requests 3 | 4 | # python 3.6 5 | 6 | def main(): 7 | # define a parser to identify what component to import / export 8 | parser = get_export_parser() 9 | # parse the args 10 | args = parser.parse_args() 11 | p = args.profile 12 | 13 | # parse the path location of the Databricks CLI configuration 14 | login_args = get_login_credentials(profile=p) 15 | 16 | # parse the credentials 17 | url = login_args['host'] 18 | token = login_args['token'] 19 | client_config = build_client_config(url, token, args) 20 | 21 | print("Test connection at {0} with profile {1}\n".format(url, args.profile)) 22 | db_client = dbclient(client_config) 23 | try: 24 | is_successful = db_client.test_connection() 25 | except requests.exceptions.RequestException as e: 26 | print(e) 27 | print("\nUnsuccessful connection. Verify credentials.\n") 28 | sys.exit(1) 29 | if is_successful == 0: 30 | print("Connection successful!") 31 | else: 32 | print("\nUnsuccessful connection. 
Verify credentials.\n") 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /dbclient/LibraryClient.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dbclient import * 3 | 4 | 5 | class LibraryClient(dbclient): 6 | 7 | def get_cluster_list(self, alive=True): 8 | """ Returns an array of json objects for the running clusters. Grab the cluster_name or cluster_id """ 9 | cl = self.get("/clusters/list", print_json=False) 10 | if alive: 11 | running = filter(lambda x: x['state'] == "RUNNING", cl['clusters']) 12 | return list(running) 13 | else: 14 | return cl['clusters'] 15 | 16 | def log_library_details(self, log_file='lib_details.log'): 17 | libs_log = self.get_export_dir() + log_file 18 | all_libs = self.get('/libraries/list', version='1.2') 19 | with open(libs_log, "w") as fp: 20 | for x in all_libs.get('elements', None): 21 | lib_details = self.get('/libraries/status?libraryId={0}'.format(x['id']), version='1.2') 22 | fp.write(json.dumps(lib_details) + '\n') 23 | 24 | def log_cluster_libs(self, cl_log_file='attached_cluster_libs.log'): 25 | cl_lib_log = self.get_export_dir() + cl_log_file 26 | cl = self.get_cluster_list(False) 27 | with open(cl_lib_log, "w") as fp: 28 | for x in cl: 29 | cid = x['cluster_id'] 30 | libs = self.get("/libraries/cluster-status?cluster_id={0}".format(cid)) 31 | fp.write(json.dumps(libs)) 32 | fp.write("\n") 33 | -------------------------------------------------------------------------------- /METASTORE.md: -------------------------------------------------------------------------------- 1 | # Databricks Metastore Migration 2 | 3 | This document discusses the metastore migration options and process. 4 | 5 | 1. Export the metastore DDL 6 | 2. Import the metastore DDL 7 | a. The tool will import `TABLES` first 8 | b. The tool will sideline `VIEWS` to be applied after all tables are created. Views will be sidelined into 9 | `metastore_views/` directory in the export directory. 10 | c. The tool will import all `VIEWS` 11 | 3. Copy the underlying DBFS / root table data. Databricks support team will need to help with this step. 12 | 4. Report on legacy table DDLs to be repaired within the new workspace and metastore. 13 | a. Use the `--get-repair-log` option with the import tool. This will generate a list of tables that need to be 14 | repaired. The most common case of this is to register hive partitions within the metastore. 15 | b. The repair option will upload a list of tables to be repaired, and users can use the notebook included in this 16 | repo, `data/repair_tables_for_migration.py`, to run this operation. 17 | 18 | 19 | **Recommendation / Caveats:** 20 | 1. Use the `--metastore-unicode` option to export and import if you do not know if tables contain unicode characters. 21 | This should be applied to both export and import operations. 22 | 2. Use DBR 6.x / Spark 2.x releases if you have legacy table definitions. 23 | Spark 3.x deprecates `SERDE` support and can cause import issues if you require those tables to use `SERDE` 24 | definitions. 25 | 3. If you manually register table partitions using `ALTER TABLE table_name ADD PARTITION ()` to tables, you will need 26 | to manually report and add these partitions. The tool does not support this today. 
27 | Or if you need to drop partitions, you can use `ALTER TABLE table_name DROP PARTITION ()` 28 | -------------------------------------------------------------------------------- /dbclient/DbfsClient.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | import os 4 | import time 5 | from datetime import timedelta 6 | from timeit import default_timer as timer 7 | 8 | from dbclient import * 9 | 10 | 11 | class DbfsClient(ClustersClient): 12 | 13 | @staticmethod 14 | def get_num_of_lines(fname): 15 | if not os.path.exists(fname): 16 | return 0 17 | else: 18 | i = 0 19 | with open(fname) as fp: 20 | for line in fp: 21 | i += 1 22 | return i 23 | 24 | def export_dbfs_mounts(self): 25 | # check if instance profile exists, ask users to use --users first or enter yes to proceed. 26 | start = timer() 27 | cid = self.launch_cluster() 28 | end = timer() 29 | print("Cluster creation time: " + str(timedelta(seconds=end - start))) 30 | time.sleep(5) 31 | ec_id = self.get_execution_context(cid) 32 | 33 | # get all dbfs mount metadata 34 | dbfs_mount_logfile = self.get_export_dir() + 'dbfs_mounts.log' 35 | all_mounts_cmd = 'all_mounts = [{"path": x.mountPoint, "source": x.source, ' \ 36 | '"encryptionType": x.encryptionType} for x in dbutils.fs.mounts()]' 37 | results = self.submit_command(cid, ec_id, all_mounts_cmd) 38 | results = self.submit_command(cid, ec_id, 'print(len(all_mounts))') 39 | # grab the number of mounts to bucket / batch the export 40 | num_of_mounts = ast.literal_eval(results['data']) 41 | 42 | batch_size = 100 # batch size to iterate over databases 43 | num_of_buckets = (num_of_mounts // batch_size) + 1 # number of slices of the list to take 44 | 45 | with open(dbfs_mount_logfile, 'w') as fp_log: 46 | for m in range(0, num_of_buckets): 47 | mounts_slice = 'print(all_mounts[{0}:{1}])'.format(batch_size*m, batch_size*(m+1)) 48 | results = self.submit_command(cid, ec_id, mounts_slice) 49 | mounts_slice_data = ast.literal_eval(results['data']) 50 | for mount_path in mounts_slice_data: 51 | print("Mounts: {0}".format(mount_path)) 52 | fp_log.write(json.dumps(mount_path)) 53 | fp_log.write('\n') 54 | return True 55 | -------------------------------------------------------------------------------- /dbclient/SecretsClient.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | import os 3 | import time 4 | from timeit import default_timer as timer 5 | 6 | 7 | class SecretsClient(ClustersClient): 8 | 9 | def get_secret_scopes_list(self): 10 | scopes_list = self.get('/secrets/scopes/list').get('scopes', []) 11 | return scopes_list 12 | 13 | def get_secrets(self, scope_name): 14 | secrets_list = self.get('/secrets/list', {'scope': scope_name}).get('secrets', []) 15 | return secrets_list 16 | 17 | def get_secret_value(self, scope_name, secret_key, cid, ec_id): 18 | cmd_set_value = f"value = dbutils.secrets.get(scope = '{scope_name}', key = '{secret_key}')" 19 | cmd_convert_b64 = "import base64; b64_value = base64.b64encode(value.encode('ascii'))" 20 | cmd_get_b64 = "print(b64_value.decode('ascii'))" 21 | results_set = self.submit_command(cid, ec_id, cmd_set_value) 22 | results_convert = self.submit_command(cid, ec_id, cmd_convert_b64) 23 | results_get = self.submit_command(cid, ec_id, cmd_get_b64) 24 | if results_set['resultType'] == 'error' \ 25 | or results_convert['resultType'] == 'error'\ 26 | or results_get['resultType'] == 'error': 27 | print("Error:") 28 | 
print(results_set) 29 | print(results_convert) 30 | print(results_get) 31 | s_value = results_get.get('data') 32 | return s_value 33 | 34 | def log_all_secrets(self, cluster_name, log_dir='secret_scopes/'): 35 | scopes_dir = self.get_export_dir() + log_dir 36 | scopes_list = self.get_secret_scopes_list() 37 | os.makedirs(scopes_dir, exist_ok=True) 38 | start = timer() 39 | cid = self.start_cluster_by_name(cluster_name) 40 | time.sleep(5) 41 | ec_id = self.get_execution_context(cid) 42 | for scope_json in scopes_list: 43 | scope_name = scope_json.get('name') 44 | secrets_list = self.get_secrets(scope_name) 45 | scopes_logfile = scopes_dir + scope_name 46 | with open(scopes_logfile, 'w') as fp: 47 | for secret_json in secrets_list: 48 | secret_name = secret_json.get('key') 49 | b64_value = self.get_secret_value(scope_name, secret_name, cid, ec_id) 50 | s_json = {'name': secret_name, 'value': b64_value} 51 | fp.write(json.dumps(s_json) + '\n') 52 | 53 | def log_all_secrets_acls(self, log_name='secret_scopes_acls.log'): 54 | acls_file = self.get_export_dir() + log_name 55 | scopes_list = self.get_secret_scopes_list() 56 | with open(acls_file, 'w') as fp: 57 | for scope_json in scopes_list: 58 | scope_name = scope_json.get('name', None) 59 | resp = self.get('/secrets/acls/list', {'scope': scope_name}) 60 | resp['scope_name'] = scope_name 61 | fp.write(json.dumps(resp) + '\n') -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Workspace Migration Tool 2 | 3 | Copyright (2019) Databricks, Inc. 4 | 5 | This library (the "Software") may not be used except in connection with the Licensee's use of the Databricks Platform Services pursuant 6 | to an Agreement (defined below) between Licensee (defined below) and Databricks, Inc. ("Databricks"). The Object Code version of the 7 | Software shall be deemed part of the Downloadable Services under the Agreement, or if the Agreement does not define Downloadable Services, 8 | Subscription Services, or if neither are defined then the term in such Agreement that refers to the applicable Databricks Platform 9 | Services (as defined below) shall be substituted herein for “Downloadable Services.” Licensee's use of the Software must comply at 10 | all times with any restrictions applicable to the Downlodable Services and Subscription Services, generally, and must be used in 11 | accordance with any applicable documentation. For the avoidance of doubt, the Software constitutes Databricks Confidential Information 12 | under the Agreement. 13 | 14 | Additionally, and notwithstanding anything in the Agreement to the contrary: 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 16 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 17 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 18 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | * you may view, make limited copies of, and may compile the Source Code version of the Software into an Object Code version of the 20 | Software. 
For the avoidance of doubt, you may not make derivative works of Software (or make any any changes to the Source Code 21 | version of the unless you have agreed to separate terms with Databricks permitting such modifications (e.g., a contribution license 22 | agreement)). 23 | 24 | If you have not agreed to an Agreement or otherwise do not agree to these terms, you may not use the Software or view, copy or compile 25 | the Source Code of the Software. 26 | 27 | This license terminates automatically upon the termination of the Agreement or Licensee's breach of these terms. Additionally, 28 | Databricks may terminate this license at any time on notice. Upon termination, you must permanently delete the Software and all 29 | copies thereof (including the Source Code). 30 | 31 | Agreement: the agreement between Databricks and Licensee governing the use of the Databricks Platform Services, which shall be, with 32 | respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with respect to Databricks 33 | Community Edition, the Community Edition Terms of Service located at www.databricks.com/ce-termsofuse, in each case unless Licensee 34 | has entered into a separate written agreement with Databricks governing the use of the applicable Databricks Platform Services. 35 | 36 | Databricks Platform Services: the Databricks services or the Databricks Community Edition services, according to where the Software is used. 37 | 38 | Licensee: the user of the Software, or, if the Software is being used on behalf of a company, the company. 39 | 40 | Object Code: is version of the Software produced when an interpreter or a compiler translates the Source Code into recognizable and 41 | executable machine code. 42 | 43 | Source Code: the human readable portion of the Software. 44 | 45 | -------------------------------------------------------------------------------- /import_db.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | from timeit import default_timer as timer 3 | from datetime import timedelta, datetime 4 | from os import makedirs 5 | 6 | 7 | # python 3.6 8 | def main(): 9 | # define a parser to identify what component to import / export 10 | my_parser = get_import_parser() 11 | # parse the args 12 | args = my_parser.parse_args() 13 | 14 | # parse the path location of the Databricks CLI configuration 15 | login_args = get_login_credentials(profile=args.profile) 16 | if is_azure_creds(login_args) and (not args.azure): 17 | raise ValueError('Login credentials do not match args. 
Please provide --azure flag for azure environments.') 18 | 19 | # cant use netrc credentials because requests module tries to load the credentials into http basic auth headers 20 | url = login_args['host'] 21 | token = login_args['token'] 22 | client_config = build_client_config(url, token, args) 23 | 24 | makedirs(client_config['export_dir'], exist_ok=True) 25 | 26 | if client_config['debug']: 27 | print(url, token) 28 | now = str(datetime.now()) 29 | 30 | if args.users: 31 | print("Import all users and groups at {0}".format(now)) 32 | scim_c = ScimClient(client_config) 33 | if client_config['is_aws']: 34 | print("Start import of instance profiles first to ensure they exist...") 35 | cl_c = ClustersClient(client_config) 36 | start = timer() 37 | cl_c.import_instance_profiles() 38 | end = timer() 39 | print("Complete Instance Profile Import Time: " + str(timedelta(seconds=end - start))) 40 | start = timer() 41 | scim_c.import_all_users_and_groups() 42 | end = timer() 43 | print("Complete Users and Groups Import Time: " + str(timedelta(seconds=end - start))) 44 | 45 | if args.workspace: 46 | print("Import the complete workspace at {0}".format(now)) 47 | print("Import on {0}".format(url)) 48 | ws_c = WorkspaceClient(client_config) 49 | start = timer() 50 | # log notebooks and libraries 51 | ws_c.import_all_workspace_items(archive_missing=args.archive_missing, 52 | restart_from_last=args.restart_from_checkpoint) 53 | end = timer() 54 | print("Complete Workspace Import Time: " + str(timedelta(seconds=end - start))) 55 | 56 | if args.workspace_top_level: 57 | print("Import the top level workspace items at {0}".format(now)) 58 | print("Import on {0}".format(url)) 59 | ws_c = WorkspaceClient(client_config) 60 | start = timer() 61 | # log notebooks and libraries 62 | ws_c.import_current_workspace_items() 63 | end = timer() 64 | print("Complete Workspace Import Time: " + str(timedelta(seconds=end - start))) 65 | 66 | if args.workspace_acls: 67 | print("Import workspace ACLs at {0}".format(now)) 68 | print("Import on {0}".format(url)) 69 | ws_c = WorkspaceClient(client_config) 70 | start = timer() 71 | # log notebooks and libraries 72 | ws_c.import_workspace_acls() 73 | end = timer() 74 | print("Complete Workspace acl Import Time: " + str(timedelta(seconds=end - start))) 75 | 76 | if args.clusters: 77 | print("Import all cluster configs at {0}".format(now)) 78 | cl_c = ClustersClient(client_config) 79 | if client_config['is_aws']: 80 | print("Start import of instance profiles ...") 81 | start = timer() 82 | cl_c.import_instance_profiles() 83 | end = timer() 84 | print("Complete Instance Profile Import Time: " + str(timedelta(seconds=end - start))) 85 | print("Start import of cluster policies ...") 86 | start = timer() 87 | cl_c.import_cluster_policies() 88 | end = timer() 89 | print("Complete Cluster Policies Creation Time: " + str(timedelta(seconds=end - start))) 90 | print("Start import of instance pool configurations ...") 91 | start = timer() 92 | cl_c.import_instance_pools() 93 | end = timer() 94 | print("Complete Instance Pools Creation Time: " + str(timedelta(seconds=end - start))) 95 | print("Start import of cluster configurations ...") 96 | start = timer() 97 | cl_c.import_cluster_configs() 98 | end = timer() 99 | print("Complete Cluster Import Time: " + str(timedelta(seconds=end - start))) 100 | 101 | if args.jobs: 102 | print("Importing the jobs configs at {0}".format(now)) 103 | start = timer() 104 | jobs_c = JobsClient(client_config) 105 | jobs_c.import_job_configs() 106 | end = timer() 
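        # import_job_configs() recreates each job under a temporary name of the form
        # "<old_name>:::<old_job_id>" so duplicate job names can be disambiguated; JobsClient then
        # strips the ":::<id>" suffix via update_imported_job_names(). A minimal post-import sanity
        # check could be (sketch only, reusing the jobs_c client defined above, not part of the tool):
        #     for job in jobs_c.get_jobs_list():
        #         assert ':::' not in job['settings']['name']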
107 | print("Complete Jobs Export Time: " + str(timedelta(seconds=end - start))) 108 | 109 | if args.metastore or args.metastore_unicode: 110 | print("Importing the metastore configs at {0}".format(now)) 111 | start = timer() 112 | hive_c = HiveClient(client_config) 113 | # log job configs 114 | hive_c.import_hive_metastore(cluster_name=args.cluster_name, has_unicode=args.metastore_unicode) 115 | end = timer() 116 | print("Complete Metastore Import Time: " + str(timedelta(seconds=end - start))) 117 | 118 | if args.pause_all_jobs: 119 | print("Pause all current jobs {0}".format(now)) 120 | start = timer() 121 | jobs_c = JobsClient(client_config) 122 | # log job configs 123 | jobs_c.pause_all_jobs() 124 | end = timer() 125 | print("Paused all jobs time: " + str(timedelta(seconds=end - start))) 126 | 127 | if args.unpause_all_jobs: 128 | print("Unpause all current jobs {0}".format(now)) 129 | start = timer() 130 | jobs_c = JobsClient(client_config) 131 | # log job configs 132 | jobs_c.pause_all_jobs(False) 133 | end = timer() 134 | print("Unpaused all jobs time: " + str(timedelta(seconds=end - start))) 135 | 136 | if args.delete_all_jobs: 137 | print("Delete all current jobs {0}".format(now)) 138 | start = timer() 139 | jobs_c = JobsClient(client_config) 140 | url = jobs_c.get_url() 141 | response = prompt_for_input(f'\nPlease confirm that you would like to delete jobs from {url} [yes/no]:') 142 | if response: 143 | print("Deleting all job configs ... ") 144 | jobs_c.delete_all_jobs() 145 | end = timer() 146 | print("Delete all jobs time: " + str(timedelta(seconds=end - start))) 147 | 148 | if args.single_user: 149 | user_email = args.single_user 150 | print(f"Import user {user_email} at {now}") 151 | scim_c = ScimClient(client_config) 152 | start = timer() 153 | # log all users 154 | scim_c.import_single_user(user_email) 155 | end = timer() 156 | print("Complete single user import: " + str(timedelta(seconds=end - start))) 157 | 158 | if args.import_home: 159 | username = args.import_home 160 | print("Importing home directory: {0}".format(username)) 161 | ws_c = WorkspaceClient(client_config) 162 | start = timer() 163 | # log notebooks and libraries 164 | ws_c.import_user_home(username, 'user_exports') 165 | end = timer() 166 | print("Complete Single User Import Time: " + str(timedelta(seconds=end - start))) 167 | 168 | if args.import_groups: 169 | print("Importing Groups from logs") 170 | start = timer() 171 | scim_c = ScimClient(client_config) 172 | scim_c.import_all_users_and_groups() 173 | user_names = scim_c.get_users_from_log() 174 | print('Export users notebooks:', user_names) 175 | ws_c = WorkspaceClient(client_config) 176 | for username in user_names: 177 | ws_c.import_user_home(username, 'user_exports') 178 | jobs_c = JobsClient(client_config) 179 | # this will only import the groups jobs since we're filtering the jobs during the export process 180 | print('Importing the groups members jobs:') 181 | jobs_c.import_job_configs() 182 | end = timer() 183 | print("Complete User Export Time: " + str(timedelta(seconds=end - start))) 184 | 185 | if args.libs: 186 | start = timer() 187 | print("Not supported today") 188 | end = timer() 189 | # print("Complete Library Import Time: " + str(timedelta(seconds=end - start))) 190 | 191 | if args.get_repair_log: 192 | print("Finding partitioned tables to repair at {0}".format(now)) 193 | start = timer() 194 | hive_c = HiveClient(client_config) 195 | # log job configs 196 | hive_c.report_legacy_tables_to_fix() 197 | end = timer() 198 | 
print("Complete Report Time: " + str(timedelta(seconds=end - start))) 199 | 200 | 201 | if __name__ == '__main__': 202 | main() 203 | -------------------------------------------------------------------------------- /dbclient/JobsClient.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dbclient import * 4 | 5 | 6 | class JobsClient(ClustersClient): 7 | 8 | def get_jobs_default_cluster_conf(self): 9 | if self.is_aws(): 10 | cluster_json_file = 'data/default_jobs_cluster_aws.json' 11 | else: 12 | cluster_json_file = 'data/default_jobs_cluster_azure.json' 13 | with open(cluster_json_file, 'r') as fp: 14 | cluster_json = json.loads(fp.read()) 15 | return cluster_json 16 | 17 | def get_jobs_list(self, print_json=False): 18 | """ Returns an array of json objects for jobs """ 19 | jobs = self.get("/jobs/list", print_json) 20 | return jobs.get('jobs', []) 21 | 22 | def get_job_id_by_name(self): 23 | """ 24 | get a dict mapping of job name to job id for the new job ids 25 | :return: 26 | """ 27 | jobs = self.get_jobs_list() 28 | job_ids = {} 29 | for job in jobs: 30 | job_ids[job['settings']['name']] = job['job_id'] 31 | return job_ids 32 | 33 | def update_imported_job_names(self): 34 | # loop through and update the job names to remove the custom delimiter + job_id suffix 35 | current_jobs_list = self.get_jobs_list() 36 | for job in current_jobs_list: 37 | job_id = job['job_id'] 38 | job_name = job['settings']['name'] 39 | # job name was set to `old_job_name:::{job_id}` to support duplicate job names 40 | # we need to parse the old job name and update the current jobs 41 | old_job_name = job_name.split(':::')[0] 42 | new_settings = {'name': old_job_name} 43 | update_args = {'job_id': job_id, 'new_settings': new_settings} 44 | print('Updating job name:', update_args) 45 | resp = self.post('/jobs/update', update_args) 46 | print(resp) 47 | 48 | def log_job_configs(self, users_list=[], log_file='jobs.log', acl_file='acl_jobs.log'): 49 | """ 50 | log all job configs and the ACLs for each job 51 | :param users_list: a list of users / emails to filter the results upon (optional for group exports) 52 | :param log_file: log file to store job configs as json entries per line 53 | :param acl_file: log file to store job ACLs 54 | :return: 55 | """ 56 | jobs_log = self.get_export_dir() + log_file 57 | acl_jobs_log = self.get_export_dir() + acl_file 58 | # pinned by cluster_user is a flag per cluster 59 | jl_full = self.get_jobs_list(False) 60 | if users_list: 61 | # filter the jobs list to only contain users that exist within this list 62 | jl = list(filter(lambda x: x['creator_user_name'] in users_list, jl_full)) 63 | else: 64 | jl = jl_full 65 | with open(jobs_log, "w") as log_fp, open(acl_jobs_log, 'w') as acl_fp: 66 | for x in jl: 67 | job_id = x['job_id'] 68 | new_job_name = x['settings']['name'] + ':::' + str(job_id) 69 | # grab the settings obj 70 | job_settings = x['settings'] 71 | # update the job name 72 | job_settings['name'] = new_job_name 73 | # reset the original struct with the new settings 74 | x['settings'] = job_settings 75 | log_fp.write(json.dumps(x) + '\n') 76 | job_perms = self.get(f'/preview/permissions/jobs/{job_id}') 77 | job_perms['job_name'] = new_job_name 78 | acl_fp.write(json.dumps(job_perms) + '\n') 79 | 80 | def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log'): 81 | jobs_log = self.get_export_dir() + log_file 82 | acl_jobs_log = self.get_export_dir() + acl_file 83 | if not 
os.path.exists(jobs_log): 84 | print("No job configurations to import.") 85 | return 86 | # get an old cluster id to new cluster id mapping object 87 | cluster_mapping = self.get_cluster_id_mapping() 88 | old_2_new_policy_ids = self.get_new_policy_id_dict() # dict { old_policy_id : new_policy_id } 89 | with open(jobs_log, 'r') as fp: 90 | for line in fp: 91 | job_conf = json.loads(line) 92 | job_creator = job_conf.get('creator_user_name', '') 93 | job_settings = job_conf['settings'] 94 | job_schedule = job_settings.get('schedule', None) 95 | if job_schedule: 96 | # set all imported jobs as paused 97 | job_schedule['pause_status'] = 'PAUSED' 98 | job_settings['schedule'] = job_schedule 99 | if 'existing_cluster_id' in job_settings: 100 | old_cid = job_settings['existing_cluster_id'] 101 | # set new cluster id for existing cluster attribute 102 | new_cid = cluster_mapping.get(old_cid, None) 103 | if not new_cid: 104 | print("Existing cluster has been removed. Resetting job to use new cluster.") 105 | job_settings.pop('existing_cluster_id') 106 | job_settings['new_cluster'] = self.get_jobs_default_cluster_conf() 107 | else: 108 | job_settings['existing_cluster_id'] = new_cid 109 | else: # new cluster config 110 | cluster_conf = job_settings['new_cluster'] 111 | if 'policy_id' in cluster_conf: 112 | old_policy_id = cluster_conf['policy_id'] 113 | cluster_conf['policy_id'] = old_2_new_policy_ids[old_policy_id] 114 | # check for instance pools and modify cluster attributes 115 | if 'instance_pool_id' in cluster_conf: 116 | new_cluster_conf = self.cleanup_cluster_pool_configs(cluster_conf, job_creator, True) 117 | else: 118 | new_cluster_conf = cluster_conf 119 | job_settings['new_cluster'] = new_cluster_conf 120 | print("Current Job Name: {0}".format(job_conf['settings']['name'])) 121 | # creator can be none if the user is no longer in the org. 
see our docs page 122 | creator_user_name = job_conf.get('creator_user_name', None) 123 | create_resp = self.post('/jobs/create', job_settings) 124 | if 'error_code' in create_resp: 125 | print("Resetting job to use default cluster configs due to expired configurations.") 126 | job_settings['new_cluster'] = self.get_jobs_default_cluster_conf() 127 | create_resp_retry = self.post('/jobs/create', job_settings) 128 | # update the jobs with their ACLs 129 | with open(acl_jobs_log, 'r') as acl_fp: 130 | job_id_by_name = self.get_job_id_by_name() 131 | for line in acl_fp: 132 | acl_conf = json.loads(line) 133 | current_job_id = job_id_by_name[acl_conf['job_name']] 134 | job_path = f'jobs/{current_job_id}' # contains `/jobs/{job_id}` path 135 | api = f'/preview/permissions/{job_path}' 136 | # get acl permissions for jobs 137 | acl_perms = self.build_acl_args(acl_conf['access_control_list'], True) 138 | acl_create_args = {'access_control_list': acl_perms} 139 | acl_resp = self.patch(api, acl_create_args) 140 | print(acl_resp) 141 | # update the imported job names 142 | self.update_imported_job_names() 143 | 144 | def pause_all_jobs(self, pause=True): 145 | job_list = self.get('/jobs/list').get('jobs', None) 146 | for job_conf in job_list: 147 | job_settings = job_conf['settings'] 148 | job_schedule = job_settings.get('schedule', None) 149 | if job_schedule: 150 | # set all imported jobs as paused or un-paused 151 | if pause: 152 | job_schedule['pause_status'] = 'PAUSED' 153 | else: 154 | job_schedule['pause_status'] = 'UNPAUSED' 155 | job_settings['schedule'] = job_schedule 156 | update_job_conf = {'job_id': job_conf['job_id'], 157 | 'new_settings': job_settings} 158 | update_job_resp = self.post('/jobs/reset', update_job_conf) 159 | 160 | def delete_all_jobs(self): 161 | job_list = self.get('/jobs/list').get('jobs', []) 162 | for job in job_list: 163 | self.post('/jobs/delete', {'job_id': job['job_id']}) 164 | 165 | def get_cluster_id_mapping(self, log_file='clusters.log'): 166 | """ 167 | Get a dict mapping of old cluster ids to new cluster ids for jobs connecting to existing clusters 168 | :param log_file: 169 | :return: 170 | """ 171 | cluster_logfile = self.get_export_dir() + log_file 172 | current_cl = self.get('/clusters/list').get('clusters', []) 173 | old_clusters = {} 174 | # build dict with old cluster name to cluster id mapping 175 | if not os.path.exists(cluster_logfile): 176 | raise ValueError('Clusters log must exist to map clusters to previous existing cluster ids') 177 | with open(cluster_logfile, 'r') as fp: 178 | for line in fp: 179 | conf = json.loads(line) 180 | old_clusters[conf['cluster_name']] = conf['cluster_id'] 181 | new_to_old_mapping = {} 182 | for new_cluster in current_cl: 183 | old_cluster_id = old_clusters.get(new_cluster['cluster_name'], None) 184 | if old_cluster_id: 185 | new_to_old_mapping[old_cluster_id] = new_cluster['cluster_id'] 186 | return new_to_old_mapping 187 | -------------------------------------------------------------------------------- /data/workspace_migration_analysis.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import json, os, datetime, requests 3 | import requests.packages.urllib3 4 | 5 | global pprint_j 6 | 7 | requests.packages.urllib3.disable_warnings() 8 | 9 | 10 | # Helper to pretty print json 11 | def pprint_j(i): 12 | print(json.dumps(i, indent=4, sort_keys=True)) 13 | 14 | 15 | class dbclient: 16 | """ 17 | Rest API Wrapper for Databricks APIs 18 | """ 19 | # 
set of http error codes to throw an exception if hit. Handles client and auth errors 20 | http_error_codes = (401, 403) 21 | 22 | def __init__(self, token, url): 23 | self._token = {'Authorization': 'Bearer {0}'.format(token)} 24 | self._url = url.rstrip("/") 25 | self._is_verbose = False 26 | self._verify_ssl = False 27 | if self._verify_ssl: 28 | # set these env variables if skip SSL verification is enabled 29 | os.environ['REQUESTS_CA_BUNDLE'] = "" 30 | os.environ['CURL_CA_BUNDLE'] = "" 31 | 32 | def is_aws(self): 33 | return self._is_aws 34 | 35 | def is_verbose(self): 36 | return self._is_verbose 37 | 38 | def is_skip_failed(self): 39 | return self._skip_failed 40 | 41 | def test_connection(self): 42 | # verify the proper url settings to configure this client 43 | if self._url[-4:] != '.com' and self._url[-4:] != '.net': 44 | print("Hostname should end in '.com'") 45 | return -1 46 | results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, 47 | verify=self._verify_ssl) 48 | http_status_code = results.status_code 49 | if http_status_code != 200: 50 | print("Error. Either the credentials have expired or the credentials don't have proper permissions.") 51 | print("If you have a ~/.netrc file, check those credentials. Those take precedence over passed input.") 52 | print(results.text) 53 | return -1 54 | return 0 55 | 56 | def get(self, endpoint, json_params=None, version='2.0', print_json=False): 57 | if version: 58 | ver = version 59 | full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint 60 | if self.is_verbose(): 61 | print("Get: {0}".format(full_endpoint)) 62 | if json_params: 63 | raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) 64 | http_status_code = raw_results.status_code 65 | if http_status_code in dbclient.http_error_codes: 66 | raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) 67 | results = raw_results.json() 68 | else: 69 | raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) 70 | http_status_code = raw_results.status_code 71 | if http_status_code in dbclient.http_error_codes: 72 | raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) 73 | results = raw_results.json() 74 | if print_json: 75 | print(json.dumps(results, indent=4, sort_keys=True)) 76 | if type(results) == list: 77 | results = {'elements': results} 78 | results['http_status_code'] = raw_results.status_code 79 | return results 80 | 81 | def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): 82 | if version: 83 | ver = version 84 | full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint 85 | if self.is_verbose(): 86 | print("{0}: {1}".format(http_type, full_endpoint)) 87 | if json_params: 88 | if http_type == 'post': 89 | if files_json: 90 | raw_results = requests.post(full_endpoint, headers=self._token, 91 | data=json_params, files=files_json, verify=self._verify_ssl) 92 | else: 93 | raw_results = requests.post(full_endpoint, headers=self._token, 94 | json=json_params, verify=self._verify_ssl) 95 | if http_type == 'put': 96 | raw_results = requests.put(full_endpoint, headers=self._token, 97 | json=json_params, verify=self._verify_ssl) 98 | if http_type == 'patch': 99 | raw_results = requests.patch(full_endpoint, headers=self._token, 100 | json=json_params, verify=self._verify_ssl) 101 | 102 | http_status_code = 
raw_results.status_code 103 | if http_status_code in dbclient.http_error_codes: 104 | raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, 105 | http_status_code, 106 | raw_results.text)) 107 | results = raw_results.json() 108 | else: 109 | print("Must have a payload in json_args param.") 110 | return {} 111 | if print_json: 112 | print(json.dumps(results, indent=4, sort_keys=True)) 113 | # if results are empty, let's return the return status 114 | if results: 115 | results['http_status_code'] = raw_results.status_code 116 | return results 117 | else: 118 | return {'http_status_code': raw_results.status_code} 119 | 120 | def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): 121 | return self.http_req('post', endpoint, json_params, version, print_json, files_json) 122 | 123 | def put(self, endpoint, json_params, version='2.0', print_json=False): 124 | return self.http_req('put', endpoint, json_params, version, print_json) 125 | 126 | def patch(self, endpoint, json_params, version='2.0', print_json=False): 127 | return self.http_req('patch', endpoint, json_params, version, print_json) 128 | 129 | @staticmethod 130 | def my_map(F, items): 131 | to_return = [] 132 | for elem in items: 133 | to_return.append(F(elem)) 134 | return to_return 135 | 136 | def set_export_dir(self, dir_location): 137 | self._export_dir = dir_location 138 | 139 | def get_export_dir(self): 140 | return self._export_dir 141 | 142 | def get_latest_spark_version(self): 143 | versions = self.get('/clusters/spark-versions')['versions'] 144 | v_sorted = sorted(versions, key=lambda i: i['key'], reverse=True) 145 | for x in v_sorted: 146 | img_type = x['key'].split('-')[1][0:5] 147 | if img_type == 'scala': 148 | return x 149 | 150 | 151 | # COMMAND ---------- 152 | 153 | class migrateclient(dbclient): 154 | 155 | def get_num_defined_jobs(self): 156 | jobs_list = self.get('/jobs/list').get('jobs', []) 157 | return len(jobs_list) 158 | 159 | def get_num_external_jobs(self): 160 | job_runs = self.get('/jobs/runs/list').get('runs', []) 161 | job_ids_list = set(map(lambda x: x.get('job_id', None), self.get('/jobs/list').get('jobs', []))) 162 | job_ids_from_runs = set(map(lambda x: x.get('job_id', None), job_runs)) 163 | ephemeral_job_ids = job_ids_from_runs - job_ids_list 164 | return len(ephemeral_job_ids) 165 | 166 | def get_num_users(self): 167 | users = self.get('/preview/scim/v2/Users').get('Resources', []) 168 | return len(users) 169 | 170 | def get_num_groups(self): 171 | groups = self.get('/preview/scim/v2/Groups').get('Resources', []) 172 | return len(groups) 173 | 174 | def get_num_notebooks(self, second_level=False): 175 | users = self.get('/preview/scim/v2/Users').get('Resources', []) 176 | total_nbs = 0 177 | second_level_dirs = [] 178 | for user in users: 179 | path = '/Users/' + user['userName'] 180 | ls = self.get('/workspace/list', {'path' : path}).get('objects', []) 181 | nbs = list(filter(lambda x: x.get('object_type', None) == 'NOTEBOOK', ls)) 182 | total_nbs += len(nbs) 183 | dirs = list(filter(lambda x: x.get('object_type', None) == 'DIRECTORY', ls)) 184 | for p in dirs: 185 | dir_path = p.get('path') 186 | ls_dir = self.get('/workspace/list', {'path' : dir_path}).get('objects', []) 187 | dir_nbs = list(filter(lambda x: x.get('object_type', None) == 'NOTEBOOK', ls_dir)) 188 | second_level_dirs.extend(filter(lambda x: x.get('object_type', None) == 'DIRECTORY', ls_dir)) 189 | total_nbs += len(dir_nbs) 190 | # search 2 levels deep only to get an 
approximate notebook count 191 | if second_level: 192 | for p in second_level_dirs: 193 | dir_path = p.get('path') 194 | ls_dir = self.get('/workspace/list', {'path' : dir_path}).get('objects', []) 195 | dir_nbs = list(filter(lambda x: x.get('object_type', None) == 'NOTEBOOK', ls_dir)) 196 | total_nbs += len(dir_nbs) 197 | return total_nbs 198 | 199 | def get_num_databases(self): 200 | dbs = spark.catalog.listDatabases() 201 | return len(dbs) 202 | 203 | def get_num_tables(self): 204 | dbs = spark.catalog.listDatabases() 205 | table_count = 0 206 | for db in dbs: 207 | tables = spark.catalog.listTables(db.name) 208 | table_count += len(tables) 209 | return table_count 210 | 211 | 212 | # COMMAND ---------- 213 | 214 | url = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) 215 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 216 | 217 | client = migrateclient(token, url) 218 | 219 | # COMMAND ---------- 220 | 221 | print("Num of users: ", client.get_num_users()) 222 | print("Num of groups: ", client.get_num_groups()) 223 | print("Approximate num of notebooks: ", client.get_num_notebooks(True)) 224 | print("Num of internal jobs: ", client.get_num_defined_jobs()) 225 | print("Num of external jobs: ", client.get_num_external_jobs()) 226 | print("Num of databases: ", client.get_num_databases()) 227 | print("Num of tables: ", client.get_num_tables()) 228 | 229 | # COMMAND ---------- 230 | 231 | 232 | -------------------------------------------------------------------------------- /export_db.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | from timeit import default_timer as timer 3 | from datetime import timedelta, datetime 4 | import os 5 | import shutil 6 | 7 | 8 | # python 3.6 9 | def main(): 10 | # define a parser to identify what component to import / export 11 | my_parser = get_export_parser() 12 | # parse the args 13 | args = my_parser.parse_args() 14 | 15 | if os.name == 'nt' and (not args.bypass_windows_check): 16 | raise ValueError('This tool currently does not support running on Windows OS') 17 | 18 | # parse the path location of the Databricks CLI configuration 19 | login_args = get_login_credentials(profile=args.profile) 20 | if is_azure_creds(login_args) and (not args.azure): 21 | raise ValueError('Login credentials do not match args. 
Please provide --azure flag for azure envs.') 22 | 23 | # cant use netrc credentials because requests module tries to load the credentials into http basic auth headers 24 | # parse the credentials 25 | url = login_args['host'] 26 | token = login_args['token'] 27 | client_config = build_client_config(url, token, args) 28 | 29 | os.makedirs(client_config['export_dir'], exist_ok=True) 30 | 31 | if client_config['debug']: 32 | print(url, token) 33 | now = str(datetime.now()) 34 | 35 | if args.users: 36 | print("Export all users and groups at {0}".format(now)) 37 | scim_c = ScimClient(client_config) 38 | start = timer() 39 | # log all users 40 | scim_c.log_all_users() 41 | end = timer() 42 | print("Complete Users Export Time: " + str(timedelta(seconds=end - start))) 43 | start = timer() 44 | # log all groups 45 | scim_c.log_all_groups() 46 | end = timer() 47 | print("Complete Group Export Time: " + str(timedelta(seconds=end - start))) 48 | # log the instance profiles 49 | if scim_c.is_aws(): 50 | cl_c = ClustersClient(client_config) 51 | print("Start instance profile logging ...") 52 | start = timer() 53 | cl_c.log_instance_profiles() 54 | end = timer() 55 | print("Complete Instance Profile Export Time: " + str(timedelta(seconds=end - start))) 56 | 57 | if args.workspace: 58 | print("Export the complete workspace at {0}".format(now)) 59 | ws_c = WorkspaceClient(client_config) 60 | start = timer() 61 | # log notebooks and libraries 62 | ws_c.init_workspace_logfiles() 63 | num_notebooks = ws_c.log_all_workspace_items() 64 | print("Total number of notebooks logged: ", num_notebooks) 65 | end = timer() 66 | print("Complete Workspace Export Time: " + str(timedelta(seconds=end - start))) 67 | 68 | if args.workspace_acls: 69 | print("Export the ACLs for workspace objects at {0}".format(now)) 70 | ws_c = WorkspaceClient(client_config) 71 | start = timer() 72 | # log notebooks and directory acls 73 | ws_c.log_all_workspace_acls() 74 | end = timer() 75 | print("Complete Workspace Permission Export Time: " + str(timedelta(seconds=end - start))) 76 | 77 | if args.download: 78 | print("Starting complete workspace download at {0}".format(now)) 79 | ws_c = WorkspaceClient(client_config) 80 | start = timer() 81 | # log notebooks and libraries 82 | num_notebooks = ws_c.download_notebooks() 83 | print(f"Total number of notebooks downloaded: {num_notebooks}") 84 | end = timer() 85 | print("Complete Workspace Download Time: " + str(timedelta(seconds=end - start))) 86 | 87 | if args.libs: 88 | if not client_config['is_aws']: 89 | print("Databricks does not support library exports on Azure today") 90 | else: 91 | print("Starting complete library log at {0}".format(now)) 92 | lib_c = LibraryClient(client_config) 93 | start = timer() 94 | lib_c.log_library_details() 95 | end = timer() 96 | print("Complete Library Download Time: " + str(timedelta(seconds=end - start))) 97 | 98 | if args.clusters: 99 | print("Export the cluster configs at {0}".format(now)) 100 | cl_c = ClustersClient(client_config) 101 | start = timer() 102 | # log the cluster json 103 | cl_c.log_cluster_configs() 104 | cl_c.log_cluster_policies() 105 | end = timer() 106 | print("Complete Cluster Export Time: " + str(timedelta(seconds=end - start))) 107 | # log the instance pools 108 | print("Start instance pool logging ...") 109 | start = timer() 110 | cl_c.log_instance_pools() 111 | end = timer() 112 | print("Complete Instance Pools Export Time: " + str(timedelta(seconds=end - start))) 113 | 114 | if args.jobs: 115 | print("Export the jobs configs at 
{0}".format(now)) 116 | start = timer() 117 | jobs_c = JobsClient(client_config) 118 | # log job configs 119 | jobs_c.log_job_configs() 120 | end = timer() 121 | print("Complete Jobs Export Time: " + str(timedelta(seconds=end - start))) 122 | 123 | if args.pause_all_jobs: 124 | print("Pause all current jobs {0}".format(now)) 125 | start = timer() 126 | jobs_c = JobsClient(client_config) 127 | # log job configs 128 | jobs_c.pause_all_jobs() 129 | end = timer() 130 | print("Paused all jobs time: " + str(timedelta(seconds=end - start))) 131 | 132 | if args.unpause_all_jobs: 133 | print("Unpause all current jobs {0}".format(now)) 134 | start = timer() 135 | jobs_c = JobsClient(client_config) 136 | # log job configs 137 | jobs_c.pause_all_jobs(False) 138 | end = timer() 139 | print("Unpaused all jobs time: " + str(timedelta(seconds=end - start))) 140 | 141 | if args.metastore or args.metastore_unicode: 142 | print("Export the metastore configs at {0}".format(now)) 143 | start = timer() 144 | hive_c = HiveClient(client_config) 145 | if args.database is not None: 146 | # export only a single database with a given iam role 147 | database_name = args.database 148 | hive_c.export_database(database_name, args.cluster_name, args.iam, has_unicode=args.metastore_unicode) 149 | else: 150 | # export all of the metastore 151 | hive_c.export_hive_metastore(cluster_name=args.cluster_name, has_unicode=args.metastore_unicode) 152 | end = timer() 153 | print("Complete Metastore Export Time: " + str(timedelta(seconds=end - start))) 154 | 155 | if args.secrets: 156 | if not args.cluster_name: 157 | print("Please provide an existing cluster name w/ --cluster-name option\n") 158 | return 159 | print("Export the secret scopes configs at {0}".format(now)) 160 | start = timer() 161 | sc = SecretsClient(client_config) 162 | # log job configs 163 | sc.log_all_secrets(args.cluster_name) 164 | sc.log_all_secrets_acls() 165 | end = timer() 166 | print("Complete Secrets Export Time: " + str(timedelta(seconds=end - start))) 167 | 168 | if args.mounts: 169 | print("Export the mount configs at {0}".format(now)) 170 | start = timer() 171 | dbfs_c = DbfsClient(client_config) 172 | # log job configs 173 | dbfs_c.export_dbfs_mounts() 174 | end = timer() 175 | print("Complete Mounts Export Time: " + str(timedelta(seconds=end - start))) 176 | 177 | if args.update_account_id and args.old_account_id: 178 | print("Updating old account id to new account at {0}".format(now)) 179 | start = timer() 180 | client = dbclient(client_config) 181 | client.update_account_id(args.update_account_id, args.old_account_id) 182 | end = timer() 183 | print("Complete account id update time: " + str(timedelta(seconds=end - start))) 184 | 185 | if args.replace_old_email and args.update_new_email: 186 | print("Updating old email to new email address at {0}".format(now)) 187 | start = timer() 188 | client = dbclient(client_config) 189 | client.update_email_addresses(args.replace_old_email, args.update_new_email) 190 | end = timer() 191 | print("Complete email update time: " + str(timedelta(seconds=end - start))) 192 | 193 | if args.replace_email: 194 | print("Updating old email(s) to new email(s)) at {0}".format(now)) 195 | start = timer() 196 | client = dbclient(client_config) 197 | #parse list list of e-mail mapping pairs. 
Format is: old1@email.com:new1@e-mail.com,old2email.com:new2@email.com 198 | emailpairs = args.replace_email.split(',') 199 | print(str(len(emailpairs)) +' emails found to replace') 200 | for emailpair in emailpairs: 201 | if len(emailpair.split(':')) < 2: 202 | print('Syntax error in e-mail '+emailpair+'. Old e-mail address and new e-mail address new to be separated by a :') 203 | else: 204 | old_email=emailpair.split(':')[0] 205 | new_email=emailpair.split(':')[1] 206 | print('Replacing old e-mail: '+old_email+' with new e-mail '+new_email) 207 | client.update_email_addresses(old_email, new_email) 208 | end = timer() 209 | print("Complete email update time: " + str(timedelta(seconds=end - start))) 210 | 211 | if args.single_user: 212 | user_email = args.single_user 213 | print(f"Export user {user_email} at {now}") 214 | scim_c = ScimClient(client_config) 215 | start = timer() 216 | # log all users 217 | scim_c.log_single_user(user_email) 218 | end = timer() 219 | print("Complete single user export: " + str(timedelta(seconds=end - start))) 220 | 221 | if args.workspace_top_level_only: 222 | print("Export top level workspace objects at {0}".format(now)) 223 | ws_c = WorkspaceClient(client_config) 224 | start = timer() 225 | # log notebooks and directory acls 226 | ws_c.export_top_level_folders() 227 | end = timer() 228 | print("Complete Workspace Top Level Notebooks Export Time: " + str(timedelta(seconds=end - start))) 229 | 230 | if args.export_home: 231 | username = args.export_home 232 | print("Exporting home directory: {0}".format(username)) 233 | ws_c = WorkspaceClient(client_config) 234 | start = timer() 235 | # log notebooks and libraries 236 | ws_c.export_user_home(username, 'user_exports') 237 | end = timer() 238 | print("Complete User Export Time: " + str(timedelta(seconds=end - start))) 239 | 240 | if args.export_groups: 241 | group_name_list = convert_args_to_list(args.export_groups) 242 | print("Exporting Groups: {0}".format(group_name_list)) 243 | start = timer() 244 | scim_c = ScimClient(client_config) 245 | # log notebooks and libraries 246 | user_names = scim_c.log_groups_from_list(group_name_list) 247 | print('Export users notebooks:', user_names) 248 | ws_c = WorkspaceClient(client_config) 249 | for username in user_names: 250 | is_user_home_empty = ws_c.is_user_home_empty(username) 251 | if not is_user_home_empty: 252 | ws_c.export_user_home(username, 'user_exports') 253 | print('Exporting users jobs:') 254 | jobs_c = JobsClient(client_config) 255 | jobs_c.log_job_configs(users_list=user_names) 256 | end = timer() 257 | print("Complete User Export Time: " + str(timedelta(seconds=end - start))) 258 | 259 | if args.reset_exports: 260 | print('Request to clean up old export directory') 261 | start = timer() 262 | client = dbclient(client_config) 263 | export_dir = client.get_export_dir() 264 | response = prompt_for_input(f'\nPlease confirm that you would like to delete all the logs from {export_dir}' 265 | f' [yes/no]:') 266 | if response: 267 | print('Deleting old export directory and logs ...') 268 | try: 269 | shutil.rmtree(export_dir) 270 | except OSError as e: 271 | print("Error: %s - %s." 
% (e.filename, e.strerror)) 272 | end = timer() 273 | print("Completed cleanup: " + str(timedelta(seconds=end - start))) 274 | 275 | 276 | if __name__ == '__main__': 277 | main() 278 | -------------------------------------------------------------------------------- /dbclient/dbclient.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import requests 4 | import fileinput 5 | import requests.packages.urllib3 6 | 7 | global pprint_j 8 | 9 | requests.packages.urllib3.disable_warnings() 10 | 11 | 12 | # Helper to pretty print json 13 | def pprint_j(i): 14 | print(json.dumps(i, indent=4, sort_keys=True)) 15 | 16 | 17 | class dbclient: 18 | """ 19 | Rest API Wrapper for Databricks APIs 20 | """ 21 | # set of http error codes to throw an exception if hit. Handles client and auth errors 22 | http_error_codes = (401, 403) 23 | 24 | def __init__(self, configs): 25 | self._token = {'Authorization': 'Bearer {0}'.format(configs['token'])} 26 | self._url = configs['url'].rstrip("/") 27 | self._export_dir = configs['export_dir'] 28 | self._is_aws = configs['is_aws'] 29 | self._skip_failed = configs['skip_failed'] 30 | self._is_verbose = configs['verbose'] 31 | self._verify_ssl = configs['verify_ssl'] 32 | self._file_format = configs['file_format'] 33 | if self._verify_ssl: 34 | # set these env variables if skip SSL verification is enabled 35 | os.environ['REQUESTS_CA_BUNDLE'] = "" 36 | os.environ['CURL_CA_BUNDLE'] = "" 37 | os.makedirs(self._export_dir, exist_ok=True) 38 | 39 | def is_aws(self): 40 | return self._is_aws 41 | 42 | def is_verbose(self): 43 | return self._is_verbose 44 | 45 | def is_skip_failed(self): 46 | return self._skip_failed 47 | 48 | def get_file_format(self): 49 | return self._file_format 50 | 51 | def is_source_file_format(self): 52 | if self._file_format == 'SOURCE': 53 | return True 54 | return False 55 | 56 | def test_connection(self): 57 | # verify the proper url settings to configure this client 58 | if self._url[-4:] != '.com' and self._url[-4:] != '.net': 59 | print("Hostname should end in '.com'") 60 | return -1 61 | results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, 62 | verify=self._verify_ssl) 63 | http_status_code = results.status_code 64 | if http_status_code != 200: 65 | print("Error. Either the credentials have expired or the credentials don't have proper permissions.") 66 | print("If you have a ~/.netrc file, check those credentials. 
Those take precedence over passed input.") 67 | print(results.text) 68 | return -1 69 | return 0 70 | 71 | @staticmethod 72 | def delete_dir_if_empty(local_dir): 73 | if len(os.listdir(local_dir)) == 0: 74 | os.rmdir(local_dir) 75 | 76 | def get(self, endpoint, json_params=None, version='2.0', print_json=False): 77 | if version: 78 | ver = version 79 | full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint 80 | if self.is_verbose(): 81 | print("Get: {0}".format(full_endpoint)) 82 | if json_params: 83 | raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) 84 | http_status_code = raw_results.status_code 85 | if http_status_code in dbclient.http_error_codes: 86 | raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) 87 | results = raw_results.json() 88 | else: 89 | raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) 90 | http_status_code = raw_results.status_code 91 | if http_status_code in dbclient.http_error_codes: 92 | raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) 93 | results = raw_results.json() 94 | if print_json: 95 | print(json.dumps(results, indent=4, sort_keys=True)) 96 | if type(results) == list: 97 | results = {'elements': results} 98 | results['http_status_code'] = raw_results.status_code 99 | return results 100 | 101 | def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): 102 | if version: 103 | ver = version 104 | full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint 105 | if self.is_verbose(): 106 | print("{0}: {1}".format(http_type, full_endpoint)) 107 | if json_params: 108 | if http_type == 'post': 109 | if files_json: 110 | raw_results = requests.post(full_endpoint, headers=self._token, 111 | data=json_params, files=files_json, verify=self._verify_ssl) 112 | else: 113 | raw_results = requests.post(full_endpoint, headers=self._token, 114 | json=json_params, verify=self._verify_ssl) 115 | if http_type == 'put': 116 | raw_results = requests.put(full_endpoint, headers=self._token, 117 | json=json_params, verify=self._verify_ssl) 118 | if http_type == 'patch': 119 | raw_results = requests.patch(full_endpoint, headers=self._token, 120 | json=json_params, verify=self._verify_ssl) 121 | http_status_code = raw_results.status_code 122 | if http_status_code in dbclient.http_error_codes: 123 | raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, 124 | http_status_code, 125 | raw_results.text)) 126 | results = raw_results.json() 127 | else: 128 | print("Must have a payload in json_args param.") 129 | return {} 130 | if print_json: 131 | print(json.dumps(results, indent=4, sort_keys=True)) 132 | # if results are empty, let's return the return status 133 | if results: 134 | results['http_status_code'] = raw_results.status_code 135 | return results 136 | else: 137 | return {'http_status_code': raw_results.status_code} 138 | 139 | def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): 140 | return self.http_req('post', endpoint, json_params, version, print_json, files_json) 141 | 142 | def put(self, endpoint, json_params, version='2.0', print_json=False): 143 | return self.http_req('put', endpoint, json_params, version, print_json) 144 | 145 | def patch(self, endpoint, json_params, version='2.0', print_json=False): 146 | return self.http_req('patch', endpoint, 
json_params, version, print_json) 147 | 148 | @staticmethod 149 | def get_num_lines(filename): 150 | i = 0 151 | with open(filename) as f: 152 | for i, l in enumerate(f): 153 | pass 154 | return i + 1 155 | 156 | @staticmethod 157 | def get_key(http_resp, key_name): 158 | value = http_resp.get(key_name, None) 159 | if value is None: 160 | print(http_resp) 161 | raise ValueError('Unable to find key') 162 | return value 163 | 164 | @staticmethod 165 | def my_map(F, items): 166 | to_return = [] 167 | for elem in items: 168 | to_return.append(F(elem)) 169 | return to_return 170 | 171 | def whoami(self): 172 | """ 173 | get current user userName from SCIM API 174 | :return: username string 175 | """ 176 | user_name = self.get('/preview/scim/v2/Me').get('userName') 177 | return user_name 178 | 179 | def build_acl_args(self, full_acl_list, is_jobs=False): 180 | """ 181 | Take the ACL json and return a json that corresponds to the proper input with permission level one level higher 182 | { 'acl': [ { (user_name, group_name): {'permission_level': '*'}, ... ] } 183 | for job ACLs, we need to reset the OWNER, so set the admin as CAN_MANAGE instead 184 | :param full_acl_list: 185 | :return: 186 | """ 187 | acls_list = [] 188 | current_owner = '' 189 | for member in full_acl_list: 190 | permissions = member.get('all_permissions')[0].get('permission_level') 191 | if 'user_name' in member: 192 | acls_list.append({'user_name': member.get('user_name'), 193 | 'permission_level': permissions}) 194 | if permissions == 'IS_OWNER': 195 | current_owner = member.get('user_name') 196 | else: 197 | if member.get('group_name') != 'admins': 198 | acls_list.append({'group_name': member.get('group_name'), 199 | 'permission_level': permissions}) 200 | if permissions == 'IS_OWNER': 201 | current_owner = member.get('group_name') 202 | 203 | if is_jobs: 204 | me = self.whoami() 205 | if current_owner != me: 206 | update_admin = {'user_name': self.whoami(), 207 | 'permission_level': 'CAN_MANAGE'} 208 | acls_list.append(update_admin) 209 | return acls_list 210 | 211 | def set_export_dir(self, dir_location): 212 | self._export_dir = dir_location 213 | 214 | def get_export_dir(self): 215 | return self._export_dir 216 | 217 | def get_url(self): 218 | return self._url 219 | 220 | def get_latest_spark_version(self): 221 | versions = self.get('/clusters/spark-versions')['versions'] 222 | v_sorted = sorted(versions, key=lambda i: i['key'], reverse=True) 223 | for x in v_sorted: 224 | img_type = x['key'].split('-')[1][0:5] 225 | if img_type == 'scala': 226 | return x 227 | 228 | def replace_file_contents(self, old_str, new_str, filename): 229 | """ 230 | regex replace all occurrences of a string with a new value 231 | :param old_str: old value to replace, e.g. account id, old email, etc. 
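(note: the substitution is a literal str.replace applied to every line of the logfile, not a regex match)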
232 | :param new_str: new value 233 | :param filename: logfile path relative to the export dir 234 | :return: 235 | """ 236 | log_dir = self.get_export_dir() 237 | update_filename = log_dir + filename 238 | with fileinput.FileInput(update_filename, inplace=True, backup='.bak') as fp: 239 | for line in fp: 240 | print(line.replace(old_str, new_str), end='') 241 | # cleanup old backup file once completed 242 | f_backup = log_dir + filename + '.bak' 243 | os.remove(f_backup) 244 | 245 | def update_account_id(self, new_aws_account_id, old_account_id): 246 | log_dir = self.get_export_dir() 247 | logs_to_update = ['users.log', 248 | 'instance_profiles.log', 'clusters.log', 'cluster_policies.log', 249 | 'jobs.log'] 250 | # update individual logs first 251 | for log_name in logs_to_update: 252 | if os.path.exists(log_dir + log_name): 253 | self.replace_file_contents(old_account_id, new_aws_account_id, log_name) 254 | # # update group logs 255 | group_dir = log_dir + 'groups/' 256 | groups = os.listdir(group_dir) 257 | for group_name in groups: 258 | group_file = 'groups/' + group_name 259 | if os.path.exists(log_dir + group_file): 260 | self.replace_file_contents(old_account_id, new_aws_account_id, group_file) 261 | 262 | def update_email_addresses(self, old_email_address, new_email_address): 263 | """ 264 | :param old_email_address: 265 | :param new_email_address: 266 | :return: 267 | """ 268 | log_dir = self.get_export_dir() 269 | logs_to_update = ['users.log', 270 | 'acl_jobs.log', 271 | 'acl_clusters.log', 'acl_cluster_policies.log', 272 | 'acl_notebooks.log', 'acl_directories.log'] 273 | for logfile in logs_to_update: 274 | if os.path.exists(log_dir + logfile): 275 | self.replace_file_contents(old_email_address, new_email_address, logfile) 276 | # update the path for user notebooks in bulk export mode 277 | bulk_export_dir = log_dir + 'artifacts/Users/' 278 | old_bulk_export_dir = bulk_export_dir + old_email_address 279 | new_bulk_export_dir = bulk_export_dir + new_email_address 280 | if os.path.exists(old_bulk_export_dir): 281 | os.rename(old_bulk_export_dir, new_bulk_export_dir) 282 | # update the path for user notebooks in single user export mode 283 | single_user_dir = log_dir + 'user_exports/' 284 | old_single_user_dir = single_user_dir + old_email_address 285 | new_single_user_dir = single_user_dir + new_email_address 286 | if os.path.exists(old_single_user_dir): 287 | os.rename(old_single_user_dir, new_single_user_dir) 288 | old_single_user_nbs_dir = new_single_user_dir + '/user_artifacts/Users/' + old_email_address 289 | new_single_user_nbs_dir = new_single_user_dir + '/user_artifacts/Users/' + new_email_address 290 | if os.path.exists(old_single_user_nbs_dir): 291 | os.rename(old_single_user_nbs_dir, new_single_user_nbs_dir) 292 | print("Update email address complete") 293 | -------------------------------------------------------------------------------- /dbclient/parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import configparser 3 | import re 4 | from enum import Enum 5 | from os import path 6 | 7 | auth_key = ['host', 8 | 'username', 9 | 'token'] 10 | 11 | 12 | class NotebookFormat(Enum): 13 | dbc = 'DBC' 14 | source = 'SOURCE' 15 | html = 'HTML' 16 | # jupyter is only supported for python notebooks. 
consider adding this back if there's demand 17 | # jupyter = 'JUPYTER' 18 | 19 | def __str__(self): 20 | return self.value 21 | 22 | 23 | def is_azure_creds(creds): 24 | if 'azuredatabricks.net' in creds['host']: 25 | return True 26 | return False 27 | 28 | 29 | def convert_args_to_list(arg_str): 30 | arg_list = map(lambda x: x.lstrip().rstrip(), arg_str.split(',')) 31 | return list(arg_list) 32 | 33 | 34 | def get_login_credentials(creds_path='~/.databrickscfg', profile='DEFAULT'): 35 | config = configparser.ConfigParser() 36 | abs_creds_path = path.expanduser(creds_path) 37 | config.read(abs_creds_path) 38 | try: 39 | current_profile = dict(config[profile]) 40 | return current_profile 41 | except KeyError: 42 | raise ValueError('Unable to find credentials to load for profile. Profile only supports tokens.') 43 | 44 | 45 | def get_export_user_parser(): 46 | # export workspace items 47 | parser = argparse.ArgumentParser(description='Export user(s) workspace artifacts from Databricks') 48 | 49 | parser.add_argument('--profile', action='store', default='DEFAULT', 50 | help='Profile to parse the credentials') 51 | 52 | parser.add_argument('--azure', action='store_true', default=False, 53 | help='Run on Azure. (Default is AWS)') 54 | 55 | parser.add_argument('--skip-failed', action='store_true', default=False, 56 | help='Skip retries for any failed hive metastore exports.') 57 | 58 | parser.add_argument('--silent', action='store_true', default=False, 59 | help='Silent all logging of export operations.') 60 | # Don't verify ssl 61 | parser.add_argument('--no-ssl-verification', action='store_true', 62 | help='Set Verify=False when making http requests.') 63 | 64 | parser.add_argument('--debug', action='store_true', 65 | help='Enable debug logging') 66 | 67 | parser.add_argument('--set-export-dir', action='store', 68 | help='Set the base directory to export artifacts') 69 | 70 | parser.add_argument('--users', action='store', 71 | help='Download user(s) artifacts such as notebooks, cluster specs, jobs. ' 72 | 'Provide a list of user ids / emails to export') 73 | 74 | return parser 75 | 76 | 77 | def get_export_parser(): 78 | # export workspace items 79 | parser = argparse.ArgumentParser(description='Export full workspace artifacts from Databricks') 80 | 81 | # export all users and groups 82 | parser.add_argument('--users', action='store_true', 83 | help='Download all the users and groups in the workspace') 84 | 85 | # log all user workspace paths 86 | parser.add_argument('--workspace', action='store_true', 87 | help='Log all the notebook paths in the workspace. 
(metadata only)') 88 | 89 | parser.add_argument('--notebook-format', type=NotebookFormat, 90 | choices=list(NotebookFormat), default=NotebookFormat.dbc, 91 | help='Choose the file format to download the notebooks (default: DBC)') 92 | 93 | # download all user workspace notebooks 94 | parser.add_argument('--download', action='store_true', 95 | help='Download all notebooks for the environment') 96 | 97 | # add all lib configs 98 | parser.add_argument('--libs', action='store_true', 99 | help='Log all the libs for the environment') 100 | 101 | # add all clusters configs 102 | parser.add_argument('--clusters', action='store_true', 103 | help='Log all the clusters for the environment') 104 | 105 | # get all job configs 106 | parser.add_argument('--jobs', action='store_true', 107 | help='Log all the job configs for the environment') 108 | # get all metastore 109 | parser.add_argument('--metastore', action='store_true', 110 | help='log all the metastore table definitions') 111 | 112 | # get all secret scopes 113 | parser.add_argument('--secrets', action='store_true', 114 | help='log all the secret scopes') 115 | 116 | # get all metastore 117 | parser.add_argument('--metastore-unicode', action='store_true', 118 | help='log all the metastore table definitions including unicode characters') 119 | 120 | # cluster name used to export the metastore 121 | parser.add_argument('--cluster-name', action='store', 122 | help='Cluster name to export the metastore to a specific cluster. Cluster will be started.') 123 | 124 | # get database to export for metastore 125 | parser.add_argument('--database', action='store', 126 | help='Database name to export for the metastore. Single database name supported') 127 | 128 | # iam role used to export the metastore 129 | parser.add_argument('--iam', action='store', 130 | help='IAM Instance Profile to export metastore entires') 131 | 132 | # skip failures 133 | parser.add_argument('--skip-failed', action='store_true', default=False, 134 | help='Skip retries for any failed hive metastore exports.') 135 | 136 | # get mount points 137 | parser.add_argument('--mounts', action='store_true', default=False, 138 | help='Log all mount points.') 139 | # get azure logs 140 | parser.add_argument('--azure', action='store_true', default=False, 141 | help='Run on Azure. (Default is AWS)') 142 | # 143 | parser.add_argument('--profile', action='store', default='DEFAULT', 144 | help='Profile to parse the credentials') 145 | 146 | parser.add_argument('--single-user', action='store', 147 | help='User\'s email to export their user identity and entitlements') 148 | 149 | parser.add_argument('--export-home', action='store', 150 | help='User workspace name to export, typically the users email address') 151 | 152 | parser.add_argument('--export-groups', action='store', 153 | help='Group names to export as a set. 
Includes group, users, and notebooks.') 154 | 155 | parser.add_argument('--workspace-acls', action='store_true', 156 | help='Permissions for workspace objects to export') 157 | 158 | parser.add_argument('--workspace-top-level-only', action='store_true', 159 | help='Download only top level notebook directories') 160 | 161 | parser.add_argument('--silent', action='store_true', default=False, 162 | help='Silent all logging of export operations.') 163 | # Don't verify ssl 164 | parser.add_argument('--no-ssl-verification', action='store_true', 165 | help='Set Verify=False when making http requests.') 166 | 167 | parser.add_argument('--debug', action='store_true', 168 | help='Enable debug logging') 169 | 170 | parser.add_argument('--reset-exports', action='store_true', 171 | help='Clear export directory') 172 | 173 | parser.add_argument('--set-export-dir', action='store', 174 | help='Set the base directory to export artifacts') 175 | 176 | parser.add_argument('--pause-all-jobs', action='store_true', 177 | help='Pause all scheduled jobs') 178 | 179 | parser.add_argument('--unpause-all-jobs', action='store_true', 180 | help='Unpause all scheduled jobs') 181 | 182 | parser.add_argument('--update-account-id', action='store', 183 | help='Set the account id for instance profiles to a new account id') 184 | 185 | parser.add_argument('--old-account-id', action='store', 186 | help='Old account ID to filter on') 187 | 188 | parser.add_argument('--replace-old-email', action='store', 189 | help='Old email address to update from logs') 190 | 191 | parser.add_argument('--update-new-email', action='store', 192 | help='New email address to replace the logs') 193 | 194 | parser.add_argument('--replace-email', action='store', 195 | help='Update old emails with new e-mails. NOTE: Similar to replace-old-email but capable of using multiple e-mails. Format old1@email:new1@email.com,old2@email.com:new2@email.com') 196 | 197 | parser.add_argument('--bypass-windows-check', action='store_true', 198 | help='By-pass windows os checker') 199 | return parser 200 | 201 | 202 | def get_import_parser(): 203 | # import workspace items parser 204 | parser = argparse.ArgumentParser(description='Import full workspace artifacts into Databricks') 205 | 206 | # import all users and groups 207 | parser.add_argument('--users', action='store_true', 208 | help='Import all the users and groups from the logfile.') 209 | 210 | # import all notebooks 211 | parser.add_argument('--workspace', action='store_true', 212 | help='Import all notebooks from export dir into the workspace.') 213 | 214 | # skip previous successful imports 215 | parser.add_argument('--restart-from-checkpoint', action='store_true', 216 | help='Restart the workspace import and skip previously successful imports. ' 217 | 'Only works with --workspace option') 218 | 219 | parser.add_argument('--workspace-top-level', action='store_true', 220 | help='Import all top level notebooks from export dir into the workspace. 
Excluding Users dirs') 221 | 222 | parser.add_argument('--workspace-acls', action='store_true', 223 | help='Permissions for workspace objects to import') 224 | 225 | parser.add_argument('--notebook-format', type=NotebookFormat, 226 | choices=list(NotebookFormat), default=NotebookFormat.dbc, 227 | help='Choose the file format of the notebook to import (default: DBC)') 228 | 229 | parser.add_argument('--import-home', action='store', 230 | help='User workspace name to import, typically the users email address') 231 | 232 | parser.add_argument('--import-groups', action='store_true', 233 | help='Groups to import into a new workspace. Includes group creation and user notebooks.') 234 | 235 | # import all notebooks 236 | parser.add_argument('--archive-missing', action='store_true', 237 | help='Import all missing users into the top level /Archive/ directory.') 238 | 239 | # import all lib configs 240 | parser.add_argument('--libs', action='store_true', 241 | help='Import all the libs from the logfile into the workspace.') 242 | 243 | # import all clusters configs 244 | parser.add_argument('--clusters', action='store_true', 245 | help='Import all the cluster configs for the environment') 246 | 247 | # import all job configs 248 | parser.add_argument('--jobs', action='store_true', 249 | help='Import all job configurations to the environment.') 250 | 251 | # import all metastore 252 | parser.add_argument('--metastore', action='store_true', 253 | help='Import the metastore to the workspace.') 254 | 255 | # import all metastore including defns with unicode 256 | parser.add_argument('--metastore-unicode', action='store_true', 257 | help='Import all the metastore table definitions with unicode characters') 258 | 259 | parser.add_argument('--get-repair-log', action='store_true', 260 | help='Report on current tables requiring repairs') 261 | 262 | # cluster name used to import the metastore 263 | parser.add_argument('--cluster-name', action='store', 264 | help='Cluster name to import the metastore to a specific cluster. Cluster will be started.') 265 | # skip failures 266 | parser.add_argument('--skip-failed', action='store_true', default=False, 267 | help='Skip missing users that do not exist when importing user notebooks') 268 | 269 | # get azure logs 270 | parser.add_argument('--azure', action='store_true', 271 | help='Run on Azure. 
(Default is AWS)') 272 | # 273 | parser.add_argument('--profile', action='store', default='DEFAULT', 274 | help='Profile to parse the credentials') 275 | 276 | parser.add_argument('--single-user', action='store', 277 | help='User\'s email to export their user identity and entitlements') 278 | 279 | # Don't verify ssl 280 | parser.add_argument('--no-ssl-verification', action='store_true', 281 | help='Set Verify=False when making http requests.') 282 | 283 | parser.add_argument('--silent', action='store_true', 284 | help='Silent all logging of import operations.') 285 | 286 | parser.add_argument('--debug', action='store_true', 287 | help='Enable debug logging') 288 | 289 | parser.add_argument('--set-export-dir', action='store', 290 | help='Set the base directory to import artifacts if the export dir was a customized') 291 | 292 | parser.add_argument('--pause-all-jobs', action='store_true', 293 | help='Pause all scheduled jobs') 294 | 295 | parser.add_argument('--unpause-all-jobs', action='store_true', 296 | help='Unpause all scheduled jobs') 297 | 298 | parser.add_argument('--delete-all-jobs', action='store_true', 299 | help='Delete all jobs') 300 | return parser 301 | 302 | 303 | def prompt_for_input(message): 304 | import sys 305 | # raw_input returns the empty string for "enter", therefore default is no 306 | yes = {'yes','y', 'ye'} 307 | no = {'no','n', ''} 308 | 309 | choice = input(message + '\n').lower() 310 | if choice in yes: 311 | return True 312 | elif choice in no: 313 | return False 314 | else: 315 | sys.stdout.write("Please respond with 'yes' or 'no'") 316 | 317 | 318 | def url_validation(url): 319 | if '/?o=' in url: 320 | # if the workspace_id exists, lets remove it from the URL 321 | new_url = re.sub("\/\?o=.*", '', url) 322 | return new_url 323 | elif 'net/' == url[-4:]: 324 | return url[:-1] 325 | elif 'com/' == url[-4:]: 326 | return url[:-1] 327 | return url 328 | 329 | 330 | def build_client_config(url, token, args): 331 | # cant use netrc credentials because requests module tries to load the credentials into http basic auth headers 332 | # aws is the default 333 | config = {'url': url_validation(url), 334 | 'token': token, 335 | 'is_aws': (not args.azure), 336 | 'verbose': (not args.silent), 337 | 'verify_ssl': (not args.no_ssl_verification), 338 | 'skip_failed': args.skip_failed, 339 | 'debug': args.debug, 340 | 'file_format': str(args.notebook_format) 341 | } 342 | if args.set_export_dir: 343 | if args.set_export_dir.rstrip()[-1] != '/': 344 | config['export_dir'] = args.set_export_dir + '/' 345 | else: 346 | config['export_dir'] = args.set_export_dir 347 | elif config['is_aws']: 348 | config['export_dir'] = 'logs/' 349 | else: 350 | config['export_dir'] = 'azure_logs/' 351 | return config 352 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Deprecated] Databricks Migration Tool 2 | 3 | ## This project is deprecated and official support is moving to the Databricks Labs project: [migrate](https://github.com/databrickslabs/migrate) 4 | 5 | This is a migration package to log all Databricks resources for backup and/or migrating to another Databricks workspace. 6 | Migration allows a Databricks organization to move resources between Databricks Workspaces, 7 | to move between different cloud providers, or to move to different regions / accounts. 8 | 9 | Packaged is based on python 3.6 and DBR 6.x and 7.x releases. 
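As a quick sanity check before exporting anything, you can install the two runtime dependencies and run the bundled connection test; `DEMO` below is a placeholder for whatever Databricks CLI profile you have configured:

```bash
# install runtime dependencies, then verify API connectivity for a CLI profile
pip install cron-descriptor requests
python test_connection.py --profile DEMO
```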
10 | 11 | **Note:** Tools does not support windows currently since path resolution is different than mac / linux. 12 | Support for Windows is work in progress to update all paths to use pathlib resolution. 13 | 14 | This package uses credentials from the 15 | [Databricks CLI](https://docs.databricks.com/user-guide/dev-tools/databricks-cli.html) 16 | 17 | Support Matrix for Import and Export Operations: 18 | 19 | | Component | Export | Import | 20 | | ----------------- | ------------ | ------------ | 21 | | Users / Groups | Supported | Supported | 22 | | Clusters (w/ ACLs)| Supported | Supported | 23 | | Notebooks | Supported | Supported | 24 | | Notebooks ACLs | Supported | Supported | 25 | | Metastore | Supported | Supported | 26 | | Jobs (w/ ACLs) | Supported | Supported | 27 | | Libraries | Supported | Unsupported | 28 | | Secrets | Unsupported | Unsupported | 29 | | ML Models | Unsupported | Unsupported | 30 | | Table ACLs | Unsupported | Unsupported | 31 | 32 | **DBFS Data Migration:** 33 | * DBFS is a protected object storage location on AWS and Azure. 34 | * Data within the DBFS bucket can be very large, and the Databricks support team will need to help here. 35 | * The Databricks support team has a tool available to help with DBFS migrations between AWS workspaces today. 36 | * Azure DBFS migrations is work in progress. 37 | 38 | **Note:** MLFlow objects cannot be exported / imported with this tool. 39 | For more details, please look [here](https://github.com/amesar/mlflow-tools/) 40 | 41 | ## Workspace Analysis 42 | Import this [notebook](data/workspace_migration_analysis.py) to do an analysis of the number of objects within the 43 | current workspace. The last cell will print: 44 | 1. Number of users 45 | 2. Number of groups 46 | 3. Approximate number of notebooks 47 | 4. Number of internal jobs defined 48 | 5. Number of external jobs executed (from external API invocations) 49 | 6. Number of databases 50 | 7. Number of tables 51 | 52 | ## Order of Operations 53 | 1. Export users and groups 54 | 2. Export cluster templates 55 | 3. Export notebook metadata (listing of all notebooks) 56 | 4. Export notebook content 57 | 5. Export job templates 58 | 6. Export Hive Metastore data 59 | 60 | **Note:** During user / group import, users will be notified of the new workspace and account. This is required 61 | for them to set up their credentials to access the new workspace. We need the user to exist before loading their 62 | artifacts like notebooks, clusters, etc. 63 | 64 | By default, artifacts are stored in the `logs/` directory, and `azure_logs/` for Azure artifacts. 65 | This is configurable with the `--set-export-dir` flag to specify the log directory. 66 | 67 | While exporting Libraries is supported, we do not have an implementation to import library definitions. 68 | ## Table of Contents 69 | - [Users and Groups](#users-and-groups) 70 | - [Clusters](#Clusters) 71 | - [Notebooks](#Notebooks) 72 | - [Jobs](#Jobs) 73 | - [Export Help Text](#export-help-text) 74 | - [Import Help Text](#import-help-text) 75 | 76 | ### Users and Groups 77 | This section uses the [SCIM API](https://docs.databricks.com/dev-tools/api/latest/scim/index.html) to export / import 78 | user and groups. 79 | [Instance Profiles API](https://docs.databricks.com/dev-tools/api/latest/instance-profiles.html) used 80 | to export instance profiles that are tied to user/group entitlements. 81 | For AWS users, this section will log the instance profiles used for IAM access to resources. 
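Exported identities are written beneath the export directory (`logs/` by default, `azure_logs/` on Azure): users are logged one JSON record per line to `users.log`, and each group is written to a file under `groups/` named after its display name. A minimal way to inspect the output, assuming the default export directory:

```bash
head -n 1 logs/users.log   # one SCIM user record per line
ls logs/groups/            # one file per group, keyed by display name
```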
82 | 83 | To export users / groups, use the following: 84 | ```bash 85 | python export_db.py --profile DEMO --users 86 | ``` 87 | 88 | To import these users: 89 | ```bash 90 | python import_db.py --profile NEW_DEMO --users 91 | ``` 92 | 93 | If you plan to use this tool to export multiple workspaces, you can set the `--set-export-dir` directory to log 94 | artifacts into separate logging directories. 95 | 96 | 97 | ### Clusters 98 | The section uses the [Clusters APIs](https://docs.databricks.com/dev-tools/api/latest/clusters.html) 99 | 100 | ```bash 101 | python export_db.py --profile DEMO --clusters 102 | ``` 103 | This will export the following: 104 | 1. Cluster templates + ACLs 105 | 2. Instance pool definitions 106 | 3. Cluster policies + ACLs 107 | 108 | ```bash 109 | python import_db.py --profile NEW_DEMO --clusters 110 | ``` 111 | 112 | ### Notebooks 113 | This section uses the [Workspace API](https://docs.databricks.com/dev-tools/api/latest/workspace.html) 114 | 115 | This part is a 3 part process. 116 | 1. Download all notebook locations and paths 117 | 2. Download all notebook contents for every path 118 | 3. Download all workspace ACLs 119 | 120 | ```bash 121 | python export_db.py --profile DEMO --workspace 122 | python export_db.py --profile DEMO --download 123 | python export_db.py --profile DEMO --workspace-acls 124 | ``` 125 | 126 | To import into a new workspace: 127 | ```bash 128 | python import_db.py --profile NEW_DEMO --workspace [--archive-missing] 129 | python import_db.py --profile NEW_DEMO --workspace-acls 130 | ``` 131 | If users have left your organization, their artifacts (notebooks / job templates) still exists. However, their user 132 | object no longer exists. During the migration, we can keep the old users notebooks into the top level 133 | directory `/Archive/{username}@domain.com` 134 | Use the `--archive-missing` option to put these artifacts in the archive folder. 135 | 136 | **Single User Export/Import** 137 | The tool supports exporting single user workspaces using the following command: 138 | ```bash 139 | # export a single users workspace 140 | python export_db.py --profile DEMO --export-home example@foobar.com 141 | ``` 142 | 143 | The corollary is the `--import-home` option: 144 | ```bash 145 | python import_db.py --profile NEW_DEMO --import-home example@foobar.com 146 | ``` 147 | 148 | ### Jobs 149 | This section uses the [Jobs API](https://docs.databricks.com/dev-tools/api/latest/jobs.html) 150 | Job ACLs are exported and imported with this option. 151 | 152 | ```bash 153 | python export_db.py --profile DEMO --jobs 154 | ``` 155 | If we're unable to find old cluster ids that are no longer available, we'll reset the job template 156 | to use a new default cluster. 157 | 158 | ```bash 159 | python import_db.py --profile NEW_DEMO --jobs 160 | ``` 161 | 162 | Imported jobs into the new workspace are paused by default. We do not want to have 2 jobs run simultaneously. 
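If a jobs import needs to be redone from scratch, the import tool also exposes a `--delete-all-jobs` flag that deletes every job in the target workspace; treat it as destructive and double-check the profile (shown here as a placeholder) before running it:

```bash
python import_db.py --profile NEW_DEMO --delete-all-jobs
```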
163 | Admins must pause their jobs with Databricks defined schedules using the following option: 164 | ```bash 165 | python export_db.py --profile DEMO --pause-all-jobs 166 | ``` 167 | 168 | Un-pause all jobs in the new workspace: 169 | ```bash 170 | python import_db.py --profile NEW_DEMO --unpause-all-jobs 171 | ``` 172 | 173 | ### Hive Metastore 174 | This section uses an API to remotely run Spark commands on a cluster, this API is called 175 | [Execution Context](https://docs.databricks.com/dev-tools/api/1.2/index.html#execution-context) 176 | 177 | By default, this will launch an small cluster in the `data/` folder to export the Hive Metastore data. 178 | If you need a specific IAM role to export the metastore, use the `--cluster-name` option to connect to 179 | a specific cluster. 180 | 181 | By default, we will edit the cluster for every defined IAM role to loop through all failed exports in case the tool was 182 | missing IAM permissions. To disable looping through all failed exports, you can pass in `--skip-failed` 183 | 184 | ```bash 185 | # export all metastore entries and brute force loop through all instance profiles / IAM roles 186 | python export_db.py --profile DEMO --metastore 187 | 188 | # export all metastore entries on the default cluster without retries 189 | python export_db.py --profile DEMO --metastore --skip-failed 190 | 191 | # export all metastore entries on a specific cluster 192 | python export_db.py --profile DEMO --metastore --cluster-name "Test" 193 | 194 | # export all tables within a specific database 195 | python export_db.py --profile DEMO --metastore --cluster-name "Test" --database "my_db" 196 | ``` 197 | 198 | To find legacy Hive tables that need to be repaired after a successful import, run the following: 199 | ``` 200 | python import_db.py --profile DST --get-repair-log 201 | ``` 202 | Once completed, it will upload a log to the destination location. 203 | Use this [repair notebook](data/repair_tables_for_migration.py) to import into the destination environment to repair 204 | all tables. 205 | 206 | ### Export Groups by Name 207 | This functionality exports group(s), their members, and corresponding notebooks. 208 | This assumes an empty export directory to simplify the number of operations needed. 209 | This does **not** include IAM roles as those likely change while moving across workspaces. 210 | 211 | ```bash 212 | # reset the export directory and export a set of groups 213 | python export_db.py --reset-export && python export_db.py --profile SRC --export-groups 'groupA,groupB' 214 | 215 | # import the groups that were exported 216 | python import_db.py --profile DST --import-groups 217 | ``` 218 | 219 | ### Export / Import Top Level Notebooks 220 | This will export all notebooks that are not a part of the `/Users/` directories to help migrate notebooks that are 221 | outside of personal workspace directories. 
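For orientation, a sketch of which paths this mode covers — the paths below are made-up examples; only content under `/Users/` is excluded:

```bash
# /Shared/etl_pipeline          -> exported (top level, outside /Users/)
# /Team-Analytics/reporting     -> exported
# /Users/someone@corp.com/wip   -> skipped by --workspace-top-level-only
```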
222 | ```bash 223 | # reset the export directory and export the top level directories / notebooks 224 | python export_db.py --reset-export && python export_db.py --profile SRC --workspace-top-level-only 225 | # if ACLs are enabled, export the ACLs as well 226 | python export_db.py --profile SRC --workspace-acls 227 | 228 | # import the groups that were exported 229 | python import_db.py --profile DST --workspace-top-level 230 | # apply acls if needed 231 | python import_db.py --profile DST --workspace-acls 232 | ``` 233 | 234 | #### Export Help Text 235 | ``` 236 | $ python export_db.py --help 237 | usage: export_db.py [-h] [--users] [--workspace] 238 | [--notebook-format {DBC,SOURCE,HTML}] [--download] 239 | [--libs] [--clusters] [--jobs] [--metastore] [--secrets] 240 | [--metastore-unicode] [--cluster-name CLUSTER_NAME] 241 | [--database DATABASE] [--iam IAM] [--skip-failed] 242 | [--mounts] [--azure] [--profile PROFILE] 243 | [--single-user SINGLE_USER] [--export-home EXPORT_HOME] 244 | [--export-groups EXPORT_GROUPS] [--workspace-acls] 245 | [--workspace-top-level-only] [--silent] 246 | [--no-ssl-verification] [--debug] [--reset-exports] 247 | [--set-export-dir SET_EXPORT_DIR] [--pause-all-jobs] 248 | [--unpause-all-jobs] 249 | [--update-account-id UPDATE_ACCOUNT_ID] 250 | [--old-account-id OLD_ACCOUNT_ID] 251 | [--replace-old-email REPLACE_OLD_EMAIL] 252 | [--update-new-email UPDATE_NEW_EMAIL] 253 | [--bypass-windows-check] 254 | 255 | Export full workspace artifacts from Databricks 256 | 257 | optional arguments: 258 | -h, --help show this help message and exit 259 | --users Download all the users and groups in the workspace 260 | --workspace Log all the notebook paths in the workspace. (metadata 261 | only) 262 | --notebook-format {DBC,SOURCE,HTML} 263 | Choose the file format to download the notebooks 264 | (default: DBC) 265 | --download Download all notebooks for the environment 266 | --libs Log all the libs for the environment 267 | --clusters Log all the clusters for the environment 268 | --jobs Log all the job configs for the environment 269 | --metastore log all the metastore table definitions 270 | --metastore-unicode log all the metastore table definitions including 271 | unicode characters 272 | --cluster-name CLUSTER_NAME 273 | Cluster name to export the metastore to a specific 274 | cluster. Cluster will be started. 275 | --database DATABASE Database name to export for the metastore. Single 276 | database name supported 277 | --iam IAM IAM Instance Profile to export metastore entires 278 | --skip-failed Skip retries for any failed hive metastore exports. 279 | --mounts Log all mount points. 280 | --azure Run on Azure. (Default is AWS) 281 | --profile PROFILE Profile to parse the credentials 282 | --export-home EXPORT_HOME 283 | User workspace name to export, typically the users 284 | email address 285 | --export-groups EXPORT_GROUPS 286 | Group names to export as a set. Includes group, users, 287 | and notebooks. 288 | --workspace-acls Permissions for workspace objects to export 289 | --workspace-top-level-only 290 | Download only top level notebook directories 291 | --silent Silent all logging of export operations. 292 | --no-ssl-verification 293 | Set Verify=False when making http requests. 
294 | --debug Enable debug logging 295 | --reset-exports Clear export directory 296 | --set-export-dir SET_EXPORT_DIR 297 | Set the base directory to export artifacts 298 | --pause-all-jobs Pause all scheduled jobs 299 | --unpause-all-jobs Unpause all scheduled jobs 300 | --update-account-id UPDATE_ACCOUNT_ID 301 | Set the account id for instance profiles to a new 302 | account id 303 | --old-account-id OLD_ACCOUNT_ID 304 | Old account ID to filter on 305 | --replace-old-email REPLACE_OLD_EMAIL 306 | Old email address to update from logs 307 | --update-new-email UPDATE_NEW_EMAIL 308 | New email address to replace the logs 309 | ``` 310 | 311 | #### Import Help Text 312 | ``` 313 | $ python import_db.py --help 314 | usage: import_db.py [-h] [--users] [--workspace] [--workspace-top-level] 315 | [--workspace-acls] [--notebook-format {DBC,SOURCE,HTML}] 316 | [--import-home IMPORT_HOME] [--import-groups] 317 | [--archive-missing] [--libs] [--clusters] [--jobs] 318 | [--metastore] [--metastore-unicode] [--get-repair-log] 319 | [--cluster-name CLUSTER_NAME] [--skip-failed] [--azure] 320 | [--profile PROFILE] [--single-user SINGLE_USER] 321 | [--no-ssl-verification] [--silent] [--debug] 322 | [--set-export-dir SET_EXPORT_DIR] [--pause-all-jobs] 323 | [--unpause-all-jobs] [--delete-all-jobs] 324 | 325 | Import full workspace artifacts into Databricks 326 | 327 | optional arguments: 328 | -h, --help show this help message and exit 329 | --users Import all the users and groups from the logfile. 330 | --workspace Import all notebooks from export dir into the 331 | workspace. 332 | --workspace-top-level 333 | Import all top level notebooks from export dir into 334 | the workspace. Excluding Users dirs 335 | --notebook-format {DBC,SOURCE,HTML} 336 | Choose the file format of the notebook to import 337 | (default: DBC) 338 | --workspace-acls Permissions for workspace objects to import 339 | --import-home IMPORT_HOME 340 | User workspace name to import, typically the users 341 | email address 342 | --import-groups Groups to import into a new workspace. Includes group 343 | creation and user notebooks. 344 | --archive-missing Import all missing users into the top level /Archive/ 345 | directory. 346 | --libs Import all the libs from the logfile into the 347 | workspace. 348 | --clusters Import all the cluster configs for the environment 349 | --jobs Import all job configurations to the environment. 350 | --metastore Import the metastore to the workspace. 351 | --metastore-unicode Import all the metastore table definitions with 352 | unicode characters 353 | --get-repair-log Report on current tables requiring repairs 354 | --cluster-name CLUSTER_NAME 355 | Cluster name to import the metastore to a specific 356 | cluster. Cluster will be started. 357 | --skip-failed Skip missing users that do not exist when importing 358 | user notebooks 359 | --azure Run on Azure. (Default is AWS) 360 | --profile PROFILE Profile to parse the credentials 361 | --no-ssl-verification 362 | Set Verify=False when making http requests. 363 | --silent Silent all logging of import operations. 364 | --debug Enable debug logging 365 | --set-export-dir SET_EXPORT_DIR 366 | Set the base directory to import artifacts if the 367 | export dir was a customized 368 | --pause-all-jobs Pause all scheduled jobs 369 | --unpause-all-jobs Unpause all scheduled jobs 370 | --delete-all-jobs Delete all jobs 371 | ``` 372 | 373 | #### FAQs / Limitations 374 | **Note**: To disable ssl verification pass the flag `--no-ssl-verification`. 
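For example (the profile and export option here are placeholders for your own invocation):

```bash
python export_db.py --profile DEMO --workspace --no-ssl-verification
```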
375 | If still getting SSL Error add the following to your current bash shell - 376 | ``` 377 | export REQUESTS_CA_BUNDLE="" 378 | export CURL_CA_BUNDLE="" 379 | ``` 380 | 381 | Limitations: 382 | * Instance profiles (AWS only): Group access to instance profiles will take precedence. If a user is added to the role 383 | directly, and has access via a group, only the group access will be granted during a migration. 384 | * Clusters: Cluster creator will be seen as the single admin user who migrated all the clusters. (Relevant for billing 385 | purposes) 386 | * Cluster creator tags cannot be updated. Added a custom tag named `OriginalCreator` with the original cluster creator 387 | for DBU tracking. 388 | * Jobs: Job owners will be seen as the single admin user who migrate the job configurations. (Relevant for billing 389 | purposes) 390 | * Jobs with existing clusters that no longer exist will be reset to the default cluster type 391 | * Jobs with older legacy instances will fail with unsupported DBR or instance types. See release notes for the latest 392 | supported releases. 393 | 394 | -------------------------------------------------------------------------------- /dbclient/ScimClient.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | import os 3 | import json 4 | 5 | 6 | class ScimClient(dbclient): 7 | 8 | def get_active_users(self): 9 | users = self.get('/preview/scim/v2/Users').get('Resources', None) 10 | return users if users else None 11 | 12 | def log_all_users(self, log_file='users.log'): 13 | user_log = self.get_export_dir() + log_file 14 | users = self.get('/preview/scim/v2/Users').get('Resources', None) 15 | if users: 16 | with open(user_log, "w") as fp: 17 | for x in users: 18 | fullname = x.get('name', None) 19 | if fullname: 20 | given_name = fullname.get('givenName', None) 21 | # if user is an admin, skip this user entry 22 | if x['userName'] == 'admin' and given_name == 'Administrator': 23 | continue 24 | fp.write(json.dumps(x) + '\n') 25 | else: 26 | print("Users returned an empty object") 27 | 28 | def log_single_user(self, user_email, log_file='single_user.log'): 29 | single_user_log = self.get_export_dir() + log_file 30 | users = self.get_active_users() 31 | found_user = False 32 | for user in users: 33 | current_email = user['emails'][0]['value'] 34 | if user_email == current_email: 35 | found_user = True 36 | print(user) 37 | with open(single_user_log, 'w') as fp: 38 | fp.write(json.dumps(user) + '\n') 39 | if not found_user: 40 | print("User not found. Emails are case sensitive. 
Please verify email address") 41 | 42 | def import_single_user(self, user_email, log_file='single_user.log'): 43 | single_user_log = self.get_export_dir() + log_file 44 | resp = self.import_users(single_user_log) 45 | 46 | def get_users_from_log(self, users_log='users.log'): 47 | """ 48 | fetch a list of user names from the users log file 49 | meant to be used during group exports where the user list is a subset of users 50 | :param users_log: 51 | :return: a list of usernames that help identify their workspace paths 52 | """ 53 | user_logfile = self.get_export_dir() + users_log 54 | username_list = [] 55 | with open(user_logfile, 'r') as fp: 56 | for u in fp: 57 | user_json = json.loads(u) 58 | username_list.append(user_json.get('userName')) 59 | return username_list 60 | 61 | @staticmethod 62 | def is_member_a_user(member_json): 63 | if 'Users/' in member_json['$ref']: 64 | return True 65 | return False 66 | 67 | @staticmethod 68 | def is_member_a_group(member_json): 69 | if 'Groups/' in member_json['$ref']: 70 | return True 71 | return False 72 | 73 | @staticmethod 74 | def is_member_a_service_principal(member_json): 75 | if 'ServicePrincipals/' in member_json['$ref']: 76 | return True 77 | return False 78 | 79 | def add_username_to_group(self, group_json): 80 | # add the userName field to json since ids across environments may not match 81 | members = group_json.get('members', []) 82 | new_members = [] 83 | for m in members: 84 | m_id = m['value'] 85 | if self.is_member_a_user(m): 86 | user_resp = self.get('/preview/scim/v2/Users/{0}'.format(m_id)) 87 | m['userName'] = user_resp['userName'] 88 | m['type'] = 'user' 89 | elif self.is_member_a_group(m): 90 | m['type'] = 'group' 91 | elif self.is_member_a_service_principal(m): 92 | m['type'] = 'service-principal' 93 | else: 94 | m['type'] = 'unknown' 95 | new_members.append(m) 96 | group_json['members'] = new_members 97 | return group_json 98 | 99 | def log_all_groups(self, group_log_dir='groups/'): 100 | group_dir = self.get_export_dir() + group_log_dir 101 | os.makedirs(group_dir, exist_ok=True) 102 | group_list = self.get("/preview/scim/v2/Groups").get('Resources', []) 103 | for x in group_list: 104 | group_name = x['displayName'] 105 | with open(group_dir + group_name, "w") as fp: 106 | fp.write(json.dumps(self.add_username_to_group(x))) 107 | 108 | @staticmethod 109 | def build_group_dict(group_list): 110 | group_dict = {} 111 | for group in group_list: 112 | group_dict[group.get('displayName')] = group 113 | return group_dict 114 | 115 | def log_groups_from_list(self, group_name_list, group_log_dir='groups/', users_logfile='users.log'): 116 | """ 117 | take a list of groups and log all the members 118 | :param group_name_list: a list obj of group names 119 | :param group_log_dir: 120 | :param users_logfile: logfile to store the user log data 121 | :return: return a list of userNames to export their notebooks for the next api call 122 | """ 123 | group_dir = self.get_export_dir() + group_log_dir 124 | os.makedirs(group_dir, exist_ok=True) 125 | group_list = self.get("/preview/scim/v2/Groups").get('Resources', []) 126 | group_dict = self.build_group_dict(group_list) 127 | member_id_list = [] 128 | for group_name in group_name_list: 129 | group_details = group_dict[group_name] 130 | members_list = group_details.get('members', []) 131 | filtered_users = list(filter(lambda y: 'Users' in y.get('$ref', None), members_list)) 132 | filtered_sub_groups = list(filter(lambda y: 'Groups' in y.get('$ref', None), members_list)) 133 | if 
filtered_sub_groups: 134 | sub_group_names = list(map(lambda z: z.get('display'), filtered_sub_groups)) 135 | group_name_list.extend(sub_group_names) 136 | member_id_list.extend(list(map(lambda y: y['value'], filtered_users))) 137 | with open(group_dir + group_name, "w") as fp: 138 | group_details.pop('roles', None) # removing the roles field from the groups arg 139 | fp.write(json.dumps(self.add_username_to_group(group_details))) 140 | users_log = self.get_export_dir() + users_logfile 141 | user_names_list = [] 142 | with open(users_log, 'w') as u_fp: 143 | for mid in member_id_list: 144 | print('Exporting', mid) 145 | api = f'/preview/scim/v2/Users/{mid}' 146 | user_resp = self.get(api) 147 | user_resp.pop('roles', None) # remove roles since those can change during the migration 148 | user_resp.pop('http_status_code', None) # remove unnecessary params 149 | user_names_list.append(user_resp.get('userName')) 150 | u_fp.write(json.dumps(user_resp) + '\n') 151 | return user_names_list 152 | 153 | def get_user_id_mapping(self): 154 | # return a dict of the userName to id mapping of the new env 155 | user_list = self.get('/preview/scim/v2/Users').get('Resources', None) 156 | if user_list: 157 | user_id_dict = {} 158 | for user in user_list: 159 | user_id_dict[user['userName']] = user['id'] 160 | return user_id_dict 161 | return None 162 | 163 | @staticmethod 164 | def assign_roles_args(roles_list): 165 | # roles list passed from file, which is in proper patch arg format already 166 | # this method is used to patch the group IAM roles 167 | assign_args = {"schemas": ["urn:ietf:params:scim:api:messages:2.0:PatchOp"], 168 | "Operations": [{"op": "add", 169 | "path": "roles", 170 | "value": roles_list}]} 171 | return assign_args 172 | 173 | @staticmethod 174 | def assign_entitlements_args(entitlements_list): 175 | # roles list passed from file, which is in proper patch arg format already 176 | # this method is used to patch the group IAM roles 177 | assign_args = {"schemas": ["urn:ietf:params:scim:api:messages:2.0:PatchOp"], 178 | "Operations": [{"op": "add", 179 | "path": "entitlements", 180 | "value": entitlements_list}]} 181 | return assign_args 182 | 183 | def assign_group_entitlements(self, group_dir): 184 | # assign group role ACLs, which are only available via SCIM apis 185 | group_ids = self.get_current_group_ids() 186 | if not os.path.exists(group_dir): 187 | print("No groups defined. Skipping group entitlement assignment") 188 | return 189 | groups = os.listdir(group_dir) 190 | for group_name in groups: 191 | with open(group_dir + group_name, 'r') as fp: 192 | group_data = json.loads(fp.read()) 193 | entitlements = group_data.get('entitlements', None) 194 | if entitlements: 195 | g_id = group_ids[group_name] 196 | update_entitlements = self.assign_entitlements_args(entitlements) 197 | up_resp = self.patch(f'/preview/scim/v2/Groups/{g_id}', update_entitlements) 198 | print(up_resp) 199 | 200 | def assign_group_roles(self, group_dir): 201 | # assign group role ACLs, which are only available via SCIM apis 202 | group_ids = self.get_current_group_ids() 203 | if not os.path.exists(group_dir): 204 | print("No groups defined. 
Skipping group entitlement assignment") 205 | return 206 | groups = os.listdir(group_dir) 207 | for group_name in groups: 208 | with open(group_dir + group_name, 'r') as fp: 209 | group_data = json.loads(fp.read()) 210 | roles = group_data.get('roles', None) 211 | if roles: 212 | g_id = group_ids[group_name] 213 | update_roles = self.assign_roles_args(roles) 214 | up_resp = self.patch(f'/preview/scim/v2/Groups/{g_id}', update_roles) 215 | print(up_resp) 216 | entitlements = group_data.get('entitlements', None) 217 | if entitlements: 218 | g_id = group_ids[group_name] 219 | update_entitlements = self.assign_entitlements_args(entitlements) 220 | up_resp = self.patch(f'/preview/scim/v2/Groups/{g_id}', update_entitlements) 221 | print(up_resp) 222 | 223 | def get_current_user_ids(self): 224 | # return a dict of user email to user id mappings 225 | users = self.get('/preview/scim/v2/Users')['Resources'] 226 | user_id = {} 227 | for user in users: 228 | user_id[user['emails'][0]['value']] = user['id'] 229 | return user_id 230 | 231 | def get_old_user_emails(self, users_logfile='users.log'): 232 | # return a dictionary of { old_id : email } from the users log 233 | users_log = self.get_export_dir() + users_logfile 234 | email_dict = {} 235 | with open(users_log, 'r') as fp: 236 | for x in fp: 237 | user = json.loads(x) 238 | email_dict[user['id']] = user['emails'][0]['value'] 239 | return email_dict 240 | 241 | def get_current_group_ids(self): 242 | # return a dict of group displayName and id mappings 243 | groups = self.get('/preview/scim/v2/Groups').get('Resources', None) 244 | group_ids = {} 245 | for group in groups: 246 | group_ids[group['displayName']] = group['id'] 247 | return group_ids 248 | 249 | @staticmethod 250 | def add_roles_arg(roles_list): 251 | # this builds the args from a list of IAM roles. diff built from user logfile 252 | role_values = [{'value': x} for x in roles_list] 253 | patch_roles_arg = { 254 | "schemas": ["urn:ietf:params:scim:api:messages:2.0:PatchOp"], 255 | "Operations": [ 256 | { 257 | "op": "add", 258 | "path": "roles", 259 | "value": role_values 260 | } 261 | ] 262 | } 263 | return patch_roles_arg 264 | 265 | def assign_user_entitlements(self, user_log_file='users.log'): 266 | """ 267 | assign user entitlements to allow cluster create, job create, sql analytics etc 268 | :param user_log_file: 269 | :return: 270 | """ 271 | user_log = self.get_export_dir() + user_log_file 272 | if not os.path.exists(user_log): 273 | print("Skipping user entitlement assignment. Logfile does not exist") 274 | return 275 | user_ids = self.get_user_id_mapping() 276 | with open(user_log, 'r') as fp: 277 | # loop through each user in the file 278 | for line in fp: 279 | user = json.loads(line) 280 | # add the users entitlements 281 | user_entitlements = user.get('entitlements', None) 282 | # get the current registered user id 283 | user_id = user_ids[user['userName']] 284 | if user_entitlements: 285 | entitlements_args = self.assign_entitlements_args(user_entitlements) 286 | update_resp = self.patch(f'/preview/scim/v2/Users/{user_id}', entitlements_args) 287 | 288 | def assign_user_roles(self, user_log_file='users.log'): 289 | """ 290 | assign user roles that are missing after adding group assignment 291 | Note: There is a limitation in the exposed API. If a user is assigned a role permission & the permission 292 | is granted via a group, we can't distinguish the difference. Only group assignment will be migrated. 
293 | :param user_log_file: logfile of all user properties 294 | :return: 295 | """ 296 | user_log = self.get_export_dir() + user_log_file 297 | if not os.path.exists(user_log): 298 | print("Skipping user entitlement assignment. Logfile does not exist") 299 | return 300 | # keys to filter from the user log to get the user / role mapping 301 | old_role_keys = ('userName', 'roles') 302 | cur_role_keys = ('schemas', 'userName', 'entitlements', 'roles', 'groups') 303 | # get current user id of the new environment, k,v = email, id 304 | user_ids = self.get_user_id_mapping() 305 | with open(user_log, 'r') as fp: 306 | # loop through each user in the file 307 | for line in fp: 308 | user = json.loads(line) 309 | user_roles = {k: user[k] for k in old_role_keys if k in user} 310 | # get the current registered user id 311 | user_id = user_ids[user['userName']] 312 | # get the current users settings 313 | cur_user = self.get('/preview/scim/v2/Users/{0}'.format(user_id)) 314 | # get the current users IAM roles 315 | current_roles = cur_user.get('roles', None) 316 | if current_roles: 317 | cur_role_values = set([x['value'] for x in current_roles]) 318 | else: 319 | cur_role_values = set() 320 | # get the users saved IAM roles from the export 321 | saved_roles = user_roles.get('roles', None) 322 | if saved_roles: 323 | saved_role_values = set([y['value'] for y in saved_roles]) 324 | else: 325 | saved_role_values = set() 326 | roles_needed = list(saved_role_values - cur_role_values) 327 | if roles_needed: 328 | # get the json to add the roles to the user profile 329 | patch_roles = self.add_roles_arg(roles_needed) 330 | update_resp = self.patch(f'/preview/scim/v2/Users/{user_id}', patch_roles) 331 | 332 | @staticmethod 333 | def get_member_args(member_id_list): 334 | """ 335 | helper function to form the json args to the patch request to update group memberships 336 | :param member_id_list: member ids to add to a specific group 337 | :return: dict args for the patch operation 338 | """ 339 | member_id_list_json = [] 340 | for m_id in member_id_list: 341 | member_id_list_json.append({'value': '{0}'.format(m_id)}) 342 | 343 | add_members_args = { 344 | "schemas": ["urn:ietf:params:scim:api:messages:2.0:PatchOp"], 345 | "Operations": [{ 346 | "op": "add", 347 | "value": {"members": member_id_list_json} 348 | } 349 | ] 350 | } 351 | return add_members_args 352 | 353 | @staticmethod 354 | def is_user(member_json): 355 | # currently a workaround to get whether the member is a user or group 356 | # check the ref instead of the type field 357 | # once fixed, the type should be `user` or `group` in lowercase 358 | if 'Users/' in member_json['$ref']: 359 | return True 360 | return False 361 | 362 | @staticmethod 363 | def is_group(member_json): 364 | # currently a workaround to get whether the member is a user or group 365 | # check the ref instead of the type field 366 | # once fixed, the type should be `user` or `group` in lowercase 367 | if 'Groups/' in member_json['$ref']: 368 | return True 369 | return False 370 | 371 | def import_groups(self, group_dir): 372 | # list all the groups and create groups first 373 | if not os.path.exists(group_dir): 374 | print("No groups to import.") 375 | return 376 | groups = os.listdir(group_dir) 377 | create_args = { 378 | "schemas": ["urn:ietf:params:scim:schemas:core:2.0:Group"], 379 | "displayName": "default" 380 | } 381 | for x in groups: 382 | print('Creating group: {0}'.format(x)) 383 | # set the create args displayName property aka group name 384 | 
create_args['displayName'] = x 385 | group_resp = self.post('/preview/scim/v2/Groups', create_args) 386 | 387 | # dict of { group_name : group_id } 388 | current_group_ids = self.get_current_group_ids() 389 | # dict of { email : current_user_id } 390 | current_user_ids = self.get_current_user_ids() 391 | # dict of { old_user_id : email } 392 | old_user_emails = self.get_old_user_emails() 393 | for group_name in groups: 394 | with open(group_dir + group_name, 'r') as fp: 395 | members = json.loads(fp.read()).get('members', None) 396 | if members: 397 | # grab a list of ids to add either groups or users to this current group 398 | member_id_list = [] 399 | for m in members: 400 | if self.is_user(m): 401 | old_email = old_user_emails[m['value']] 402 | this_user_id = current_user_ids.get(old_email, '') 403 | if not this_user_id: 404 | raise ValueError(f'Unable to find user {old_email} in the new workspace. ' 405 | f'This users email case has changed and needs to be updated with ' 406 | f'the --replace-old-email and --update-new-email options') 407 | member_id_list.append(this_user_id) 408 | elif self.is_group(m): 409 | this_group_id = current_group_ids.get(m['display']) 410 | member_id_list.append(this_group_id) 411 | else: 412 | print("Skipping service principal members and other identities not within users/groups") 413 | add_members_json = self.get_member_args(member_id_list) 414 | group_id = current_group_ids[group_name] 415 | add_resp = self.patch('/preview/scim/v2/Groups/{0}'.format(group_id), add_members_json) 416 | 417 | def import_users(self, user_log): 418 | # first create the user identities with the required fields 419 | create_keys = ('emails', 'entitlements', 'displayName', 'name', 'userName') 420 | if not os.path.exists(user_log): 421 | print("No users to import.") 422 | return 423 | with open(user_log, 'r') as fp: 424 | for x in fp: 425 | user = json.loads(x) 426 | print("Creating user: {0}".format(user['userName'])) 427 | user_create = {k: user[k] for k in create_keys if k in user} 428 | create_resp = self.post('/preview/scim/v2/Users', user_create) 429 | 430 | def import_all_users_and_groups(self, user_log_file='users.log', group_log_dir='groups/'): 431 | user_log = self.get_export_dir() + user_log_file 432 | group_dir = self.get_export_dir() + group_log_dir 433 | 434 | self.import_users(user_log) 435 | self.import_groups(group_dir) 436 | # assign the users to IAM roles if on AWS 437 | if self.is_aws(): 438 | print("Update group role assignments") 439 | self.assign_group_roles(group_dir) 440 | print("Update user role assignments") 441 | self.assign_user_roles(user_log_file) 442 | print("Done") 443 | # need to separate role assignment and entitlements to support Azure 444 | print("Updating groups entitlements") 445 | self.assign_group_entitlements(group_dir) 446 | print("Updating users entitlements") 447 | self.assign_user_entitlements(user_log_file) 448 | -------------------------------------------------------------------------------- /dbclient/ClustersClient.py: -------------------------------------------------------------------------------- 1 | import os, re, time 2 | 3 | from dbclient import * 4 | 5 | 6 | class ClustersClient(dbclient): 7 | create_configs = {'num_workers', 8 | 'autoscale', 9 | 'cluster_name', 10 | 'spark_version', 11 | 'spark_conf', 12 | 'aws_attributes', 13 | 'node_type_id', 14 | 'driver_node_type_id', 15 | 'ssh_public_keys', 16 | 'custom_tags', 17 | 'cluster_log_conf', 18 | 'init_scripts', 19 | 'docker_image', 20 | 'spark_env_vars', 21 | 
'autotermination_minutes',
22 |                       'enable_elastic_disk',
23 |                       'instance_pool_id',
24 |                       'policy_id',
25 |                       'pinned_by_user_name',
26 |                       'creator_user_name',
27 |                       'cluster_id'}
28 | 
29 |     def cleanup_cluster_pool_configs(self, cluster_json, cluster_creator, is_job_cluster=False):
30 |         """
31 |         Pass in cluster json and cluster_creator to update fields that are not needed for clusters submitted to pools
32 |         :param cluster_json: cluster configuration read from the export logs
33 |         :param cluster_creator: original creator of the cluster, used for the OriginalCreator tag
34 |         :param is_job_cluster: flag to skip adding the OriginalCreator tag for job clusters, since they do not
35 |         behave like interactive clusters
36 |         :return: cluster json updated for submission against an instance pool
37 |         """
38 |         pool_id_dict = self.get_instance_pool_id_mapping()
39 |         # if pool id exists, remove instance types
40 |         cluster_json.pop('node_type_id', None)
41 |         cluster_json.pop('driver_node_type_id', None)
42 |         cluster_json.pop('enable_elastic_disk', None)
43 |         if not is_job_cluster:
44 |             # add custom tag for original cluster creator for cost tracking
45 |             if 'custom_tags' in cluster_json:
46 |                 tags = cluster_json['custom_tags']
47 |                 tags['OriginalCreator'] = cluster_creator
48 |                 cluster_json['custom_tags'] = tags
49 |             else:
50 |                 cluster_json['custom_tags'] = {'OriginalCreator': cluster_creator}
51 |         # remove all aws_attr except for the IAM role if it exists
52 |         if 'aws_attributes' in cluster_json:
53 |             aws_conf = cluster_json.pop('aws_attributes')
54 |             iam_role = aws_conf.get('instance_profile_arn', None)
55 |             if iam_role:
56 |                 cluster_json['aws_attributes'] = {'instance_profile_arn': iam_role}
57 |         # map old pool ids to new pool ids
58 |         old_pool_id = cluster_json['instance_pool_id']
59 |         cluster_json['instance_pool_id'] = pool_id_dict[old_pool_id]
60 |         return cluster_json
61 | 
62 |     def delete_all_clusters(self):
63 |         cl = self.get_cluster_list(False)
64 |         for x in cl:
65 |             self.post('/clusters/unpin', {'cluster_id': x['cluster_id']})
66 |             self.post('/clusters/permanent-delete', {'cluster_id': x['cluster_id']})
67 | 
68 |     def edit_cluster(self, cid, iam_role):
69 |         """Edits the existing metastore cluster
70 |         Returns cluster_id"""
71 |         version = self.get_latest_spark_version()
72 |         import os
73 |         real_path = os.path.dirname(os.path.realpath(__file__))
74 |         if self.is_aws():
75 |             print("Updating cluster with: " + iam_role)
76 |             current_cluster_json = self.get(f'/clusters/get?cluster_id={cid}')
77 |             run_properties = set(list(current_cluster_json.keys())) - self.create_configs
78 |             for p in run_properties:
79 |                 del current_cluster_json[p]
80 |             if 'aws_attributes' in current_cluster_json:
81 |                 aws_conf = current_cluster_json.pop('aws_attributes')
82 |                 aws_conf['instance_profile_arn'] = iam_role
83 |             else:
84 |                 aws_conf = {'instance_profile_arn': iam_role}
85 |             current_cluster_json['aws_attributes'] = aws_conf
86 |             resp = self.post('/clusters/edit', current_cluster_json)
87 |             print(resp)
88 |             new_cid = self.wait_for_cluster(cid)
89 |             return new_cid
90 |         else:
91 |             return False
92 | 
93 |     def get_cluster_acls(self, cluster_id, cluster_name):
94 |         """
95 |         Export all cluster permissions for a specific cluster id
96 |         :return: permissions json with the cluster_name added
97 |         """
98 |         perms = self.get(f'/preview/permissions/clusters/{cluster_id}/')
99 |         perms['cluster_name'] = cluster_name
100 |         return perms
101 | 
102 |     def get_cluster_id_by_name(self, cname, running_only=False):
103 |         cluster_list = self.get('/clusters/list').get('clusters', [])
104 |         if running_only:
105 |             running = list(filter(lambda x: x['state'] == "RUNNING", cluster_list))
106 |             for x in running:
107 |                 if cname == x['cluster_name']:
108 |                     return x['cluster_id']
109 | 
else: 110 | for x in cluster_list: 111 | if cname == x['cluster_name']: 112 | return x['cluster_id'] 113 | return None 114 | 115 | def get_cluster_list(self, alive=True): 116 | """ 117 | Returns an array of json objects for the running clusters. 118 | Grab the cluster_name or cluster_id 119 | """ 120 | clusters_list = self.get("/clusters/list", print_json=False).get('clusters', []) 121 | if alive and clusters_list: 122 | running = filter(lambda x: x['state'] == "RUNNING", clusters_list) 123 | return list(running) 124 | else: 125 | return clusters_list 126 | 127 | def get_execution_context(self, cid): 128 | print("Creating remote Spark Session") 129 | time.sleep(5) 130 | ec_payload = {"language": "python", 131 | "clusterId": cid} 132 | ec = self.post('/contexts/create', json_params=ec_payload, version="1.2") 133 | # Grab the execution context ID 134 | ec_id = ec.get('id', None) 135 | if not ec_id: 136 | print('Unable to establish remote session') 137 | print(ec) 138 | raise Exception("Remote session error") 139 | return ec_id 140 | 141 | def get_global_init_scripts(self): 142 | """ return a list of global init scripts. Currently not logged """ 143 | ls = self.get('/dbfs/list', {'path': '/databricks/init/'}).get('files', None) 144 | if ls is None: 145 | return [] 146 | else: 147 | global_scripts = [{'path': x['path']} for x in ls if x['is_dir'] == False] 148 | return global_scripts 149 | 150 | def get_instance_pool_id_mapping(self, log_file='instance_pools.log'): 151 | pool_log = self.get_export_dir() + log_file 152 | current_pools = self.get('/instance-pools/list').get('instance_pools', None) 153 | if not current_pools: 154 | return None 155 | new_pools = {} 156 | # build dict of pool name and id mapping 157 | for p in current_pools: 158 | new_pools[p['instance_pool_name']] = p['instance_pool_id'] 159 | # mapping id from old_pool_id to new_pool_id 160 | pool_mapping_dict = {} 161 | with open(pool_log, 'r') as fp: 162 | for line in fp: 163 | pool_conf = json.loads(line) 164 | old_pool_id = pool_conf['instance_pool_id'] 165 | pool_name = pool_conf['instance_pool_name'] 166 | new_pool_id = new_pools[pool_name] 167 | pool_mapping_dict[old_pool_id] = new_pool_id 168 | return pool_mapping_dict 169 | 170 | def get_policy_id_by_name_dict(self): 171 | name_id_dict = {} 172 | resp = self.get('/policies/clusters/list').get('policies', []) 173 | for policy in resp: 174 | name_id_dict[policy['name']] = policy['policy_id'] 175 | return name_id_dict 176 | 177 | def get_spark_versions(self): 178 | return self.get("/clusters/spark-versions", print_json=True) 179 | 180 | def get_instance_profiles_list(self): 181 | if self.is_aws(): 182 | ip_json_list = self.get('/instance-profiles/list').get('instance_profiles', []) 183 | iam_roles_list = list(map(lambda x: x.get('instance_profile_arn'), ip_json_list)) 184 | return iam_roles_list 185 | return [] 186 | 187 | def get_iam_role_by_cid(self, cid): 188 | if self.is_aws(): 189 | cluster_resp = self.get(f'/clusters/get?cluster_id={cid}') 190 | return cluster_resp.get('aws_attributes').get('instance_profile_arn', None) 191 | return None 192 | 193 | def get_new_policy_id_dict(self, policy_file='cluster_policies.log'): 194 | """ 195 | mapping function to get the new policy ids. 
ids change when migrating to a new workspace 196 | read the log file and map the old id to the new id 197 | :param old_policy_id: str of the old id 198 | :return: str of new policy id 199 | """ 200 | policy_log = self.get_export_dir() + policy_file 201 | current_policies = self.get('/policies/clusters/list').get('policies', []) 202 | current_policies_dict = {} # name : current policy id 203 | for policy in current_policies: 204 | current_name = policy['name'] 205 | current_id = policy['policy_id'] 206 | current_policies_dict[current_name] = current_id 207 | policy_id_dict = {} 208 | with open(policy_log, 'r') as fp: 209 | for line in fp: 210 | policy_conf = json.loads(line) 211 | policy_name = policy_conf['name'] 212 | old_policy_id = policy_conf['policy_id'] 213 | policy_id_dict[old_policy_id] = current_policies_dict[policy_name] # old_id : new_id 214 | return policy_id_dict 215 | 216 | def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None): 217 | """ 218 | Import cluster configs and update appropriate properties / tags in the new env 219 | :param log_file: 220 | :return: 221 | """ 222 | cluster_log = self.get_export_dir() + log_file 223 | acl_cluster_log = self.get_export_dir() + acl_log_file 224 | if not os.path.exists(cluster_log): 225 | print("No clusters to import.") 226 | return 227 | current_cluster_names = set([x.get('cluster_name', None) for x in self.get_cluster_list(False)]) 228 | old_2_new_policy_ids = self.get_new_policy_id_dict() # dict of {old_id : new_id} 229 | # get instance pool id mappings 230 | with open(cluster_log, 'r') as fp: 231 | for line in fp: 232 | cluster_conf = json.loads(line) 233 | cluster_name = cluster_conf['cluster_name'] 234 | if cluster_name in current_cluster_names: 235 | print("Cluster already exists, skipping: {0}".format(cluster_name)) 236 | continue 237 | cluster_creator = cluster_conf.pop('creator_user_name') 238 | if 'policy_id' in cluster_conf: 239 | old_policy_id = cluster_conf['policy_id'] 240 | cluster_conf['policy_id'] = old_2_new_policy_ids[old_policy_id] 241 | # check for instance pools and modify cluster attributes 242 | if 'instance_pool_id' in cluster_conf: 243 | new_cluster_conf = self.cleanup_cluster_pool_configs(cluster_conf, cluster_creator) 244 | else: 245 | # update cluster configs for non-pool clusters 246 | # add original creator tag to help with DBU tracking 247 | if 'custom_tags' in cluster_conf: 248 | tags = cluster_conf['custom_tags'] 249 | tags['OriginalCreator'] = cluster_creator 250 | cluster_conf['custom_tags'] = tags 251 | else: 252 | cluster_conf['custom_tags'] = {'OriginalCreator': cluster_creator} 253 | new_cluster_conf = cluster_conf 254 | print("Creating cluster: {0}".format(new_cluster_conf['cluster_name'])) 255 | cluster_resp = self.post('/clusters/create', new_cluster_conf) 256 | if cluster_resp['http_status_code'] == 200: 257 | stop_resp = self.post('/clusters/delete', {'cluster_id': cluster_resp['cluster_id']}) 258 | if 'pinned_by_user_name' in cluster_conf: 259 | pin_resp = self.post('/clusters/pin', {'cluster_id': cluster_resp['cluster_id']}) 260 | else: 261 | print(cluster_resp) 262 | # add cluster ACLs 263 | # loop through and reapply cluster ACLs 264 | with open(acl_cluster_log, 'r') as acl_fp: 265 | for x in acl_fp: 266 | data = json.loads(x) 267 | cluster_name = data['cluster_name'] 268 | print(f'Applying acl for {cluster_name}') 269 | acl_args = {'access_control_list' : self.build_acl_args(data['access_control_list'])} 270 | cid = 
self.get_cluster_id_by_name(cluster_name) 271 | if cid is None: 272 | raise ValueError('Cluster id must exist in new env. Re-import cluster configs.') 273 | api = f'/preview/permissions/clusters/{cid}' 274 | resp = self.put(api, acl_args) 275 | print(resp) 276 | 277 | def import_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='acl_cluster_policies.log'): 278 | policies_log = self.get_export_dir() + log_file 279 | acl_policies_log = self.get_export_dir() + acl_log_file 280 | # create the policies 281 | if os.path.exists(policies_log): 282 | with open(policies_log, 'r') as policy_fp: 283 | for p in policy_fp: 284 | policy_conf = json.loads(p) 285 | # when creating the policy, we only need `name` and `definition` fields 286 | create_args = {'name': policy_conf['name'], 287 | 'definition': policy_conf['definition']} 288 | resp = self.post('/policies/clusters/create', create_args) 289 | # ACLs are created by using the `access_control_list` key 290 | with open(acl_policies_log, 'r') as acl_fp: 291 | id_map = self.get_policy_id_by_name_dict() 292 | for x in acl_fp: 293 | p_acl = json.loads(x) 294 | acl_create_args = {'access_control_list': self.build_acl_args(p_acl['access_control_list'])} 295 | policy_id = id_map[p_acl['name']] 296 | api = f'/permissions/cluster-policies/{policy_id}' 297 | resp = self.put(api, acl_create_args) 298 | print(resp) 299 | else: 300 | print('Skipping cluster policies as no log file exists') 301 | 302 | def import_instance_pools(self, log_file='instance_pools.log'): 303 | pool_log = self.get_export_dir() + log_file 304 | if not os.path.exists(pool_log): 305 | print("No instance pools to import.") 306 | return 307 | with open(pool_log, 'r') as fp: 308 | for line in fp: 309 | pool_conf = json.loads(line) 310 | pool_resp = self.post('/instance-pools/create', pool_conf) 311 | 312 | def import_instance_profiles(self, log_file='instance_profiles.log'): 313 | # currently an AWS only operation 314 | ip_log = self.get_export_dir() + log_file 315 | if not os.path.exists(ip_log): 316 | print("No instance profiles to import.") 317 | return 318 | # check current profiles and skip if the profile already exists 319 | ip_list = self.get('/instance-profiles/list').get('instance_profiles', None) 320 | if ip_list: 321 | list_of_profiles = [x['instance_profile_arn'] for x in ip_list] 322 | else: 323 | list_of_profiles = [] 324 | list_of_profiles = [] 325 | import_profiles_count = 0 326 | with open(ip_log, "r") as fp: 327 | for line in fp: 328 | ip_arn = json.loads(line).get('instance_profile_arn', None) 329 | if ip_arn not in list_of_profiles: 330 | print("Importing arn: {0}".format(ip_arn)) 331 | resp = self.post('/instance-profiles/add', {'instance_profile_arn': ip_arn}) 332 | if 'error_code' in resp: 333 | print("Error") 334 | else: 335 | import_profiles_count += 1 336 | print(resp) 337 | else: 338 | print("Skipping since profile exists: {0}".format(ip_arn)) 339 | return import_profiles_count 340 | 341 | def is_spark_3(self, cid): 342 | spark_version = self.get(f'/clusters/get?cluster_id={cid}').get('spark_version', "") 343 | if spark_version[0] >= '7': 344 | return True 345 | else: 346 | return False 347 | 348 | def launch_cluster(self, iam_role=None): 349 | """ Launches a cluster to get DDL statements. 
350 | Returns a cluster_id """ 351 | # removed for now as Spark 3.0 will have backwards incompatible changes 352 | # version = self.get_latest_spark_version() 353 | import os 354 | real_path = os.path.dirname(os.path.realpath(__file__)) 355 | if self.is_aws(): 356 | with open(real_path + '/../data/aws_cluster.json', 'r') as fp: 357 | cluster_json = json.loads(fp.read()) 358 | if iam_role: 359 | aws_attr = cluster_json['aws_attributes'] 360 | print("Creating cluster with: " + iam_role) 361 | aws_attr['instance_profile_arn'] = iam_role 362 | cluster_json['aws_attributes'] = aws_attr 363 | else: 364 | with open(real_path + '/../data/azure_cluster.json', 'r') as fp: 365 | cluster_json = json.loads(fp.read()) 366 | # set the latest spark release regardless of defined cluster json 367 | # cluster_json['spark_version'] = version['key'] 368 | cluster_name = cluster_json['cluster_name'] 369 | existing_cid = self.get_cluster_id_by_name(cluster_name) 370 | if existing_cid: 371 | # if the cluster id exists, then a cluster exists in a terminated state. let's start it 372 | cid = self.start_cluster_by_name(cluster_name) 373 | return cid 374 | else: 375 | print("Starting cluster with name: {0} ".format(cluster_name)) 376 | c_info = self.post('/clusters/create', cluster_json) 377 | if c_info['http_status_code'] != 200: 378 | raise Exception("Could not launch cluster. Verify that the --azure flag or cluster config is correct.") 379 | self.wait_for_cluster(c_info['cluster_id']) 380 | return c_info['cluster_id'] 381 | 382 | def log_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None): 383 | """ 384 | Log the current cluster configs in json file 385 | :param log_file: log the cluster configs 386 | :param acl_log_file: log the ACL definitions 387 | :param filter_user: user name to filter and log the cluster config 388 | :return: 389 | """ 390 | cluster_log = self.get_export_dir() + log_file 391 | acl_cluster_log = self.get_export_dir() + acl_log_file 392 | # pinned by cluster_user is a flag per cluster 393 | cl_raw = self.get_cluster_list(False) 394 | cluster_list = self.remove_automated_clusters(cl_raw) 395 | ip_list = self.get('/instance-profiles/list').get('instance_profiles', []) 396 | nonempty_ip_list = [] 397 | if ip_list: 398 | # filter none if we hit a profile w/ a none object 399 | # generate list of registered instance profiles to check cluster configs against 400 | nonempty_ip_list = list(filter(None, [x.get('instance_profile_arn', None) for x in ip_list])) 401 | 402 | # filter on these items as MVP of the cluster configs 403 | # https://docs.databricks.com/api/latest/clusters.html#request-structure 404 | with open(cluster_log, 'w') as log_fp, open(acl_cluster_log, 'w') as acl_log_fp: 405 | for cluster_json in cluster_list: 406 | run_properties = set(list(cluster_json.keys())) - self.create_configs 407 | for p in run_properties: 408 | del cluster_json[p] 409 | if 'aws_attributes' in cluster_json: 410 | aws_conf = cluster_json.pop('aws_attributes') 411 | iam_role = aws_conf.get('instance_profile_arn', None) 412 | if iam_role and ip_list: 413 | if iam_role not in nonempty_ip_list: 414 | print("Skipping log of default IAM role: " + iam_role) 415 | del aws_conf['instance_profile_arn'] 416 | cluster_json['aws_attributes'] = aws_conf 417 | cluster_json['aws_attributes'] = aws_conf 418 | cluster_perms = self.get_cluster_acls(cluster_json['cluster_id'], cluster_json['cluster_name']) 419 | acl_log_fp.write(json.dumps(cluster_perms) + '\n') 420 | if filter_user: 
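                # Note: the filter_user branch below writes only clusters created by that user to
                # clusters.log, while the ACL entries above are still logged for every cluster kept
                # after remove_automated_clusters(). Illustrative call only, assuming a client built
                # from the same config used elsewhere in these scripts (hypothetical email value):
                #   ClustersClient(client_config).log_cluster_configs(filter_user='data.engineer@example.com')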
421 | if cluster_json['creator_user_name'] == filter_user: 422 | log_fp.write(json.dumps(cluster_json) + '\n') 423 | else: 424 | log_fp.write(json.dumps(cluster_json) + '\n') 425 | 426 | def log_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='acl_cluster_policies.log'): 427 | policies_log = self.get_export_dir() + log_file 428 | acl_policies_log = self.get_export_dir() + acl_log_file 429 | # log all cluster policy definitions 430 | policy_ids = {} 431 | policies_list = self.get('/policies/clusters/list').get('policies', []) 432 | with open(policies_log, 'w') as fp: 433 | for x in policies_list: 434 | policy_ids[x.get('policy_id')] = x.get('name') 435 | fp.write(json.dumps(x) + '\n') 436 | # log cluster policy ACLs, which takes a policy id as arguments 437 | with open(acl_policies_log, 'w') as acl_fp: 438 | for pid in policy_ids: 439 | api = f'/preview/permissions/cluster-policies/{pid}' 440 | perms = self.get(api) 441 | perms['name'] = policy_ids[pid] 442 | acl_fp.write(json.dumps(perms) + '\n') 443 | 444 | def log_instance_pools(self, log_file='instance_pools.log'): 445 | pool_log = self.get_export_dir() + log_file 446 | pools = self.get('/instance-pools/list').get('instance_pools', None) 447 | if pools: 448 | with open(pool_log, "w") as fp: 449 | for x in pools: 450 | fp.write(json.dumps(x) + '\n') 451 | 452 | def log_instance_profiles(self, log_file='instance_profiles.log'): 453 | ip_log = self.get_export_dir() + log_file 454 | ips = self.get('/instance-profiles/list').get('instance_profiles', None) 455 | if ips: 456 | with open(ip_log, "w") as fp: 457 | for x in ips: 458 | fp.write(json.dumps(x) + '\n') 459 | 460 | def remove_automated_clusters(self, cluster_list, log_file='skipped_clusters.log'): 461 | """ 462 | Automated clusters like job clusters or model endpoints should be excluded 463 | :param cluster_list: list of cluster configurations 464 | :return: cleaned list with automated clusters removed 465 | """ 466 | # model endpoint clusters start with the following 467 | ml_model_pattern = "mlflow-model-" 468 | # job clusters have specific format, job-JOBID-run-RUNID 469 | re_expr = re.compile("job-\d+-run-\d+$") 470 | clean_cluster_list = [] 471 | with open(self.get_export_dir() + log_file, 'w') as log_fp: 472 | for cluster in cluster_list: 473 | cluster_name = cluster['cluster_name'] 474 | if re_expr.match(cluster_name) or cluster_name.startswith(ml_model_pattern): 475 | log_fp.write(json.dumps(cluster) + '\n') 476 | else: 477 | clean_cluster_list.append(cluster) 478 | return clean_cluster_list 479 | 480 | def start_cluster_by_name(self, cluster_name): 481 | cid = self.get_cluster_id_by_name(cluster_name) 482 | if cid is None: 483 | raise Exception('Error: Cluster name does not exist') 484 | print("Starting {0} with id {1}".format(cluster_name, cid)) 485 | resp = self.post('/clusters/start', {'cluster_id': cid}) 486 | if 'error_code' in resp: 487 | if resp.get('error_code', None) == 'INVALID_STATE': 488 | print('Error: {0}'.format(resp.get('message', None))) 489 | else: 490 | raise Exception('Error: cluster does not exist, or is in a state that is unexpected. ' 491 | 'Cluster should either be terminated state, or already running.') 492 | self.wait_for_cluster(cid) 493 | return cid 494 | 495 | def submit_command(self, cid, ec_id, cmd): 496 | # This launches spark commands and print the results. 
We can pull out the text results from the API 497 | command_payload = {'language': 'python', 498 | 'contextId': ec_id, 499 | 'clusterId': cid, 500 | 'command': cmd} 501 | command = self.post('/commands/execute', 502 | json_params=command_payload, 503 | version="1.2") 504 | 505 | com_id = command.get('id', None) 506 | if not com_id: 507 | print("ERROR: ") 508 | print(command) 509 | # print('command_id : ' + com_id) 510 | result_payload = {'clusterId': cid, 'contextId': ec_id, 'commandId': com_id} 511 | 512 | resp = self.get('/commands/status', json_params=result_payload, version="1.2") 513 | is_running = self.get_key(resp, 'status') 514 | 515 | # loop through the status api to check for the 'running' state call and sleep 1 second 516 | while (is_running == "Running") or (is_running == 'Queued'): 517 | resp = self.get('/commands/status', json_params=result_payload, version="1.2") 518 | is_running = self.get_key(resp, 'status') 519 | time.sleep(1) 520 | end_result_status = self.get_key(resp, 'status') 521 | end_results = self.get_key(resp, 'results') 522 | if end_results.get('resultType', None) == 'error': 523 | print("ERROR: ") 524 | print(end_results.get('summary', None)) 525 | return end_results 526 | 527 | def wait_for_cluster(self, cid): 528 | c_state = self.get('/clusters/get', {'cluster_id': cid}) 529 | while c_state['state'] != 'RUNNING' and c_state['state'] != 'TERMINATED': 530 | c_state = self.get('/clusters/get', {'cluster_id': cid}) 531 | print('Cluster state: {0}'.format(c_state['state'])) 532 | time.sleep(2) 533 | if c_state['state'] == 'TERMINATED': 534 | raise RuntimeError("Cluster is terminated. Please check EVENT history for details") 535 | return cid 536 | 537 | -------------------------------------------------------------------------------- /dbclient/WorkspaceClient.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from dbclient import * 3 | from timeit import default_timer as timer 4 | from datetime import timedelta 5 | import os 6 | 7 | WS_LIST = "/workspace/list" 8 | WS_STATUS = "/workspace/get-status" 9 | WS_MKDIRS = "/workspace/mkdirs" 10 | WS_IMPORT = "/workspace/import" 11 | WS_EXPORT = "/workspace/export" 12 | LS_ZONES = "/clusters/list-zones" 13 | 14 | 15 | class WorkspaceClient(ScimClient): 16 | _languages = {'.py': 'PYTHON', 17 | '.scala': 'SCALA', 18 | '.r': 'R', 19 | '.sql': 'SQL'} 20 | 21 | def get_language(self, file_ext): 22 | return self._languages[file_ext] 23 | 24 | def get_top_level_folders(self): 25 | # get top level folders excluding the /Users path 26 | supported_types = ('NOTEBOOK', 'DIRECTORY') 27 | root_items = self.get(WS_LIST, {'path': '/'}).get('objects', []) 28 | # filter out Projects and Users folders 29 | non_users_dir = list(filter(lambda x: (x.get('path') != '/Users' and x.get('path') != '/Projects'), 30 | root_items)) 31 | dirs_and_nbs = list(filter(lambda x: (x.get('object_type') in supported_types), 32 | non_users_dir)) 33 | return dirs_and_nbs 34 | 35 | def export_top_level_folders(self): 36 | ls_tld = self.get_top_level_folders() 37 | logged_nb_count = 0 38 | for tld_obj in ls_tld: 39 | # obj has 3 keys, object_type, path, object_id 40 | tld_path = tld_obj.get('path') 41 | log_count = self.log_all_workspace_items(ws_path=tld_path) 42 | logged_nb_count += log_count 43 | dl_nb_count = self.download_notebooks() 44 | print(f'Total logged notebooks: {logged_nb_count}') 45 | print(f'Total Downloaded notebooks: {dl_nb_count}') 46 | 47 | def get_user_import_args(self, full_local_path, 
nb_full_path): 48 | """ 49 | helper function to define the import parameters to upload a notebook object 50 | :param full_local_path: full local path of the notebook to read 51 | :param nb_full_path: full destination path, e.g. /Users/foo@db.com/bar.dbc . Includes extension / type 52 | :return: return the full input args to upload to the destination system 53 | """ 54 | is_source_format = self.is_source_file_format() 55 | fp = open(full_local_path, "rb") 56 | (nb_path_dest, nb_type) = os.path.splitext(nb_full_path) 57 | in_args = { 58 | "content": base64.encodebytes(fp.read()).decode('utf-8'), 59 | "path": nb_path_dest, 60 | "format": self.get_file_format() 61 | } 62 | if is_source_format: 63 | in_args['language'] = self.get_language(nb_type) 64 | in_args['object_type'] = 'NOTEBOOK' 65 | return in_args 66 | 67 | @staticmethod 68 | def build_ws_lookup_table(success_ws_logfile): 69 | ws_hashmap = set() 70 | with open(success_ws_logfile, 'r') as fp: 71 | for line in fp: 72 | ws_hashmap.add(line.rstrip()) 73 | return ws_hashmap 74 | 75 | @staticmethod 76 | def is_user_ws_item(ws_dir): 77 | """ 78 | Checks if this is a user artifact / notebook. 79 | We can't create user home folders, hence we need to identify user items 80 | """ 81 | path_list = [x for x in ws_dir.split('/') if x] 82 | if len(path_list) >= 2 and path_list[0] == 'Users': 83 | return True 84 | return False 85 | 86 | @staticmethod 87 | def is_user_ws_root(ws_dir): 88 | """ 89 | Check if we're at the users home folder to skip folder creation 90 | """ 91 | if ws_dir == '/Users/' or ws_dir == '/Users': 92 | return True 93 | path_list = [x for x in ws_dir.split('/') if x] 94 | if len(path_list) == 2 and path_list[0] == 'Users': 95 | return True 96 | return False 97 | 98 | @staticmethod 99 | def get_user(ws_dir): 100 | """ 101 | returns the username of the workspace / folder path 102 | """ 103 | path_list = [x for x in ws_dir.split('/') if x] 104 | if len(path_list) < 2: 105 | raise ValueError("Error: Not a users workspace directory") 106 | return path_list[1] 107 | 108 | @staticmethod 109 | def is_user_trash(ws_path): 110 | """ 111 | checks if this is the users home folder trash directory, which is a special dir 112 | """ 113 | path_list = ws_path.split('/') 114 | if len(path_list) == 4: 115 | if path_list[1] == 'Users' and path_list[3] == 'Trash': 116 | return True 117 | return False 118 | 119 | def is_user_home_empty(self, username): 120 | user_root = '/Users/' + username.rstrip().lstrip() 121 | get_args = {'path': user_root} 122 | items = self.get(WS_LIST, get_args).get('objects', None) 123 | if items: 124 | folders = self.filter_workspace_items(items, 'DIRECTORY') 125 | notebooks = self.filter_workspace_items(items, 'NOTEBOOK') 126 | # if both notebooks and directories are empty, return true 127 | if not folders and not notebooks: 128 | return True 129 | return False 130 | return True 131 | 132 | @staticmethod 133 | def get_num_of_saved_users(export_dir): 134 | """ 135 | returns the number of exported user items to check against number of created users in the new workspace 136 | this helps identify if the new workspace is ready for the import, or if we should skip / archive failed imports 137 | """ 138 | # get current number of saved workspaces 139 | user_home_dir = export_dir + 'Users' 140 | num_of_users = 0 141 | if os.path.exists(user_home_dir): 142 | ls = os.listdir(user_home_dir) 143 | for x in ls: 144 | if os.path.isdir(user_home_dir + '/' + x): 145 | num_of_users += 1 146 | return num_of_users 147 | 148 | def 
export_user_home(self, username, local_export_dir): 149 | """ 150 | Export the provided user's home directory 151 | :param username: user's home directory to export 152 | :param local_export_dir: folder location to do single user exports 153 | :return: None 154 | """ 155 | original_export_dir = self.get_export_dir() 156 | user_export_dir = self.get_export_dir() + local_export_dir 157 | user_root = '/Users/' + username.rstrip().lstrip() 158 | self.set_export_dir(user_export_dir + '/{0}/'.format(username)) 159 | print("Export path: {0}".format(self.get_export_dir())) 160 | num_of_nbs = self.log_all_workspace_items(ws_path=user_root) 161 | if num_of_nbs == 0: 162 | raise ValueError('User does not have any notebooks in this path. Please verify the case of the email') 163 | num_of_nbs_dl = self.download_notebooks(ws_dir='user_artifacts/') 164 | print(f"Total notebooks logged: {num_of_nbs}") 165 | print(f"Total notebooks downloaded: {num_of_nbs_dl}") 166 | if num_of_nbs != num_of_nbs_dl: 167 | print(f"Notebooks logged != downloaded. Check the failed download file at: {user_export_dir}") 168 | # reset the original export dir for other calls to this method using the same client 169 | self.set_export_dir(original_export_dir) 170 | 171 | def import_user_home(self, username, local_export_dir): 172 | """ 173 | Import the provided user's home directory 174 | logs/user_exports/{{USERNAME}}/ stores the log files to understand what was exported 175 | logs/user_exports/{{USERNAME}}/user_artifacts/ stores the notebook contents 176 | :param username: user's home directory to export 177 | :param local_export_dir: the log directory for this users workspace items 178 | :return: None 179 | """ 180 | original_export_dir = self.get_export_dir() 181 | user_import_dir = self.get_export_dir() + local_export_dir 182 | if self.does_user_exist(username): 183 | print("Yes, we can upload since the user exists") 184 | else: 185 | print("User must exist before we upload the notebook contents. 
Please add the user to the platform first") 186 | user_root = '/Users/' + username.rstrip().lstrip() 187 | self.set_export_dir(user_import_dir + '/{0}/'.format(username)) 188 | print("Import local path: {0}".format(self.get_export_dir())) 189 | notebook_dir = self.get_export_dir() + 'user_artifacts/' 190 | for root, subdirs, files in os.walk(notebook_dir): 191 | upload_dir = '/' + root.replace(notebook_dir, '') 192 | # if the upload dir is the 2 root directories, skip and continue 193 | if upload_dir == '/' or upload_dir == '/Users': 194 | continue 195 | if not self.is_user_ws_root(upload_dir): 196 | # if it is not the /Users/example@example.com/ root path, don't create the folder 197 | resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) 198 | print(resp_mkdirs) 199 | for f in files: 200 | # get full path for the local notebook file 201 | local_file_path = os.path.join(root, f) 202 | # create upload path and remove file format extension 203 | ws_file_path = upload_dir + '/' + f 204 | # generate json args with binary data for notebook to upload to the workspace path 205 | nb_input_args = self.get_user_import_args(local_file_path, ws_file_path) 206 | # call import to the workspace 207 | if self.is_verbose(): 208 | print("Path: {0}".format(nb_input_args['path'])) 209 | resp_upload = self.post(WS_IMPORT, nb_input_args) 210 | if self.is_verbose(): 211 | print(resp_upload) 212 | self.set_export_dir(original_export_dir) 213 | 214 | def download_notebooks(self, ws_log_file='user_workspace.log', ws_dir='artifacts/'): 215 | """ 216 | Loop through all notebook paths in the logfile and download individual notebooks 217 | :param ws_log_file: logfile for all notebook paths in the workspace 218 | :param ws_dir: export directory to store all notebooks 219 | :return: None 220 | """ 221 | ws_log = self.get_export_dir() + ws_log_file 222 | num_notebooks = 0 223 | if not os.path.exists(ws_log): 224 | raise Exception("Run --workspace first to download full log of all notebooks.") 225 | with open(ws_log, "r") as fp: 226 | # notebook log metadata file now contains object_id to help w/ ACL exports 227 | # pull the path from the data to download the individual notebook contents 228 | for notebook_data in fp: 229 | notebook_path = json.loads(notebook_data).get('path', None).rstrip() 230 | dl_resp = self.download_notebook_helper(notebook_path, export_dir=self.get_export_dir() + ws_dir) 231 | if 'error_code' not in dl_resp: 232 | num_notebooks += 1 233 | return num_notebooks 234 | 235 | def download_notebook_helper(self, notebook_path, export_dir='artifacts/'): 236 | """ 237 | Helper function to download an individual notebook, or log the failure in the failure logfile 238 | :param notebook_path: an individual notebook path 239 | :param export_dir: directory to store all notebooks 240 | :return: return the notebook path that's successfully downloaded 241 | """ 242 | get_args = {'path': notebook_path, 'format': self.get_file_format()} 243 | if self.is_verbose(): 244 | print("Downloading: {0}".format(get_args['path'])) 245 | resp = self.get(WS_EXPORT, get_args) 246 | with open(self.get_export_dir() + 'failed_notebooks.log', 'a') as err_log: 247 | if resp.get('error_code', None): 248 | err_msg = {'error_code': resp.get('error_code'), 'path': notebook_path} 249 | err_log.write(json.dumps(err_msg) + '\n') 250 | return err_msg 251 | nb_path = os.path.dirname(notebook_path) 252 | if nb_path != '/': 253 | # path is NOT empty, remove the trailing slash from export_dir 254 | save_path = export_dir[:-1] + nb_path + '/' 255 
| else: 256 | save_path = export_dir 257 | save_filename = save_path + os.path.basename(notebook_path) + '.' + resp.get('file_type') 258 | # If the local path doesn't exist,we create it before we save the contents 259 | if not os.path.exists(save_path) and save_path: 260 | os.makedirs(save_path, exist_ok=True) 261 | with open(save_filename, "wb") as f: 262 | f.write(base64.b64decode(resp['content'])) 263 | return {'path': notebook_path} 264 | 265 | def filter_workspace_items(self, item_list, item_type): 266 | """ 267 | Helper function to filter on different workspace types. 268 | :param item_list: iterable of workspace items 269 | :param item_type: DIRECTORY, NOTEBOOK, LIBRARY 270 | :return: list of items filtered by type 271 | """ 272 | supported_types = {'DIRECTORY', 'NOTEBOOK', 'LIBRARY'} 273 | if item_type not in supported_types: 274 | raise ValueError('Unsupported type provided: {0}.\n. Supported types: {1}'.format(item_type, 275 | str(supported_types))) 276 | filtered_list = list(self.my_map(lambda y: {'path': y.get('path', None), 277 | 'object_id': y.get('object_id', None)}, 278 | filter(lambda x: x.get('object_type', None) == item_type, item_list))) 279 | return filtered_list 280 | 281 | def init_workspace_logfiles(self, workspace_log_file='user_workspace.log', 282 | libs_log_file='libraries.log', workspace_dir_log_file='user_dirs.log'): 283 | """ 284 | initialize the logfile locations since we run a recursive function to download notebooks 285 | """ 286 | workspace_log = self.get_export_dir() + workspace_log_file 287 | libs_log = self.get_export_dir() + libs_log_file 288 | workspace_dir_log = self.get_export_dir() + workspace_dir_log_file 289 | if os.path.exists(workspace_log): 290 | os.remove(workspace_log) 291 | if os.path.exists(workspace_dir_log): 292 | os.remove(workspace_dir_log) 293 | if os.path.exists(libs_log): 294 | os.remove(libs_log) 295 | 296 | def log_all_workspace_items(self, ws_path='/', workspace_log_file='user_workspace.log', 297 | libs_log_file='libraries.log', dir_log_file='user_dirs.log'): 298 | """ 299 | Loop and log all workspace items to download them at a later time 300 | :param ws_path: root path to log all the items of the notebook workspace 301 | :param workspace_log_file: logfile to store all the paths of the notebooks 302 | :param libs_log_file: library logfile to store workspace libraries 303 | :param dir_log_file: log directory for users 304 | :return: 305 | """ 306 | # define log file names for notebooks, folders, and libraries 307 | workspace_log = self.get_export_dir() + workspace_log_file 308 | workspace_dir_log = self.get_export_dir() + dir_log_file 309 | libs_log = self.get_export_dir() + libs_log_file 310 | if ws_path == '/': 311 | # default is the root path 312 | get_args = {'path': '/'} 313 | else: 314 | get_args = {'path': ws_path} 315 | 316 | if not os.path.exists(self.get_export_dir()): 317 | os.makedirs(self.get_export_dir(), exist_ok=True) 318 | items = self.get(WS_LIST, get_args).get('objects', None) 319 | num_nbs = 0 320 | if self.is_verbose(): 321 | print("Listing: {0}".format(get_args['path'])) 322 | if items is not None: 323 | # list all the users folders only 324 | folders = self.filter_workspace_items(items, 'DIRECTORY') 325 | # should be no notebooks, but lets filter and can check later 326 | notebooks = self.filter_workspace_items(items, 'NOTEBOOK') 327 | libraries = self.filter_workspace_items(items, 'LIBRARY') 328 | with open(workspace_log, "a") as ws_fp, open(libs_log, "a") as libs_fp: 329 | for x in notebooks: 330 | # 
notebook objects has path and object_id 331 | if self.is_verbose(): 332 | print("Saving path: {0}".format(x.get('path'))) 333 | ws_fp.write(json.dumps(x) + '\n') 334 | num_nbs += 1 335 | for y in libraries: 336 | libs_fp.write(json.dumps(y) + '\n') 337 | # log all directories to export permissions 338 | if folders: 339 | with open(workspace_dir_log, "a") as dir_fp: 340 | for f in folders: 341 | dir_path = f.get('path', None) 342 | if not WorkspaceClient.is_user_trash(dir_path): 343 | dir_fp.write(json.dumps(f) + '\n') 344 | num_nbs += self.log_all_workspace_items(ws_path=dir_path, 345 | workspace_log_file=workspace_log_file, 346 | libs_log_file=libs_log_file) 347 | return num_nbs 348 | 349 | def get_obj_id_by_path(self, input_path): 350 | resp = self.get(WS_STATUS, {'path': input_path}) 351 | obj_id = resp.get('object_id', None) 352 | return obj_id 353 | 354 | def log_acl_to_file(self, artifact_type, read_log_filename, write_log_filename, failed_log_filename): 355 | """ 356 | generic function to log the notebook/directory ACLs to specific file names 357 | :param artifact_type: set('notebooks', 'directories') ACLs to be logged 358 | :param read_log_filename: the list of the notebook paths / object ids 359 | :param write_log_filename: output file to store object_id acls 360 | :param failed_log_filename: failed acl logs for resources, should be empty 361 | """ 362 | read_log_path = self.get_export_dir() + read_log_filename 363 | write_log_path = self.get_export_dir() + write_log_filename 364 | failed_log_path = self.get_export_dir() + failed_log_filename 365 | with open(read_log_path, 'r') as read_fp, open(write_log_path, 'w') as write_fp, \ 366 | open(failed_log_path, 'w') as failed_fp: 367 | for x in read_fp: 368 | data = json.loads(x) 369 | obj_id = data.get('object_id', None) 370 | api_endpoint = '/permissions/{0}/{1}'.format(artifact_type, obj_id) 371 | acl_resp = self.get(api_endpoint) 372 | acl_resp['path'] = data.get('path') 373 | if 'error_code' in acl_resp: 374 | failed_fp.write(json.dumps(acl_resp) + '\n') 375 | continue 376 | acl_resp.pop('http_status_code') 377 | write_fp.write(json.dumps(acl_resp) + '\n') 378 | 379 | def log_all_workspace_acls(self, workspace_log_file='user_workspace.log', 380 | dir_log_file='user_dirs.log'): 381 | """ 382 | loop through all notebooks and directories to store their associated ACLs 383 | :param workspace_log_file: input file for user notebook listing 384 | :param dir_log_file: input file for user directory listing 385 | """ 386 | # define log file names for notebooks, folders, and libraries 387 | print("Exporting the notebook permissions") 388 | start = timer() 389 | self.log_acl_to_file('notebooks', workspace_log_file, 'acl_notebooks.log', 'failed_acl_notebooks.log') 390 | end = timer() 391 | print("Complete Notebook ACLs Export Time: " + str(timedelta(seconds=end - start))) 392 | print("Exporting the directories permissions") 393 | start = timer() 394 | self.log_acl_to_file('directories', dir_log_file, 'acl_directories.log', 'failed_acl_directories.log') 395 | end = timer() 396 | print("Complete Directories ACLs Export Time: " + str(timedelta(seconds=end - start))) 397 | 398 | def apply_acl_on_object(self, acl_str): 399 | """ 400 | apply the acl definition to the workspace object 401 | object_id comes from the export data which contains '/type/id' format for this key 402 | the object_id contains the {{/type/object_id}} format which helps craft the api endpoint 403 | setting acl definitions uses the patch rest api verb 404 | :param acl_str: the 
complete string from the logfile. contains object defn and acl lists 405 | """ 406 | object_acl = json.loads(acl_str) 407 | # the object_type 408 | object_type = object_acl.get('object_type', None) 409 | obj_path = object_acl['path'] 410 | obj_status = self.get(WS_STATUS, {'path': obj_path}) 411 | print("ws-stat: ", obj_status) 412 | current_obj_id = obj_status.get('object_id', None) 413 | if not current_obj_id: 414 | print('Object id missing from destination workspace', obj_path) 415 | return 416 | if object_type == 'directory': 417 | object_id_with_type = f'/directories/{current_obj_id}' 418 | elif object_type == 'notebook': 419 | object_id_with_type = f'/notebooks/{current_obj_id}' 420 | else: 421 | raise ValueError('Object for Workspace ACLs is Undefined') 422 | api_path = '/permissions' + object_id_with_type 423 | acl_list = object_acl.get('access_control_list', None) 424 | api_args = {'access_control_list': self.build_acl_args(acl_list)} 425 | resp = self.patch(api_path, api_args) 426 | print(resp) 427 | return resp 428 | 429 | def import_workspace_acls(self, workspace_log_file='acl_notebooks.log', 430 | dir_log_file='acl_directories.log'): 431 | """ 432 | import the notebook and directory acls by looping over notebook and dir logfiles 433 | """ 434 | dir_acl_logs = self.get_export_dir() + dir_log_file 435 | notebook_acl_logs = self.get_export_dir() + workspace_log_file 436 | with open(notebook_acl_logs) as nb_acls_fp: 437 | for nb_acl_str in nb_acls_fp: 438 | self.apply_acl_on_object(nb_acl_str) 439 | with open(dir_acl_logs) as dir_acls_fp: 440 | for dir_acl_str in dir_acls_fp: 441 | self.apply_acl_on_object(dir_acl_str) 442 | print("Completed import ACLs of Notebooks and Directories") 443 | 444 | def get_current_users(self): 445 | """ 446 | get the num of defined user home directories in the new workspace 447 | if this is 0, we must create the users before importing the notebooks over. 
448 | we cannot create the users home directory since its a special type of directory 449 | """ 450 | ws_users = self.get(WS_LIST, {'path': '/Users/'}).get('objects', None) 451 | if ws_users: 452 | return len(ws_users) 453 | else: 454 | return 0 455 | 456 | def does_user_exist(self, username): 457 | """ 458 | check if the users home dir exists 459 | """ 460 | stat = self.get(WS_STATUS, {'path': '/Users/{0}'.format(username)}) 461 | if stat.get('object_type', None) == 'DIRECTORY': 462 | return True 463 | return False 464 | 465 | def does_path_exist(self, dir_path): 466 | status_resp = self.get(WS_STATUS, {'path': dir_path}) 467 | if 'error_code' in status_resp: 468 | if status_resp.get('error_code') == 'RESOURCE_DOES_NOT_EXIST': 469 | return False 470 | else: 471 | print('Failure:' + json.dumps(status_resp)) 472 | return False 473 | return True 474 | 475 | def import_current_workspace_items(self,artifact_dir='artifacts/'): 476 | src_dir = self.get_export_dir() + artifact_dir 477 | for root, subdirs, files in os.walk(src_dir): 478 | # replace the local directory with empty string to get the notebook workspace directory 479 | nb_dir = '/' + root.replace(src_dir, '') 480 | upload_dir = nb_dir 481 | if not nb_dir == '/': 482 | upload_dir = nb_dir + '/' 483 | if not self.does_path_exist(upload_dir): 484 | resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) 485 | for f in files: 486 | print("Uploading: {0}".format(f)) 487 | # create the local file path to load the DBC file 488 | local_file_path = os.path.join(root, f) 489 | # create the ws full file path including filename 490 | ws_file_path = upload_dir + f 491 | # generate json args with binary data for notebook to upload to the workspace path 492 | nb_input_args = self.get_user_import_args(local_file_path, ws_file_path) 493 | # call import to the workspace 494 | if self.is_verbose(): 495 | print("Path: {0}".format(nb_input_args['path'])) 496 | resp_upload = self.post(WS_IMPORT, nb_input_args) 497 | 498 | def import_all_workspace_items(self, artifact_dir='artifacts/', success_log='success_ws_import.log', 499 | failed_log='failed_ws_import.log', archive_missing=False, restart_from_last=False): 500 | """ 501 | import all notebooks into a new workspace 502 | :param artifact_dir: notebook download directory 503 | :param success_log: success log to allow recovery from last successful upload 504 | :param failed_log: failed import log 505 | :param archive_missing: whether to put missing users into a /Archive/ top level directory 506 | :param restart_from_last: flag to restart import and skip past successful imports 507 | """ 508 | src_dir = self.get_export_dir() + artifact_dir 509 | success_logfile = self.get_export_dir() + success_log 510 | failed_logfile = self.get_export_dir() + failed_log 511 | overwrite_or_append = 'w' 512 | if restart_from_last: 513 | # if we're restarting from checkpoint, append to the successful logfile 514 | overwrite_or_append = 'a' 515 | uploaded_hashmap = self.build_ws_lookup_table(success_logfile) 516 | print(uploaded_hashmap) 517 | else: 518 | # delete the log if we start from the beginning 519 | if os.path.exists(success_logfile): 520 | os.remove(success_logfile) 521 | num_exported_users = self.get_num_of_saved_users(src_dir) 522 | num_current_users = self.get_current_users() 523 | if num_current_users == 0: 524 | print("No registered users in existing environment. 
Please import users / groups first.") 525 | raise ValueError("No registered users in the current environment") 526 | if (num_current_users < num_exported_users) and (not archive_missing): 527 | print("Exported number of user workspaces: {0}".format(num_exported_users)) 528 | print("Current number of user workspaces: {0}".format(num_current_users)) 529 | print("Re-run with the `--archive-missing` flag to load missing users into a separate directory") 530 | raise ValueError("Current number of users is less than number of user workspaces to import.") 531 | archive_users = set() 532 | with open(success_logfile, overwrite_or_append) as success_fp, open(failed_logfile, 'w') as failed_fp: 533 | for root, subdirs, files in os.walk(src_dir): 534 | # replace the local directory with empty string to get the notebook workspace directory 535 | nb_dir = '/' + root.replace(src_dir, '') 536 | upload_dir = nb_dir 537 | if not nb_dir == '/': 538 | upload_dir = nb_dir + '/' 539 | if self.is_user_ws_item(upload_dir): 540 | ws_user = self.get_user(upload_dir) 541 | if archive_missing: 542 | if ws_user in archive_users: 543 | upload_dir = upload_dir.replace('Users', 'Archive', 1) 544 | elif not self.does_user_exist(ws_user): 545 | # add the user to the cache / set of missing users 546 | print("User workspace does not exist, adding to archive cache: {0}".format(ws_user)) 547 | archive_users.add(ws_user) 548 | # append the archive path to the upload directory 549 | upload_dir = upload_dir.replace('Users', 'Archive', 1) 550 | else: 551 | print("User workspace exists: {0}".format(ws_user)) 552 | elif not self.does_user_exist(ws_user): 553 | print("User {0} is missing. " 554 | "Please re-run with --archive-missing flag " 555 | "or first verify all users exist in the new workspace".format(ws_user)) 556 | return 557 | else: 558 | print("Uploading for user: {0}".format(ws_user)) 559 | # make the top level folder before uploading files within the loop 560 | if not self.is_user_ws_root(upload_dir): 561 | # if it is not the /Users/example@example.com/ root path, don't create the folder 562 | resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) 563 | for f in files: 564 | print("Uploading: {0}".format(f)) 565 | # create the local file path to load the DBC file 566 | local_file_path = os.path.join(root, f) 567 | # create the ws full file path including filename 568 | ws_file_path = upload_dir + f 569 | if restart_from_last: 570 | if ws_file_path in uploaded_hashmap: 571 | print(f"Skipping upload as file has already been uploaded: {ws_file_path}") 572 | continue 573 | # generate json args with binary data for notebook to upload to the workspace path 574 | nb_input_args = self.get_user_import_args(local_file_path, ws_file_path) 575 | # call import to the workspace 576 | if self.is_verbose(): 577 | print("Path: {0}".format(nb_input_args['path'])) 578 | resp_upload = self.post(WS_IMPORT, nb_input_args) 579 | if 'error_code' in resp_upload: 580 | # log this path to a success logfile 581 | print(f'Error uploading file: {ws_file_path}') 582 | failed_fp.write(json.dumps(resp_upload) + '\n') 583 | else: 584 | success_fp.write(ws_file_path + '\n') 585 | -------------------------------------------------------------------------------- /dbclient/HiveClient.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | import time 4 | import base64 5 | import re 6 | from datetime import timedelta 7 | from timeit import default_timer as timer 8 | from dbclient import * 9 | 10 | 11 
| class HiveClient(ClustersClient): 12 | 13 | @staticmethod 14 | def is_delta_table(local_path): 15 | with open(local_path, 'r') as fp: 16 | for line in fp: 17 | lower_line = line.lower() 18 | if lower_line.startswith('using delta'): 19 | return True 20 | return False 21 | 22 | @staticmethod 23 | def get_ddl_by_keyword_group(local_path): 24 | """ 25 | return a list of DDL strings that are grouped by keyword arguments and their parameters 26 | """ 27 | ddl_statement = [] 28 | parameter_group = [] 29 | with open(local_path, 'r') as fp: 30 | for line in fp: 31 | raw = line.rstrip() 32 | if not raw: 33 | # make sure it's not an empty line, continue if empty 34 | continue 35 | if raw[0] == ' ' or raw[0] == ')': 36 | parameter_group.append(raw) 37 | else: 38 | if parameter_group: 39 | ddl_statement.append(''.join(parameter_group)) 40 | parameter_group = [raw] 41 | ddl_statement.append(''.join(parameter_group)) 42 | return ddl_statement 43 | 44 | @staticmethod 45 | def get_path_option_if_available(stmt): 46 | # parse the OPTIONS keyword and pull out the `path` parameter if it exists 47 | params = re.search(r'\((.*?)\)', stmt).group(1) 48 | params_list = list(map(lambda p: p.lstrip().rstrip(), params.split(','))) 49 | for x in params_list: 50 | if x.startswith('path'): 51 | return f'OPTIONS ( {x} )' 52 | return '' 53 | 54 | def is_table_location_defined(self, local_table_path): 55 | """ check if LOCATION or OPTIONS(path ..) are defined for the table 56 | """ 57 | ddl_statement = self.get_ddl_by_keyword_group(local_table_path) 58 | for keyword_param in ddl_statement: 59 | if keyword_param.startswith('OPTIONS'): 60 | options_param = self.get_path_option_if_available(keyword_param) 61 | if options_param: 62 | # if the return is not empty, the path option is provided which means its an external table 63 | return True 64 | elif keyword_param.startswith('LOCATION'): 65 | # if LOCATION is defined, we know the external table location 66 | return True 67 | return False 68 | 69 | def get_local_tmp_ddl_if_applicable(self, current_local_ddl_path): 70 | """ 71 | method to identify if we should update the current DDL if OPTIONS or TBLPROPERTIES keywords exist 72 | """ 73 | ddl_statement = self.get_ddl_by_keyword_group(current_local_ddl_path) 74 | tmp_ddl_path = self.get_export_dir() + 'tmp_ddl.txt' 75 | return_tmp_file = False 76 | with open(tmp_ddl_path, 'w') as fp: 77 | for keyword_param in ddl_statement: 78 | if keyword_param.startswith('OPTIONS'): 79 | return_tmp_file = True 80 | options_param = self.get_path_option_if_available(keyword_param) 81 | if options_param: 82 | fp.write(options_param + ' ') 83 | continue 84 | elif keyword_param.startswith('TBLPROPERTIES'): 85 | return_tmp_file = True 86 | continue 87 | fp.write(keyword_param + ' ') 88 | if return_tmp_file: 89 | return tmp_ddl_path 90 | else: 91 | os.remove(tmp_ddl_path) 92 | return current_local_ddl_path 93 | 94 | def update_table_ddl(self, local_table_path, db_path): 95 | # check if the database location / path is the default DBFS path 96 | table_name = os.path.basename(local_table_path) 97 | is_db_default_path = db_path.startswith('dbfs:/user/hive/warehouse') 98 | if (not is_db_default_path) and (not self.is_table_location_defined(local_table_path)): 99 | # the LOCATION attribute is not defined and the Database has a custom location defined 100 | # therefore we need to add it to the DDL, e.g. 
dbfs:/db_path/table_name 101 | table_path = db_path + '/' + table_name 102 | location_stmt = f"\nLOCATION '{table_path}'" 103 | with open(local_table_path, 'a') as fp: 104 | fp.write(location_stmt) 105 | return True 106 | return False 107 | 108 | def apply_table_ddl(self, local_table_path, ec_id, cid, db_path, has_unicode=False): 109 | """ 110 | Run DDL command on destination workspace 111 | :param local_table_path: local file path to the table DDL 112 | :param ec_id: execution context id to run remote commands 113 | :param cid: cluster id to connect to 114 | :param db_path: database S3 / Blob Storage / ADLS path for the Database 115 | :param has_unicode: Whether the table definitions have unicode characters. 116 | :return: rest api response 117 | """ 118 | # get file size in bytes 119 | updated_table_status = self.update_table_ddl(local_table_path, db_path) 120 | # update local table ddl to a new temp file with OPTIONS and TBLPROPERTIES removed from the DDL for delta tables 121 | if self.is_delta_table(local_table_path): 122 | local_table_path = self.get_local_tmp_ddl_if_applicable(local_table_path) 123 | 124 | f_size_bytes = os.path.getsize(local_table_path) 125 | if f_size_bytes > 1024 or has_unicode: 126 | # upload first to tmp DBFS path and apply 127 | dbfs_path = '/tmp/migration/tmp_import_ddl.txt' 128 | path_args = {'path': dbfs_path} 129 | del_resp = self.post('/dbfs/delete', path_args) 130 | if self.is_verbose(): 131 | print(del_resp) 132 | file_content_json = {'files': open(local_table_path, 'r')} 133 | put_resp = self.post('/dbfs/put', path_args, files_json=file_content_json) 134 | if self.is_verbose(): 135 | print(put_resp) 136 | spark_big_ddl_cmd = f'with open("/dbfs{dbfs_path}", "r") as fp: tmp_ddl = fp.read(); spark.sql(tmp_ddl)' 137 | ddl_results = self.submit_command(cid, ec_id, spark_big_ddl_cmd) 138 | return ddl_results 139 | else: 140 | with open(local_table_path, "r") as fp: 141 | ddl_statement = fp.read() 142 | spark_ddl_statement = self.get_spark_ddl(ddl_statement) 143 | ddl_results = self.submit_command(cid, ec_id, spark_ddl_statement) 144 | return ddl_results 145 | 146 | def check_if_instance_profiles_exists(self, log_file='instance_profiles.log'): 147 | ip_log = self.get_export_dir() + log_file 148 | ips = self.get('/instance-profiles/list').get('instance_profiles', None) 149 | if ips: 150 | with open(ip_log, "w") as fp: 151 | for x in ips: 152 | fp.write(json.dumps(x) + '\n') 153 | return True 154 | return False 155 | 156 | def create_database_db(self, db_name, ec_id, cid, db_attributes): 157 | location = db_attributes.get('Location', '') 158 | if not location.startswith('dbfs:/user/hive/warehouse/'): 159 | create_stmt = f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION '{location}'" 160 | else: 161 | create_stmt = f"CREATE DATABASE IF NOT EXISTS {db_name}" 162 | create_db_sql = f'spark.sql("{create_stmt}")' 163 | db_results = self.submit_command(cid, ec_id, create_db_sql) 164 | return db_results 165 | 166 | def get_database_detail_dict(self, db_log='database_details.log'): 167 | db_logfile = self.get_export_dir() + db_log 168 | all_db_json = {} 169 | with open(db_logfile, 'r') as fp: 170 | for x in fp: 171 | db_json = json.loads(x) 172 | db_name = db_json.pop('Database Name') 173 | all_db_json[db_name] = db_json 174 | return all_db_json 175 | 176 | def set_desc_database_helper(self, cid, ec_id): 177 | """ 178 | define the helper function on the cluster 179 | :param cid: cluster id to run against 180 | :param ec_id: execution id, aka spark session id 181 | 
:return: api response object 182 | """ 183 | # replacement strings 184 | helper_func_cmd1 = """def get_db_json(db_name): import json; rows = spark.sql(f"DESC DATABASE EXTENDED \ 185 | {db_name}").toJSON().collect(); return list(map(lambda x: json.loads(x), rows))""" 186 | helper_func_cmd2 = """def format_db_json(db_list): return dict(list(map(lambda x: \ 187 | (x.get('database_description_item'), x.get('database_description_value')), db_list)))""" 188 | helper_func_cmd3 = "def get_db_details(db_name): return format_db_json(get_db_json(db_name))" 189 | resp1 = self.submit_command(cid, ec_id, helper_func_cmd1) 190 | resp2 = self.submit_command(cid, ec_id, helper_func_cmd2) 191 | resp3 = self.submit_command(cid, ec_id, helper_func_cmd3) 192 | return resp3 193 | 194 | def get_desc_database_details(self, db_name, cid, ec_id): 195 | """ 196 | Returns a dict object of the `desc database extended {db_name}` command to include location, comment, etc fields 197 | :param db_name: database name to fetch 198 | :param cid: cluster id 199 | :param ec_id: execution id aka spark context id 200 | :return: database json object 201 | """ 202 | desc_database_cmd = f'print(get_db_details(\"{db_name}\"))' 203 | results = self.submit_command(cid, ec_id, desc_database_cmd) 204 | if results['resultType'] != 'text': 205 | print(json.dumps(results) + '\n') 206 | raise ValueError("Desc database extended failure") 207 | db_json = ast.literal_eval(results['data']) 208 | return db_json 209 | 210 | def export_database(self, db_name, cluster_name=None, iam_role=None, metastore_dir='metastore/', 211 | fail_log='failed_metastore.log', success_log='success_metastore.log', 212 | has_unicode=False, db_log='database_details.log'): 213 | """ 214 | :param db_name: database name 215 | :param cluster_name: cluster to run against if provided 216 | :param iam_role: iam role to launch the cluster with 217 | :param metastore_dir: directory to store all the metadata 218 | :param has_unicode: whether the metadata has unicode characters to export 219 | :param db_log: specific database properties logfile 220 | :return: 221 | """ 222 | # check if instance profile exists, ask users to use --users first or enter yes to proceed. 
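        # For reference, each line later written to database_details.log is the dict returned by
        # get_desc_database_details(), i.e. the `DESC DATABASE EXTENDED` rows reshaped into
        # item/value pairs. Hypothetical example line (illustrative values only):
        #   {"Database Name": "sales_db", "Location": "dbfs:/mnt/lake/sales_db.db", ...}
        # create_database_db() reads the 'Location' key on import to decide whether the database
        # must be re-created with an explicit LOCATION clause.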
    def export_database(self, db_name, cluster_name=None, iam_role=None, metastore_dir='metastore/',
                        fail_log='failed_metastore.log', success_log='success_metastore.log',
                        has_unicode=False, db_log='database_details.log'):
        """
        Export a single database's properties and table DDL
        :param db_name: database name
        :param cluster_name: cluster to run against if provided
        :param iam_role: iam role to launch the cluster with
        :param metastore_dir: directory to store all the metadata
        :param fail_log: logfile of tables that failed to export
        :param success_log: logfile of tables exported successfully
        :param has_unicode: whether the metadata has unicode characters to export
        :param db_log: specific database properties logfile
        :return: None; writes the database details and table DDL to the export directory
        """
        # launch a cluster (or reuse the named one) and capture its IAM role to tag successful exports
        start = timer()
        if cluster_name:
            cid = self.start_cluster_by_name(cluster_name)
            current_iam = self.get_iam_role_by_cid(cid)
        else:
            current_iam = iam_role
            cid = self.launch_cluster(current_iam)
        end = timer()
        print("Cluster creation time: " + str(timedelta(seconds=end - start)))
        time.sleep(5)
        ec_id = self.get_execution_context(cid)
        # if the metastore failed / success log paths exist, clean up before re-running
        failed_metastore_log_path = self.get_export_dir() + fail_log
        success_metastore_log_path = self.get_export_dir() + success_log
        if os.path.exists(failed_metastore_log_path):
            os.remove(failed_metastore_log_path)
        if os.path.exists(success_metastore_log_path):
            os.remove(success_metastore_log_path)
        database_logfile = self.get_export_dir() + db_log
        resp = self.set_desc_database_helper(cid, ec_id)
        if self.is_verbose():
            print(resp)
        with open(database_logfile, 'w') as fp:
            db_json = self.get_desc_database_details(db_name, cid, ec_id)
            fp.write(json.dumps(db_json) + '\n')
        os.makedirs(self.get_export_dir() + metastore_dir + db_name, exist_ok=True)
        self.log_all_tables(db_name, cid, ec_id, metastore_dir, failed_metastore_log_path,
                            success_metastore_log_path, current_iam, has_unicode)
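    # Illustrative only (table and role names are hypothetical): export_database() above
    # and export_hive_metastore() below share the same log layout in the export
    # directory. A successful export of sales_db.orders under an instance profile would
    # append a line like
    #
    #   {"table": "sales_db.orders", "iam": "arn:aws:iam::123456789012:instance-profile/migration"}
    #
    # to success_metastore.log, while failures land in failed_metastore.log as the raw
    # command response plus a "table" field, which retry_failed_metastore_export()
    # re-reads to retry each table under the remaining registered IAM roles.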
    def export_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', db_log='database_details.log',
                              success_log='success_metastore.log', fail_log='failed_metastore.log',
                              has_unicode=False):
        """
        Export the database properties and table DDL for every database in the hive metastore
        """
        start = timer()
        instance_profiles = self.get_instance_profiles_list()
        if cluster_name:
            cid = self.start_cluster_by_name(cluster_name)
            current_iam_role = self.get_iam_role_by_cid(cid)
        elif instance_profiles:
            # if any instance profile exists, start with it on the first cluster launched for the export
            current_iam_role = instance_profiles[0]
            cid = self.launch_cluster(iam_role=current_iam_role)
        else:
            current_iam_role = None
            cid = self.launch_cluster()
        end = timer()
        print("Cluster creation time: " + str(timedelta(seconds=end - start)))
        time.sleep(5)
        ec_id = self.get_execution_context(cid)
        # if the metastore failed / success log paths exist, clean up before re-running
        failed_metastore_log_path = self.get_export_dir() + fail_log
        success_metastore_log_path = self.get_export_dir() + success_log
        database_logfile = self.get_export_dir() + db_log
        if os.path.exists(failed_metastore_log_path):
            os.remove(failed_metastore_log_path)
        if os.path.exists(success_metastore_log_path):
            os.remove(success_metastore_log_path)
        all_dbs = self.get_all_databases(cid, ec_id)
        resp = self.set_desc_database_helper(cid, ec_id)
        if self.is_verbose():
            print(resp)
        with open(database_logfile, 'w') as fp:
            for db_name in all_dbs:
                os.makedirs(self.get_export_dir() + metastore_dir + db_name, exist_ok=True)
                db_json = self.get_desc_database_details(db_name, cid, ec_id)
                fp.write(json.dumps(db_json) + '\n')
                self.log_all_tables(db_name, cid, ec_id, metastore_dir, failed_metastore_log_path,
                                    success_metastore_log_path, current_iam_role, has_unicode)

        total_failed_entries = self.get_num_of_lines(failed_metastore_log_path)
        if (not self.is_skip_failed()) and self.is_aws() and total_failed_entries > 0:
            print("Retrying failed metastore export with registered IAM roles")
            remaining_iam_roles = instance_profiles[1:]
            self.retry_failed_metastore_export(cid, failed_metastore_log_path, remaining_iam_roles,
                                               success_metastore_log_path, has_unicode)
            print("Failed count before retry: " + str(total_failed_entries))
            print("Total Databases attempted export: " + str(len(all_dbs)))
        else:
            print("Failed count: " + str(total_failed_entries))
            print("Total Databases attempted export: " + str(len(all_dbs)))

    @staticmethod
    def get_num_of_lines(filename):
        if not os.path.exists(filename):
            return 0
        else:
            i = 0
            with open(filename) as fp:
                for line in fp:
                    i += 1
            return i

    @staticmethod
    def get_spark_ddl(table_ddl):
        """
        Formats the provided DDL into a spark.sql() command to run remotely
        """
        spark_ddl = 'spark.sql(""" {0} """)'.format(table_ddl)
        return spark_ddl

    @staticmethod
    def is_ddl_a_view(ddl_list):
        first_statement = ddl_list[0]
        if first_statement.startswith('CREATE VIEW'):
            return True
        return False

    def move_table_view(self, db_name, tbl_name, local_table_ddl, views_dir='metastore_views/'):
        metastore_view_dir = self.get_export_dir() + views_dir
        ddl_statement = self.get_ddl_by_keyword_group(local_table_ddl)
        if self.is_ddl_a_view(ddl_statement):
            dst_local_ddl = metastore_view_dir + db_name + '/' + tbl_name
            os.rename(local_table_ddl, dst_local_ddl)
            return True
        return False
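    # The import flow below applies table DDL in a first pass and defers anything
    # move_table_view() flags as a view (its exported DDL starts with "CREATE VIEW")
    # into metastore_views/<db>/, then re-applies those files in a second pass,
    # presumably because a view can only be created once the tables it references
    # already exist in the destination metastore.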
    def import_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', views_dir='metastore_views/',
                              has_unicode=False):
        """
        Apply the exported database and table DDL to the destination workspace
        """
        metastore_local_dir = self.get_export_dir() + metastore_dir
        metastore_view_dir = self.get_export_dir() + views_dir
        os.makedirs(metastore_view_dir, exist_ok=True)
        if cluster_name:
            cid = self.start_cluster_by_name(cluster_name)
        else:
            cid = self.launch_cluster()
        time.sleep(2)
        ec_id = self.get_execution_context(cid)
        # get local databases
        db_list = os.listdir(metastore_local_dir)
        # make a directory in the DBFS root bucket path for tmp data
        resp = self.post('/dbfs/mkdirs', {'path': '/tmp/migration/'})
        # iterate over the databases saved locally
        all_db_details_json = self.get_database_detail_dict()
        for db_name in db_list:
            # create a dir to host the view ddl if we find any
            os.makedirs(metastore_view_dir + db_name, exist_ok=True)
            # get the local database path to list tables
            local_db_path = metastore_local_dir + db_name
            # get a dict of the database attributes
            database_attributes = all_db_details_json.get(db_name, '')
            if not database_attributes:
                print(all_db_details_json)
                raise ValueError('Missing Database Attributes Log. Re-run metastore export')
            create_db_resp = self.create_database_db(db_name, ec_id, cid, database_attributes)
            db_path = database_attributes.get('Location')
            if os.path.isdir(local_db_path):
                # all databases should be directories, no files at this level
                # list all the tables in the database local dir
                tables = os.listdir(local_db_path)
                for tbl_name in tables:
                    # build the path for the table where the ddl is stored
                    print("Importing table {0}.{1}".format(db_name, tbl_name))
                    local_table_ddl = metastore_local_dir + db_name + '/' + tbl_name
                    if not self.move_table_view(db_name, tbl_name, local_table_ddl):
                        # we hit a table ddl here, so we apply the ddl
                        is_successful = self.apply_table_ddl(local_table_ddl, ec_id, cid, db_path, has_unicode)
                        print(is_successful)
                    else:
                        print(f'Moving view ddl to re-apply later: {db_name}.{tbl_name}')
            else:
                print("Error: Only databases should exist at this level: {0}".format(db_name))
            self.delete_dir_if_empty(metastore_view_dir + db_name)
        views_db_list = os.listdir(metastore_view_dir)
        for db_name in views_db_list:
            local_view_db_path = metastore_view_dir + db_name
            database_attributes = all_db_details_json.get(db_name, '')
            db_path = database_attributes.get('Location')
            if os.path.isdir(local_view_db_path):
                views = os.listdir(local_view_db_path)
                for view_name in views:
                    print("Importing view {0}.{1}".format(db_name, view_name))
                    local_view_ddl = metastore_view_dir + db_name + '/' + view_name
                    is_successful = self.apply_table_ddl(local_view_ddl, ec_id, cid, db_path, has_unicode)
                    print(is_successful)

    def get_all_databases(self, cid, ec_id):
        # submit the first command to find the number of databases
        # DBR 7.0 changes databaseName to namespace for the return value of show databases
        all_dbs_cmd = 'all_dbs = [x.databaseName for x in spark.sql("show databases").collect()]; print(len(all_dbs))'
        results = self.submit_command(cid, ec_id, all_dbs_cmd)
        if results['resultType'] != 'text':
            print(json.dumps(results) + '\n')
            raise ValueError("Cannot identify number of databases due to the above error")
        num_of_dbs = ast.literal_eval(results['data'])
        batch_size = 100  # batch size to iterate over databases
        num_of_buckets = (num_of_dbs // batch_size) + 1  # number of slices of the list to take

        all_dbs = []
        for m in range(0, num_of_buckets):
            db_slice = 'print(all_dbs[{0}:{1}])'.format(batch_size * m, batch_size * (m + 1))
            results = self.submit_command(cid, ec_id, db_slice)
            db_names = ast.literal_eval(results['data'])
            for db in db_names:
                all_dbs.append(db)
                print("Database: {0}".format(db))
        return all_dbs
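    # get_all_databases() above and log_all_tables() below share the same pattern for
    # pulling large lists back through the REST 1.2 command API, which only returns the
    # printed output of a remote command: bind the list to a variable on the cluster,
    # then fetch it in slices of 100 and parse each printed slice locally with
    # ast.literal_eval. The generated slice commands look like:
    #
    #   print(all_dbs[0:100])
    #   print(all_dbs[100:200])
    #
    # so a large metastore never has to fit into a single command result.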
    def log_all_tables(self, db_name, cid, ec_id, metastore_dir, err_log_path, success_log_path, iam,
                       has_unicode=False):
        all_tables_cmd = 'all_tables = [x.tableName for x in spark.sql("show tables in {0}").collect()]'.format(db_name)
        results = self.submit_command(cid, ec_id, all_tables_cmd)
        results = self.submit_command(cid, ec_id, 'print(len(all_tables))')
        num_of_tables = ast.literal_eval(results['data'])

        batch_size = 100  # batch size to iterate over tables
        num_of_buckets = (num_of_tables // batch_size) + 1  # number of slices of the list to take

        all_tables = []
        with open(success_log_path, 'a') as sfp:
            for m in range(0, num_of_buckets):
                tables_slice = 'print(all_tables[{0}:{1}])'.format(batch_size * m, batch_size * (m + 1))
                results = self.submit_command(cid, ec_id, tables_slice)
                table_names = ast.literal_eval(results['data'])
                for table_name in table_names:
                    print("Table: {0}".format(table_name))
                    is_successful = self.log_table_ddl(cid, ec_id, db_name, table_name, metastore_dir,
                                                       err_log_path, has_unicode)
                    if is_successful == 0:
                        print(f"Exported {db_name}.{table_name}")
                        success_item = {'table': f'{db_name}.{table_name}', 'iam': iam}
                        sfp.write(json.dumps(success_item))
                        sfp.write('\n')
                    else:
                        print("Logging failure")
        return True

    def log_table_ddl(self, cid, ec_id, db_name, table_name, metastore_dir, err_log_path, has_unicode):
        """
        Log the table DDL to handle large DDL text
        :param cid: cluster id
        :param ec_id: execution context id (rest api 1.2)
        :param db_name: database name
        :param table_name: table name
        :param metastore_dir: metastore export directory name
        :param err_log_path: log for errors
        :param has_unicode: export to a file if this flag is true
        :return: 0 for success, -1 for error
        """
        set_ddl_str_cmd = f'ddl_str = spark.sql("show create table {db_name}.{table_name}").collect()[0][0]'
        ddl_str_resp = self.submit_command(cid, ec_id, set_ddl_str_cmd)
        with open(err_log_path, 'a') as err_log:
            if ddl_str_resp['resultType'] != 'text':
                ddl_str_resp['table'] = '{0}.{1}'.format(db_name, table_name)
                err_log.write(json.dumps(ddl_str_resp) + '\n')
                return -1
            get_ddl_str_len = 'ddl_len = len(ddl_str); print(ddl_len)'
            len_resp = self.submit_command(cid, ec_id, get_ddl_str_len)
            ddl_len = int(len_resp['data'])
            if ddl_len <= 0:
                len_resp['table'] = '{0}.{1}'.format(db_name, table_name)
                err_log.write(json.dumps(len_resp) + '\n')
                return -1
        # if (len > 2k chars) OR (has unicode chars) then export to file
        table_ddl_path = self.get_export_dir() + metastore_dir + db_name + '/' + table_name
        if ddl_len > 2048 or has_unicode:
            # create the dbfs tmp path for exports / imports. no-op if exists
            resp = self.post('/dbfs/mkdirs', {'path': '/tmp/migration/'})
            # save the ddl to the tmp path on dbfs
            save_ddl_cmd = "with open('/dbfs/tmp/migration/tmp_export_ddl.txt', 'w') as fp: fp.write(ddl_str)"
            save_resp = self.submit_command(cid, ec_id, save_ddl_cmd)
            # read that data using the dbfs rest endpoint which can handle 2MB of text easily
            read_args = {'path': '/tmp/migration/tmp_export_ddl.txt'}
            read_resp = self.get('/dbfs/read', read_args)
            with open(table_ddl_path, "w") as fp:
                fp.write(base64.b64decode(read_resp.get('data')).decode('utf-8'))
            return 0
        else:
            export_ddl_cmd = 'print(ddl_str)'
            ddl_resp = self.submit_command(cid, ec_id, export_ddl_cmd)
            with open(table_ddl_path, "w") as fp:
                fp.write(ddl_resp.get('data'))
            return 0
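    # log_table_ddl() above and apply_table_ddl() earlier are the two halves of the same
    # workaround for command-result size limits. On export, DDL longer than 2048
    # characters (or containing unicode) is written remotely to
    # /dbfs/tmp/migration/tmp_export_ddl.txt and pulled back with GET /dbfs/read, whose
    # "data" field is base64 encoded, hence the b64decode above. On import, DDL larger
    # than 1024 bytes (or containing unicode) is pushed up with POST /dbfs/put and
    # executed remotely by reading the file back and passing it to spark.sql().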
    def retry_failed_metastore_export(self, cid, failed_metastore_log_path, iam_roles_list, success_metastore_log_path,
                                      has_unicode, metastore_dir='metastore/'):
        # instance profiles are AWS-only, so retries are only possible with other registered IAM roles there
        if self.is_aws() and iam_roles_list:
            do_instance_profile_exist = True
        else:
            do_instance_profile_exist = False
        # get total failed entries
        total_failed_entries = self.get_num_of_lines(failed_metastore_log_path)
        if do_instance_profile_exist:
            print("Instance profiles exist, retrying export of failed tables with each instance profile")
            err_log_list = []
            with open(failed_metastore_log_path, 'r') as err_log:
                for table in err_log:
                    err_log_list.append(table)

            with open(success_metastore_log_path, 'a') as sfp:
                for iam_role in iam_roles_list:
                    self.edit_cluster(cid, iam_role)
                    ec_id = self.get_execution_context(cid)
                    # iterate over a copy since successfully retried entries are removed from the list in place
                    for table in list(err_log_list):
                        table_json = json.loads(table)
                        db_name = table_json['table'].split(".")[0]
                        table_name = table_json['table'].split(".")[1]

                        is_successful = self.log_table_ddl(cid, ec_id, db_name, table_name, metastore_dir,
                                                           failed_metastore_log_path, has_unicode)
                        if is_successful == 0:
                            err_log_list.remove(table)
                            print(f"Exported {db_name}.{table_name}")
                            success_item = {'table': f'{db_name}.{table_name}', 'iam': iam_role}
                            sfp.write(json.dumps(success_item))
                            sfp.write('\n')
                        else:
                            print('Failed to get ddl for {0}.{1} with iam role {2}'.format(db_name, table_name,
                                                                                           iam_role))

            os.remove(failed_metastore_log_path)
            with open(failed_metastore_log_path, 'w') as fm:
                for table in err_log_list:
                    fm.write(table)
            failed_count_after_retry = self.get_num_of_lines(failed_metastore_log_path)
            print("Failed count after retry: " + str(failed_count_after_retry))
        else:
            print("No registered instance profiles to retry export")

    def report_legacy_tables_to_fix(self, metastore_dir='metastore/', fix_table_log='repair_tables.log'):
        metastore_local_dir = self.get_export_dir() + metastore_dir
        fix_log = self.get_export_dir() + fix_table_log
        db_list = os.listdir(metastore_local_dir)
        num_of_tables = 0
        with open(fix_log, 'w') as fp:
            for db_name in db_list:
                local_db_path = metastore_local_dir + db_name
                if os.path.isdir(local_db_path):
                    # all databases should be directories, no files at this level
                    # list all the tables in the database local dir
                    tables = os.listdir(local_db_path)
                    for tbl_name in tables:
                        local_table_ddl = local_db_path + '/' + tbl_name
                        if self.is_legacy_table_partitioned(local_table_ddl):
                            num_of_tables += 1
                            print(f'Table needs repair: {db_name}.{tbl_name}')
                            fp.write(f'{db_name}.{tbl_name}\n')
        # once completed, check if the repair log has any entries
        log_size = os.stat(fix_log).st_size
        if log_size > 0:
            # repair log exists, upload to the platform to repair these tables
            print(f"Total number of tables needing repair: {num_of_tables}")
            dbfs_path = '/tmp/migration/repair_ddl.log'
            print(f"Uploading repair log to DBFS: {dbfs_path}")
            path_args = {'path': dbfs_path, 'overwrite': 'true'}
            file_content_json = {'files': open(fix_log, 'r')}
            put_resp = self.post('/dbfs/put', path_args, files_json=file_content_json)
            if self.is_verbose():
                print(put_resp)
        else:
            os.remove(fix_log)
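    # The repair log uploaded above lists one <db>.<table> entry per line. Non-Delta
    # tables that are partitioned lose their partition metadata when re-created from
    # exported DDL, so each listed table presumably needs its partitions recovered on
    # the destination workspace (for example with MSCK REPAIR TABLE) before it is
    # fully queryable.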
    def is_legacy_table_partitioned(self, table_local_path):
        if not self.is_delta_table(table_local_path):
            ddl_group = self.get_ddl_by_keyword_group(table_local_path)
            for kw in ddl_group:
                kw_lower = kw.lower()
                if kw_lower.startswith('partitioned by'):
                    return True
        return False
--------------------------------------------------------------------------------