├── .gitignore ├── __init__.py ├── data ├── default_jobs_cluster_aws.json ├── default_jobs_cluster_azure.json ├── azure_cluster.json ├── aws_cluster.json ├── repair_tables_for_migration.py └── workspace_migration_analysis.py ├── dbclient ├── __init__.py ├── LibraryClient.py ├── DbfsClient.py ├── SecretsClient.py ├── JobsClient.py ├── dbclient.py ├── parser.py ├── ScimClient.py ├── ClustersClient.py ├── WorkspaceClient.py └── HiveClient.py ├── setup.py ├── test_connection.py ├── METASTORE.md ├── LICENSE ├── import_db.py ├── export_db.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | venv 3 | logs 4 | user_logs 5 | azure_logs 6 | __pycache__ 7 | dbclient/*.pyc 8 | build/ 9 | databricks_migration.egg-info/ 10 | dist/ 11 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # outer __init__.py 2 | from dbclient import * 3 | from timeit import default_timer as timer 4 | from datetime import timedelta 5 | from os import makedirs, path 6 | from datetime import datetime 7 | -------------------------------------------------------------------------------- /data/default_jobs_cluster_aws.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 8, 3 | "spark_version": "7.3.x-scala2.12", 4 | "node_type_id": "i3.xlarge", 5 | "spark_env_vars": { 6 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /data/default_jobs_cluster_azure.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 8, 3 | "spark_version": "7.3.x-scala2.12", 4 | "node_type_id": "Standard_DS3_v2", 5 | "spark_env_vars": { 6 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /dbclient/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | from .dbclient import dbclient 3 | from .ClustersClient import ClustersClient 4 | from .JobsClient import JobsClient 5 | from .DbfsClient import DbfsClient 6 | from .ScimClient import ScimClient 7 | from .LibraryClient import LibraryClient 8 | from .WorkspaceClient import WorkspaceClient 9 | from .HiveClient import HiveClient 10 | from .SecretsClient import SecretsClient 11 | from .parser import * 12 | -------------------------------------------------------------------------------- /data/azure_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 1, 3 | "cluster_name": "API_Metastore_Work_Leave_Me_Alone", 4 | "spark_version": "7.3.x-scala2.12", 5 | "spark_conf": {}, 6 | "node_type_id": "Standard_D8_v3", 7 | "ssh_public_keys": [], 8 | "custom_tags": {}, 9 | "spark_env_vars": { 10 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 11 | }, 12 | "autotermination_minutes": 20, 13 | "init_scripts": [] 14 | } 15 | -------------------------------------------------------------------------------- /data/aws_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_workers": 1, 3 | "cluster_name": "API_Metastore_Work_Leave_Me_Alone", 4 | "spark_version": "7.3.x-scala2.12", 5 | "aws_attributes": { 6 | "first_on_demand": 1, 7 | 
"availability": "SPOT_WITH_FALLBACK", 8 | "zone_id": "us-west-2b", 9 | "spot_bid_price_percent": 100, 10 | "ebs_volume_count": 0 11 | }, 12 | "driver_node_type_id": "i3.xlarge", 13 | "node_type_id": "i3.xlarge", 14 | "spark_env_vars": { 15 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 16 | }, 17 | "autotermination_minutes": 20 18 | } 19 | -------------------------------------------------------------------------------- /data/repair_tables_for_migration.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | migration_log = '/dbfs/tmp/migration/repair_ddl.log' 3 | 4 | num_of_tables = 0 5 | with open(migration_log, 'r') as fp: 6 | for line in fp: 7 | # this is the db_name.tbl_name value 8 | fqdn_table = line.rstrip() 9 | fix_sql_statement = f"MSCK REPAIR TABLE {fqdn_table}" 10 | print(fix_sql_statement) 11 | df = spark.sql(fix_sql_statement) 12 | num_of_tables += 1 13 | 14 | # COMMAND ---------- 15 | 16 | print(f"Total number of tables repaired {num_of_tables}") 17 | 18 | # COMMAND ---------- 19 | 20 | dbutils.fs.rm('/tmp/migration/repair_ddl.log') 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="databricks-migration", # Replace with your own username 8 | version="0.0.2", 9 | author="Miklos C", 10 | author_email="mwc@databricks.com", 11 | description="Databricks Migration scripts", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/mrchristine/db-migration", 15 | license="http://www.apache.org/licenses/LICENSE-2.0", 16 | packages=setuptools.find_packages(), 17 | install_requires=[ 18 | 'cron-descriptor', 19 | 'requests' 20 | ], 21 | py_modules=["export_db","import_db","test_connection"], 22 | classifiers=[ 23 | "Programming Language :: Python :: 3", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | ], 27 | python_requires='>=3.6', 28 | ) 29 | -------------------------------------------------------------------------------- /test_connection.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | import sys, requests 3 | 4 | # python 3.6 5 | 6 | def main(): 7 | # define a parser to identify what component to import / export 8 | parser = get_export_parser() 9 | # parse the args 10 | args = parser.parse_args() 11 | p = args.profile 12 | 13 | # parse the path location of the Databricks CLI configuration 14 | login_args = get_login_credentials(profile=p) 15 | 16 | # parse the credentials 17 | url = login_args['host'] 18 | token = login_args['token'] 19 | client_config = build_client_config(url, token, args) 20 | 21 | print("Test connection at {0} with profile {1}\n".format(url, args.profile)) 22 | db_client = dbclient(client_config) 23 | try: 24 | is_successful = db_client.test_connection() 25 | except requests.exceptions.RequestException as e: 26 | print(e) 27 | print("\nUnsuccessful connection. Verify credentials.\n") 28 | sys.exit(1) 29 | if is_successful == 0: 30 | print("Connection successful!") 31 | else: 32 | print("\nUnsuccessful connection. 
Verify credentials.\n") 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /dbclient/LibraryClient.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dbclient import * 3 | 4 | 5 | class LibraryClient(dbclient): 6 | 7 | def get_cluster_list(self, alive=True): 8 | """ Returns an array of json objects for the running clusters. Grab the cluster_name or cluster_id """ 9 | cl = self.get("/clusters/list", print_json=False) 10 | if alive: 11 | running = filter(lambda x: x['state'] == "RUNNING", cl['clusters']) 12 | return list(running) 13 | else: 14 | return cl['clusters'] 15 | 16 | def log_library_details(self, log_file='lib_details.log'): 17 | libs_log = self.get_export_dir() + log_file 18 | all_libs = self.get('/libraries/list', version='1.2') 19 | with open(libs_log, "w") as fp: 20 | for x in all_libs.get('elements', None): 21 | lib_details = self.get('/libraries/status?libraryId={0}'.format(x['id']), version='1.2') 22 | fp.write(json.dumps(lib_details) + '\n') 23 | 24 | def log_cluster_libs(self, cl_log_file='attached_cluster_libs.log'): 25 | cl_lib_log = self.get_export_dir() + cl_log_file 26 | cl = self.get_cluster_list(False) 27 | with open(cl_lib_log, "w") as fp: 28 | for x in cl: 29 | cid = x['cluster_id'] 30 | libs = self.get("/libraries/cluster-status?cluster_id={0}".format(cid)) 31 | fp.write(json.dumps(libs)) 32 | fp.write("\n") 33 | -------------------------------------------------------------------------------- /METASTORE.md: -------------------------------------------------------------------------------- 1 | # Databricks Metastore Migration 2 | 3 | This document discusses the metastore migration options and process. 4 | 5 | 1. Export the metastore DDL 6 | 2. Import the metastore DDL 7 | a. The tool will import `TABLES` first 8 | b. The tool will sideline `VIEWS` to be applied after all tables are created. Views will be sidelined into 9 | `metastore_views/` directory in the export directory. 10 | c. The tool will import all `VIEWS` 11 | 3. Copy the underlying DBFS / root table data. Databricks support team will need to help with this step. 12 | 4. Report on legacy table DDLs to be repaired within the new workspace and metastore. 13 | a. Use the `--get-repair-log` option with the import tool. This will generate a list of tables that need to be 14 | repaired. The most common case of this is to register hive partitions within the metastore. 15 | b. The repair option will upload a list of tables to be repaired, and users can use the notebook included in this 16 | repo, `data/repair_tables_for_migration.py`, to run this operation. 17 | 18 | 19 | **Recommendation / Caveats:** 20 | 1. Use the `--metastore-unicode` option to export and import if you do not know if tables contain unicode characters. 21 | This should be applied to both export and import operations. 22 | 2. Use DBR 6.x / Spark 2.x releases if you have legacy table definitions. 23 | Spark 3.x deprecates `SERDE` support and can cause import issues if you require those tables to use `SERDE` 24 | definitions. 25 | 3. If you manually register table partitions using `ALTER TABLE table_name ADD PARTITION ()` to tables, you will need 26 | to manually report and add these partitions. The tool does not support this today. 
27 | Or if you need to drop partitions, you can use `ALTER TABLE table_name DROP PARTITION ()` 28 | -------------------------------------------------------------------------------- /dbclient/DbfsClient.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | import os 4 | import time 5 | from datetime import timedelta 6 | from timeit import default_timer as timer 7 | 8 | from dbclient import * 9 | 10 | 11 | class DbfsClient(ClustersClient): 12 | 13 | @staticmethod 14 | def get_num_of_lines(fname): 15 | if not os.path.exists(fname): 16 | return 0 17 | else: 18 | i = 0 19 | with open(fname) as fp: 20 | for line in fp: 21 | i += 1 22 | return i 23 | 24 | def export_dbfs_mounts(self): 25 | # check if instance profile exists, ask users to use --users first or enter yes to proceed. 26 | start = timer() 27 | cid = self.launch_cluster() 28 | end = timer() 29 | print("Cluster creation time: " + str(timedelta(seconds=end - start))) 30 | time.sleep(5) 31 | ec_id = self.get_execution_context(cid) 32 | 33 | # get all dbfs mount metadata 34 | dbfs_mount_logfile = self.get_export_dir() + 'dbfs_mounts.log' 35 | all_mounts_cmd = 'all_mounts = [{"path": x.mountPoint, "source": x.source, ' \ 36 | '"encryptionType": x.encryptionType} for x in dbutils.fs.mounts()]' 37 | results = self.submit_command(cid, ec_id, all_mounts_cmd) 38 | results = self.submit_command(cid, ec_id, 'print(len(all_mounts))') 39 | # grab the number of mounts to bucket / batch the export 40 | num_of_mounts = ast.literal_eval(results['data']) 41 | 42 | batch_size = 100 # batch size to iterate over databases 43 | num_of_buckets = (num_of_mounts // batch_size) + 1 # number of slices of the list to take 44 | 45 | with open(dbfs_mount_logfile, 'w') as fp_log: 46 | for m in range(0, num_of_buckets): 47 | mounts_slice = 'print(all_mounts[{0}:{1}])'.format(batch_size*m, batch_size*(m+1)) 48 | results = self.submit_command(cid, ec_id, mounts_slice) 49 | mounts_slice_data = ast.literal_eval(results['data']) 50 | for mount_path in mounts_slice_data: 51 | print("Mounts: {0}".format(mount_path)) 52 | fp_log.write(json.dumps(mount_path)) 53 | fp_log.write('\n') 54 | return True 55 | -------------------------------------------------------------------------------- /dbclient/SecretsClient.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | import os 3 | import time 4 | from timeit import default_timer as timer 5 | 6 | 7 | class SecretsClient(ClustersClient): 8 | 9 | def get_secret_scopes_list(self): 10 | scopes_list = self.get('/secrets/scopes/list').get('scopes', []) 11 | return scopes_list 12 | 13 | def get_secrets(self, scope_name): 14 | secrets_list = self.get('/secrets/list', {'scope': scope_name}).get('secrets', []) 15 | return secrets_list 16 | 17 | def get_secret_value(self, scope_name, secret_key, cid, ec_id): 18 | cmd_set_value = f"value = dbutils.secrets.get(scope = '{scope_name}', key = '{secret_key}')" 19 | cmd_convert_b64 = "import base64; b64_value = base64.b64encode(value.encode('ascii'))" 20 | cmd_get_b64 = "print(b64_value.decode('ascii'))" 21 | results_set = self.submit_command(cid, ec_id, cmd_set_value) 22 | results_convert = self.submit_command(cid, ec_id, cmd_convert_b64) 23 | results_get = self.submit_command(cid, ec_id, cmd_get_b64) 24 | if results_set['resultType'] == 'error' \ 25 | or results_convert['resultType'] == 'error'\ 26 | or results_get['resultType'] == 'error': 27 | print("Error:") 28 | 
print(results_set) 29 | print(results_convert) 30 | print(results_get) 31 | s_value = results_get.get('data') 32 | return s_value 33 | 34 | def log_all_secrets(self, cluster_name, log_dir='secret_scopes/'): 35 | scopes_dir = self.get_export_dir() + log_dir 36 | scopes_list = self.get_secret_scopes_list() 37 | os.makedirs(scopes_dir, exist_ok=True) 38 | start = timer() 39 | cid = self.start_cluster_by_name(cluster_name) 40 | time.sleep(5) 41 | ec_id = self.get_execution_context(cid) 42 | for scope_json in scopes_list: 43 | scope_name = scope_json.get('name') 44 | secrets_list = self.get_secrets(scope_name) 45 | scopes_logfile = scopes_dir + scope_name 46 | with open(scopes_logfile, 'w') as fp: 47 | for secret_json in secrets_list: 48 | secret_name = secret_json.get('key') 49 | b64_value = self.get_secret_value(scope_name, secret_name, cid, ec_id) 50 | s_json = {'name': secret_name, 'value': b64_value} 51 | fp.write(json.dumps(s_json) + '\n') 52 | 53 | def log_all_secrets_acls(self, log_name='secret_scopes_acls.log'): 54 | acls_file = self.get_export_dir() + log_name 55 | scopes_list = self.get_secret_scopes_list() 56 | with open(acls_file, 'w') as fp: 57 | for scope_json in scopes_list: 58 | scope_name = scope_json.get('name', None) 59 | resp = self.get('/secrets/acls/list', {'scope': scope_name}) 60 | resp['scope_name'] = scope_name 61 | fp.write(json.dumps(resp) + '\n') -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Workspace Migration Tool 2 | 3 | Copyright (2019) Databricks, Inc. 4 | 5 | This library (the "Software") may not be used except in connection with the Licensee's use of the Databricks Platform Services pursuant 6 | to an Agreement (defined below) between Licensee (defined below) and Databricks, Inc. ("Databricks"). The Object Code version of the 7 | Software shall be deemed part of the Downloadable Services under the Agreement, or if the Agreement does not define Downloadable Services, 8 | Subscription Services, or if neither are defined then the term in such Agreement that refers to the applicable Databricks Platform 9 | Services (as defined below) shall be substituted herein for “Downloadable Services.” Licensee's use of the Software must comply at 10 | all times with any restrictions applicable to the Downlodable Services and Subscription Services, generally, and must be used in 11 | accordance with any applicable documentation. For the avoidance of doubt, the Software constitutes Databricks Confidential Information 12 | under the Agreement. 13 | 14 | Additionally, and notwithstanding anything in the Agreement to the contrary: 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 16 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 17 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 18 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | * you may view, make limited copies of, and may compile the Source Code version of the Software into an Object Code version of the 20 | Software. 
For the avoidance of doubt, you may not make derivative works of Software (or make any any changes to the Source Code 21 | version of the unless you have agreed to separate terms with Databricks permitting such modifications (e.g., a contribution license 22 | agreement)). 23 | 24 | If you have not agreed to an Agreement or otherwise do not agree to these terms, you may not use the Software or view, copy or compile 25 | the Source Code of the Software. 26 | 27 | This license terminates automatically upon the termination of the Agreement or Licensee's breach of these terms. Additionally, 28 | Databricks may terminate this license at any time on notice. Upon termination, you must permanently delete the Software and all 29 | copies thereof (including the Source Code). 30 | 31 | Agreement: the agreement between Databricks and Licensee governing the use of the Databricks Platform Services, which shall be, with 32 | respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with respect to Databricks 33 | Community Edition, the Community Edition Terms of Service located at www.databricks.com/ce-termsofuse, in each case unless Licensee 34 | has entered into a separate written agreement with Databricks governing the use of the applicable Databricks Platform Services. 35 | 36 | Databricks Platform Services: the Databricks services or the Databricks Community Edition services, according to where the Software is used. 37 | 38 | Licensee: the user of the Software, or, if the Software is being used on behalf of a company, the company. 39 | 40 | Object Code: is version of the Software produced when an interpreter or a compiler translates the Source Code into recognizable and 41 | executable machine code. 42 | 43 | Source Code: the human readable portion of the Software. 44 | 45 | -------------------------------------------------------------------------------- /import_db.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | from timeit import default_timer as timer 3 | from datetime import timedelta, datetime 4 | from os import makedirs 5 | 6 | 7 | # python 3.6 8 | def main(): 9 | # define a parser to identify what component to import / export 10 | my_parser = get_import_parser() 11 | # parse the args 12 | args = my_parser.parse_args() 13 | 14 | # parse the path location of the Databricks CLI configuration 15 | login_args = get_login_credentials(profile=args.profile) 16 | if is_azure_creds(login_args) and (not args.azure): 17 | raise ValueError('Login credentials do not match args. 
Please provide --azure flag for azure environments.') 18 | 19 | # cant use netrc credentials because requests module tries to load the credentials into http basic auth headers 20 | url = login_args['host'] 21 | token = login_args['token'] 22 | client_config = build_client_config(url, token, args) 23 | 24 | makedirs(client_config['export_dir'], exist_ok=True) 25 | 26 | if client_config['debug']: 27 | print(url, token) 28 | now = str(datetime.now()) 29 | 30 | if args.users: 31 | print("Import all users and groups at {0}".format(now)) 32 | scim_c = ScimClient(client_config) 33 | if client_config['is_aws']: 34 | print("Start import of instance profiles first to ensure they exist...") 35 | cl_c = ClustersClient(client_config) 36 | start = timer() 37 | cl_c.import_instance_profiles() 38 | end = timer() 39 | print("Complete Instance Profile Import Time: " + str(timedelta(seconds=end - start))) 40 | start = timer() 41 | scim_c.import_all_users_and_groups() 42 | end = timer() 43 | print("Complete Users and Groups Import Time: " + str(timedelta(seconds=end - start))) 44 | 45 | if args.workspace: 46 | print("Import the complete workspace at {0}".format(now)) 47 | print("Import on {0}".format(url)) 48 | ws_c = WorkspaceClient(client_config) 49 | start = timer() 50 | # log notebooks and libraries 51 | ws_c.import_all_workspace_items(archive_missing=args.archive_missing, 52 | restart_from_last=args.restart_from_checkpoint) 53 | end = timer() 54 | print("Complete Workspace Import Time: " + str(timedelta(seconds=end - start))) 55 | 56 | if args.workspace_top_level: 57 | print("Import the top level workspace items at {0}".format(now)) 58 | print("Import on {0}".format(url)) 59 | ws_c = WorkspaceClient(client_config) 60 | start = timer() 61 | # log notebooks and libraries 62 | ws_c.import_current_workspace_items() 63 | end = timer() 64 | print("Complete Workspace Import Time: " + str(timedelta(seconds=end - start))) 65 | 66 | if args.workspace_acls: 67 | print("Import workspace ACLs at {0}".format(now)) 68 | print("Import on {0}".format(url)) 69 | ws_c = WorkspaceClient(client_config) 70 | start = timer() 71 | # log notebooks and libraries 72 | ws_c.import_workspace_acls() 73 | end = timer() 74 | print("Complete Workspace acl Import Time: " + str(timedelta(seconds=end - start))) 75 | 76 | if args.clusters: 77 | print("Import all cluster configs at {0}".format(now)) 78 | cl_c = ClustersClient(client_config) 79 | if client_config['is_aws']: 80 | print("Start import of instance profiles ...") 81 | start = timer() 82 | cl_c.import_instance_profiles() 83 | end = timer() 84 | print("Complete Instance Profile Import Time: " + str(timedelta(seconds=end - start))) 85 | print("Start import of cluster policies ...") 86 | start = timer() 87 | cl_c.import_cluster_policies() 88 | end = timer() 89 | print("Complete Cluster Policies Creation Time: " + str(timedelta(seconds=end - start))) 90 | print("Start import of instance pool configurations ...") 91 | start = timer() 92 | cl_c.import_instance_pools() 93 | end = timer() 94 | print("Complete Instance Pools Creation Time: " + str(timedelta(seconds=end - start))) 95 | print("Start import of cluster configurations ...") 96 | start = timer() 97 | cl_c.import_cluster_configs() 98 | end = timer() 99 | print("Complete Cluster Import Time: " + str(timedelta(seconds=end - start))) 100 | 101 | if args.jobs: 102 | print("Importing the jobs configs at {0}".format(now)) 103 | start = timer() 104 | jobs_c = JobsClient(client_config) 105 | jobs_c.import_job_configs() 106 | end = timer() 
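        # import_job_configs() recreates each job under a temporary name of the form
        # "<old_name>:::<old_job_id>" so duplicate job names can be disambiguated; JobsClient then
        # strips the ":::<id>" suffix via update_imported_job_names(). A minimal post-import sanity
        # check could be (sketch only, reusing the jobs_c client defined above, not part of the tool):
        #     for job in jobs_c.get_jobs_list():
        #         assert ':::' not in job['settings']['name']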
107 | print("Complete Jobs Export Time: " + str(timedelta(seconds=end - start))) 108 | 109 | if args.metastore or args.metastore_unicode: 110 | print("Importing the metastore configs at {0}".format(now)) 111 | start = timer() 112 | hive_c = HiveClient(client_config) 113 | # log job configs 114 | hive_c.import_hive_metastore(cluster_name=args.cluster_name, has_unicode=args.metastore_unicode) 115 | end = timer() 116 | print("Complete Metastore Import Time: " + str(timedelta(seconds=end - start))) 117 | 118 | if args.pause_all_jobs: 119 | print("Pause all current jobs {0}".format(now)) 120 | start = timer() 121 | jobs_c = JobsClient(client_config) 122 | # log job configs 123 | jobs_c.pause_all_jobs() 124 | end = timer() 125 | print("Paused all jobs time: " + str(timedelta(seconds=end - start))) 126 | 127 | if args.unpause_all_jobs: 128 | print("Unpause all current jobs {0}".format(now)) 129 | start = timer() 130 | jobs_c = JobsClient(client_config) 131 | # log job configs 132 | jobs_c.pause_all_jobs(False) 133 | end = timer() 134 | print("Unpaused all jobs time: " + str(timedelta(seconds=end - start))) 135 | 136 | if args.delete_all_jobs: 137 | print("Delete all current jobs {0}".format(now)) 138 | start = timer() 139 | jobs_c = JobsClient(client_config) 140 | url = jobs_c.get_url() 141 | response = prompt_for_input(f'\nPlease confirm that you would like to delete jobs from {url} [yes/no]:') 142 | if response: 143 | print("Deleting all job configs ... ") 144 | jobs_c.delete_all_jobs() 145 | end = timer() 146 | print("Delete all jobs time: " + str(timedelta(seconds=end - start))) 147 | 148 | if args.single_user: 149 | user_email = args.single_user 150 | print(f"Import user {user_email} at {now}") 151 | scim_c = ScimClient(client_config) 152 | start = timer() 153 | # log all users 154 | scim_c.import_single_user(user_email) 155 | end = timer() 156 | print("Complete single user import: " + str(timedelta(seconds=end - start))) 157 | 158 | if args.import_home: 159 | username = args.import_home 160 | print("Importing home directory: {0}".format(username)) 161 | ws_c = WorkspaceClient(client_config) 162 | start = timer() 163 | # log notebooks and libraries 164 | ws_c.import_user_home(username, 'user_exports') 165 | end = timer() 166 | print("Complete Single User Import Time: " + str(timedelta(seconds=end - start))) 167 | 168 | if args.import_groups: 169 | print("Importing Groups from logs") 170 | start = timer() 171 | scim_c = ScimClient(client_config) 172 | scim_c.import_all_users_and_groups() 173 | user_names = scim_c.get_users_from_log() 174 | print('Export users notebooks:', user_names) 175 | ws_c = WorkspaceClient(client_config) 176 | for username in user_names: 177 | ws_c.import_user_home(username, 'user_exports') 178 | jobs_c = JobsClient(client_config) 179 | # this will only import the groups jobs since we're filtering the jobs during the export process 180 | print('Importing the groups members jobs:') 181 | jobs_c.import_job_configs() 182 | end = timer() 183 | print("Complete User Export Time: " + str(timedelta(seconds=end - start))) 184 | 185 | if args.libs: 186 | start = timer() 187 | print("Not supported today") 188 | end = timer() 189 | # print("Complete Library Import Time: " + str(timedelta(seconds=end - start))) 190 | 191 | if args.get_repair_log: 192 | print("Finding partitioned tables to repair at {0}".format(now)) 193 | start = timer() 194 | hive_c = HiveClient(client_config) 195 | # log job configs 196 | hive_c.report_legacy_tables_to_fix() 197 | end = timer() 198 | 
print("Complete Report Time: " + str(timedelta(seconds=end - start))) 199 | 200 | 201 | if __name__ == '__main__': 202 | main() 203 | -------------------------------------------------------------------------------- /dbclient/JobsClient.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dbclient import * 4 | 5 | 6 | class JobsClient(ClustersClient): 7 | 8 | def get_jobs_default_cluster_conf(self): 9 | if self.is_aws(): 10 | cluster_json_file = 'data/default_jobs_cluster_aws.json' 11 | else: 12 | cluster_json_file = 'data/default_jobs_cluster_azure.json' 13 | with open(cluster_json_file, 'r') as fp: 14 | cluster_json = json.loads(fp.read()) 15 | return cluster_json 16 | 17 | def get_jobs_list(self, print_json=False): 18 | """ Returns an array of json objects for jobs """ 19 | jobs = self.get("/jobs/list", print_json) 20 | return jobs.get('jobs', []) 21 | 22 | def get_job_id_by_name(self): 23 | """ 24 | get a dict mapping of job name to job id for the new job ids 25 | :return: 26 | """ 27 | jobs = self.get_jobs_list() 28 | job_ids = {} 29 | for job in jobs: 30 | job_ids[job['settings']['name']] = job['job_id'] 31 | return job_ids 32 | 33 | def update_imported_job_names(self): 34 | # loop through and update the job names to remove the custom delimiter + job_id suffix 35 | current_jobs_list = self.get_jobs_list() 36 | for job in current_jobs_list: 37 | job_id = job['job_id'] 38 | job_name = job['settings']['name'] 39 | # job name was set to `old_job_name:::{job_id}` to support duplicate job names 40 | # we need to parse the old job name and update the current jobs 41 | old_job_name = job_name.split(':::')[0] 42 | new_settings = {'name': old_job_name} 43 | update_args = {'job_id': job_id, 'new_settings': new_settings} 44 | print('Updating job name:', update_args) 45 | resp = self.post('/jobs/update', update_args) 46 | print(resp) 47 | 48 | def log_job_configs(self, users_list=[], log_file='jobs.log', acl_file='acl_jobs.log'): 49 | """ 50 | log all job configs and the ACLs for each job 51 | :param users_list: a list of users / emails to filter the results upon (optional for group exports) 52 | :param log_file: log file to store job configs as json entries per line 53 | :param acl_file: log file to store job ACLs 54 | :return: 55 | """ 56 | jobs_log = self.get_export_dir() + log_file 57 | acl_jobs_log = self.get_export_dir() + acl_file 58 | # pinned by cluster_user is a flag per cluster 59 | jl_full = self.get_jobs_list(False) 60 | if users_list: 61 | # filter the jobs list to only contain users that exist within this list 62 | jl = list(filter(lambda x: x['creator_user_name'] in users_list, jl_full)) 63 | else: 64 | jl = jl_full 65 | with open(jobs_log, "w") as log_fp, open(acl_jobs_log, 'w') as acl_fp: 66 | for x in jl: 67 | job_id = x['job_id'] 68 | new_job_name = x['settings']['name'] + ':::' + str(job_id) 69 | # grab the settings obj 70 | job_settings = x['settings'] 71 | # update the job name 72 | job_settings['name'] = new_job_name 73 | # reset the original struct with the new settings 74 | x['settings'] = job_settings 75 | log_fp.write(json.dumps(x) + '\n') 76 | job_perms = self.get(f'/preview/permissions/jobs/{job_id}') 77 | job_perms['job_name'] = new_job_name 78 | acl_fp.write(json.dumps(job_perms) + '\n') 79 | 80 | def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log'): 81 | jobs_log = self.get_export_dir() + log_file 82 | acl_jobs_log = self.get_export_dir() + acl_file 83 | if not 
os.path.exists(jobs_log): 84 | print("No job configurations to import.") 85 | return 86 | # get an old cluster id to new cluster id mapping object 87 | cluster_mapping = self.get_cluster_id_mapping() 88 | old_2_new_policy_ids = self.get_new_policy_id_dict() # dict { old_policy_id : new_policy_id } 89 | with open(jobs_log, 'r') as fp: 90 | for line in fp: 91 | job_conf = json.loads(line) 92 | job_creator = job_conf.get('creator_user_name', '') 93 | job_settings = job_conf['settings'] 94 | job_schedule = job_settings.get('schedule', None) 95 | if job_schedule: 96 | # set all imported jobs as paused 97 | job_schedule['pause_status'] = 'PAUSED' 98 | job_settings['schedule'] = job_schedule 99 | if 'existing_cluster_id' in job_settings: 100 | old_cid = job_settings['existing_cluster_id'] 101 | # set new cluster id for existing cluster attribute 102 | new_cid = cluster_mapping.get(old_cid, None) 103 | if not new_cid: 104 | print("Existing cluster has been removed. Resetting job to use new cluster.") 105 | job_settings.pop('existing_cluster_id') 106 | job_settings['new_cluster'] = self.get_jobs_default_cluster_conf() 107 | else: 108 | job_settings['existing_cluster_id'] = new_cid 109 | else: # new cluster config 110 | cluster_conf = job_settings['new_cluster'] 111 | if 'policy_id' in cluster_conf: 112 | old_policy_id = cluster_conf['policy_id'] 113 | cluster_conf['policy_id'] = old_2_new_policy_ids[old_policy_id] 114 | # check for instance pools and modify cluster attributes 115 | if 'instance_pool_id' in cluster_conf: 116 | new_cluster_conf = self.cleanup_cluster_pool_configs(cluster_conf, job_creator, True) 117 | else: 118 | new_cluster_conf = cluster_conf 119 | job_settings['new_cluster'] = new_cluster_conf 120 | print("Current Job Name: {0}".format(job_conf['settings']['name'])) 121 | # creator can be none if the user is no longer in the org. 
see our docs page 122 | creator_user_name = job_conf.get('creator_user_name', None) 123 | create_resp = self.post('/jobs/create', job_settings) 124 | if 'error_code' in create_resp: 125 | print("Resetting job to use default cluster configs due to expired configurations.") 126 | job_settings['new_cluster'] = self.get_jobs_default_cluster_conf() 127 | create_resp_retry = self.post('/jobs/create', job_settings) 128 | # update the jobs with their ACLs 129 | with open(acl_jobs_log, 'r') as acl_fp: 130 | job_id_by_name = self.get_job_id_by_name() 131 | for line in acl_fp: 132 | acl_conf = json.loads(line) 133 | current_job_id = job_id_by_name[acl_conf['job_name']] 134 | job_path = f'jobs/{current_job_id}' # contains `/jobs/{job_id}` path 135 | api = f'/preview/permissions/{job_path}' 136 | # get acl permissions for jobs 137 | acl_perms = self.build_acl_args(acl_conf['access_control_list'], True) 138 | acl_create_args = {'access_control_list': acl_perms} 139 | acl_resp = self.patch(api, acl_create_args) 140 | print(acl_resp) 141 | # update the imported job names 142 | self.update_imported_job_names() 143 | 144 | def pause_all_jobs(self, pause=True): 145 | job_list = self.get('/jobs/list').get('jobs', None) 146 | for job_conf in job_list: 147 | job_settings = job_conf['settings'] 148 | job_schedule = job_settings.get('schedule', None) 149 | if job_schedule: 150 | # set all imported jobs as paused or un-paused 151 | if pause: 152 | job_schedule['pause_status'] = 'PAUSED' 153 | else: 154 | job_schedule['pause_status'] = 'UNPAUSED' 155 | job_settings['schedule'] = job_schedule 156 | update_job_conf = {'job_id': job_conf['job_id'], 157 | 'new_settings': job_settings} 158 | update_job_resp = self.post('/jobs/reset', update_job_conf) 159 | 160 | def delete_all_jobs(self): 161 | job_list = self.get('/jobs/list').get('jobs', []) 162 | for job in job_list: 163 | self.post('/jobs/delete', {'job_id': job['job_id']}) 164 | 165 | def get_cluster_id_mapping(self, log_file='clusters.log'): 166 | """ 167 | Get a dict mapping of old cluster ids to new cluster ids for jobs connecting to existing clusters 168 | :param log_file: 169 | :return: 170 | """ 171 | cluster_logfile = self.get_export_dir() + log_file 172 | current_cl = self.get('/clusters/list').get('clusters', []) 173 | old_clusters = {} 174 | # build dict with old cluster name to cluster id mapping 175 | if not os.path.exists(cluster_logfile): 176 | raise ValueError('Clusters log must exist to map clusters to previous existing cluster ids') 177 | with open(cluster_logfile, 'r') as fp: 178 | for line in fp: 179 | conf = json.loads(line) 180 | old_clusters[conf['cluster_name']] = conf['cluster_id'] 181 | new_to_old_mapping = {} 182 | for new_cluster in current_cl: 183 | old_cluster_id = old_clusters.get(new_cluster['cluster_name'], None) 184 | if old_cluster_id: 185 | new_to_old_mapping[old_cluster_id] = new_cluster['cluster_id'] 186 | return new_to_old_mapping 187 | -------------------------------------------------------------------------------- /data/workspace_migration_analysis.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import json, os, datetime, requests 3 | import requests.packages.urllib3 4 | 5 | global pprint_j 6 | 7 | requests.packages.urllib3.disable_warnings() 8 | 9 | 10 | # Helper to pretty print json 11 | def pprint_j(i): 12 | print(json.dumps(i, indent=4, sort_keys=True)) 13 | 14 | 15 | class dbclient: 16 | """ 17 | Rest API Wrapper for Databricks APIs 18 | """ 19 | # 
set of http error codes to throw an exception if hit. Handles client and auth errors 20 | http_error_codes = (401, 403) 21 | 22 | def __init__(self, token, url): 23 | self._token = {'Authorization': 'Bearer {0}'.format(token)} 24 | self._url = url.rstrip("/") 25 | self._is_verbose = False 26 | self._verify_ssl = False 27 | if self._verify_ssl: 28 | # set these env variables if skip SSL verification is enabled 29 | os.environ['REQUESTS_CA_BUNDLE'] = "" 30 | os.environ['CURL_CA_BUNDLE'] = "" 31 | 32 | def is_aws(self): 33 | return self._is_aws 34 | 35 | def is_verbose(self): 36 | return self._is_verbose 37 | 38 | def is_skip_failed(self): 39 | return self._skip_failed 40 | 41 | def test_connection(self): 42 | # verify the proper url settings to configure this client 43 | if self._url[-4:] != '.com' and self._url[-4:] != '.net': 44 | print("Hostname should end in '.com'") 45 | return -1 46 | results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, 47 | verify=self._verify_ssl) 48 | http_status_code = results.status_code 49 | if http_status_code != 200: 50 | print("Error. Either the credentials have expired or the credentials don't have proper permissions.") 51 | print("If you have a ~/.netrc file, check those credentials. Those take precedence over passed input.") 52 | print(results.text) 53 | return -1 54 | return 0 55 | 56 | def get(self, endpoint, json_params=None, version='2.0', print_json=False): 57 | if version: 58 | ver = version 59 | full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint 60 | if self.is_verbose(): 61 | print("Get: {0}".format(full_endpoint)) 62 | if json_params: 63 | raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) 64 | http_status_code = raw_results.status_code 65 | if http_status_code in dbclient.http_error_codes: 66 | raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) 67 | results = raw_results.json() 68 | else: 69 | raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) 70 | http_status_code = raw_results.status_code 71 | if http_status_code in dbclient.http_error_codes: 72 | raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) 73 | results = raw_results.json() 74 | if print_json: 75 | print(json.dumps(results, indent=4, sort_keys=True)) 76 | if type(results) == list: 77 | results = {'elements': results} 78 | results['http_status_code'] = raw_results.status_code 79 | return results 80 | 81 | def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): 82 | if version: 83 | ver = version 84 | full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint 85 | if self.is_verbose(): 86 | print("{0}: {1}".format(http_type, full_endpoint)) 87 | if json_params: 88 | if http_type == 'post': 89 | if files_json: 90 | raw_results = requests.post(full_endpoint, headers=self._token, 91 | data=json_params, files=files_json, verify=self._verify_ssl) 92 | else: 93 | raw_results = requests.post(full_endpoint, headers=self._token, 94 | json=json_params, verify=self._verify_ssl) 95 | if http_type == 'put': 96 | raw_results = requests.put(full_endpoint, headers=self._token, 97 | json=json_params, verify=self._verify_ssl) 98 | if http_type == 'patch': 99 | raw_results = requests.patch(full_endpoint, headers=self._token, 100 | json=json_params, verify=self._verify_ssl) 101 | 102 | http_status_code = 
raw_results.status_code 103 | if http_status_code in dbclient.http_error_codes: 104 | raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, 105 | http_status_code, 106 | raw_results.text)) 107 | results = raw_results.json() 108 | else: 109 | print("Must have a payload in json_args param.") 110 | return {} 111 | if print_json: 112 | print(json.dumps(results, indent=4, sort_keys=True)) 113 | # if results are empty, let's return the return status 114 | if results: 115 | results['http_status_code'] = raw_results.status_code 116 | return results 117 | else: 118 | return {'http_status_code': raw_results.status_code} 119 | 120 | def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): 121 | return self.http_req('post', endpoint, json_params, version, print_json, files_json) 122 | 123 | def put(self, endpoint, json_params, version='2.0', print_json=False): 124 | return self.http_req('put', endpoint, json_params, version, print_json) 125 | 126 | def patch(self, endpoint, json_params, version='2.0', print_json=False): 127 | return self.http_req('patch', endpoint, json_params, version, print_json) 128 | 129 | @staticmethod 130 | def my_map(F, items): 131 | to_return = [] 132 | for elem in items: 133 | to_return.append(F(elem)) 134 | return to_return 135 | 136 | def set_export_dir(self, dir_location): 137 | self._export_dir = dir_location 138 | 139 | def get_export_dir(self): 140 | return self._export_dir 141 | 142 | def get_latest_spark_version(self): 143 | versions = self.get('/clusters/spark-versions')['versions'] 144 | v_sorted = sorted(versions, key=lambda i: i['key'], reverse=True) 145 | for x in v_sorted: 146 | img_type = x['key'].split('-')[1][0:5] 147 | if img_type == 'scala': 148 | return x 149 | 150 | 151 | # COMMAND ---------- 152 | 153 | class migrateclient(dbclient): 154 | 155 | def get_num_defined_jobs(self): 156 | jobs_list = self.get('/jobs/list').get('jobs', []) 157 | return len(jobs_list) 158 | 159 | def get_num_external_jobs(self): 160 | job_runs = self.get('/jobs/runs/list').get('runs', []) 161 | job_ids_list = set(map(lambda x: x.get('job_id', None), self.get('/jobs/list').get('jobs', []))) 162 | job_ids_from_runs = set(map(lambda x: x.get('job_id', None), job_runs)) 163 | ephemeral_job_ids = job_ids_from_runs - job_ids_list 164 | return len(ephemeral_job_ids) 165 | 166 | def get_num_users(self): 167 | users = self.get('/preview/scim/v2/Users').get('Resources', []) 168 | return len(users) 169 | 170 | def get_num_groups(self): 171 | groups = self.get('/preview/scim/v2/Groups').get('Resources', []) 172 | return len(groups) 173 | 174 | def get_num_notebooks(self, second_level=False): 175 | users = self.get('/preview/scim/v2/Users').get('Resources', []) 176 | total_nbs = 0 177 | second_level_dirs = [] 178 | for user in users: 179 | path = '/Users/' + user['userName'] 180 | ls = self.get('/workspace/list', {'path' : path}).get('objects', []) 181 | nbs = list(filter(lambda x: x.get('object_type', None) == 'NOTEBOOK', ls)) 182 | total_nbs += len(nbs) 183 | dirs = list(filter(lambda x: x.get('object_type', None) == 'DIRECTORY', ls)) 184 | for p in dirs: 185 | dir_path = p.get('path') 186 | ls_dir = self.get('/workspace/list', {'path' : dir_path}).get('objects', []) 187 | dir_nbs = list(filter(lambda x: x.get('object_type', None) == 'NOTEBOOK', ls_dir)) 188 | second_level_dirs.extend(filter(lambda x: x.get('object_type', None) == 'DIRECTORY', ls_dir)) 189 | total_nbs += len(dir_nbs) 190 | # search 2 levels deep only to get an 
approximate notebook count 191 | if second_level: 192 | for p in second_level_dirs: 193 | dir_path = p.get('path') 194 | ls_dir = self.get('/workspace/list', {'path' : dir_path}).get('objects', []) 195 | dir_nbs = list(filter(lambda x: x.get('object_type', None) == 'NOTEBOOK', ls_dir)) 196 | total_nbs += len(dir_nbs) 197 | return total_nbs 198 | 199 | def get_num_databases(self): 200 | dbs = spark.catalog.listDatabases() 201 | return len(dbs) 202 | 203 | def get_num_tables(self): 204 | dbs = spark.catalog.listDatabases() 205 | table_count = 0 206 | for db in dbs: 207 | tables = spark.catalog.listTables(db.name) 208 | table_count += len(tables) 209 | return table_count 210 | 211 | 212 | # COMMAND ---------- 213 | 214 | url = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) 215 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 216 | 217 | client = migrateclient(token, url) 218 | 219 | # COMMAND ---------- 220 | 221 | print("Num of users: ", client.get_num_users()) 222 | print("Num of groups: ", client.get_num_groups()) 223 | print("Approximate num of notebooks: ", client.get_num_notebooks(True)) 224 | print("Num of internal jobs: ", client.get_num_defined_jobs()) 225 | print("Num of external jobs: ", client.get_num_external_jobs()) 226 | print("Num of databases: ", client.get_num_databases()) 227 | print("Num of tables: ", client.get_num_tables()) 228 | 229 | # COMMAND ---------- 230 | 231 | 232 | -------------------------------------------------------------------------------- /export_db.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | from timeit import default_timer as timer 3 | from datetime import timedelta, datetime 4 | import os 5 | import shutil 6 | 7 | 8 | # python 3.6 9 | def main(): 10 | # define a parser to identify what component to import / export 11 | my_parser = get_export_parser() 12 | # parse the args 13 | args = my_parser.parse_args() 14 | 15 | if os.name == 'nt' and (not args.bypass_windows_check): 16 | raise ValueError('This tool currently does not support running on Windows OS') 17 | 18 | # parse the path location of the Databricks CLI configuration 19 | login_args = get_login_credentials(profile=args.profile) 20 | if is_azure_creds(login_args) and (not args.azure): 21 | raise ValueError('Login credentials do not match args. 
Please provide --azure flag for azure envs.') 22 | 23 | # cant use netrc credentials because requests module tries to load the credentials into http basic auth headers 24 | # parse the credentials 25 | url = login_args['host'] 26 | token = login_args['token'] 27 | client_config = build_client_config(url, token, args) 28 | 29 | os.makedirs(client_config['export_dir'], exist_ok=True) 30 | 31 | if client_config['debug']: 32 | print(url, token) 33 | now = str(datetime.now()) 34 | 35 | if args.users: 36 | print("Export all users and groups at {0}".format(now)) 37 | scim_c = ScimClient(client_config) 38 | start = timer() 39 | # log all users 40 | scim_c.log_all_users() 41 | end = timer() 42 | print("Complete Users Export Time: " + str(timedelta(seconds=end - start))) 43 | start = timer() 44 | # log all groups 45 | scim_c.log_all_groups() 46 | end = timer() 47 | print("Complete Group Export Time: " + str(timedelta(seconds=end - start))) 48 | # log the instance profiles 49 | if scim_c.is_aws(): 50 | cl_c = ClustersClient(client_config) 51 | print("Start instance profile logging ...") 52 | start = timer() 53 | cl_c.log_instance_profiles() 54 | end = timer() 55 | print("Complete Instance Profile Export Time: " + str(timedelta(seconds=end - start))) 56 | 57 | if args.workspace: 58 | print("Export the complete workspace at {0}".format(now)) 59 | ws_c = WorkspaceClient(client_config) 60 | start = timer() 61 | # log notebooks and libraries 62 | ws_c.init_workspace_logfiles() 63 | num_notebooks = ws_c.log_all_workspace_items() 64 | print("Total number of notebooks logged: ", num_notebooks) 65 | end = timer() 66 | print("Complete Workspace Export Time: " + str(timedelta(seconds=end - start))) 67 | 68 | if args.workspace_acls: 69 | print("Export the ACLs for workspace objects at {0}".format(now)) 70 | ws_c = WorkspaceClient(client_config) 71 | start = timer() 72 | # log notebooks and directory acls 73 | ws_c.log_all_workspace_acls() 74 | end = timer() 75 | print("Complete Workspace Permission Export Time: " + str(timedelta(seconds=end - start))) 76 | 77 | if args.download: 78 | print("Starting complete workspace download at {0}".format(now)) 79 | ws_c = WorkspaceClient(client_config) 80 | start = timer() 81 | # log notebooks and libraries 82 | num_notebooks = ws_c.download_notebooks() 83 | print(f"Total number of notebooks downloaded: {num_notebooks}") 84 | end = timer() 85 | print("Complete Workspace Download Time: " + str(timedelta(seconds=end - start))) 86 | 87 | if args.libs: 88 | if not client_config['is_aws']: 89 | print("Databricks does not support library exports on Azure today") 90 | else: 91 | print("Starting complete library log at {0}".format(now)) 92 | lib_c = LibraryClient(client_config) 93 | start = timer() 94 | lib_c.log_library_details() 95 | end = timer() 96 | print("Complete Library Download Time: " + str(timedelta(seconds=end - start))) 97 | 98 | if args.clusters: 99 | print("Export the cluster configs at {0}".format(now)) 100 | cl_c = ClustersClient(client_config) 101 | start = timer() 102 | # log the cluster json 103 | cl_c.log_cluster_configs() 104 | cl_c.log_cluster_policies() 105 | end = timer() 106 | print("Complete Cluster Export Time: " + str(timedelta(seconds=end - start))) 107 | # log the instance pools 108 | print("Start instance pool logging ...") 109 | start = timer() 110 | cl_c.log_instance_pools() 111 | end = timer() 112 | print("Complete Instance Pools Export Time: " + str(timedelta(seconds=end - start))) 113 | 114 | if args.jobs: 115 | print("Export the jobs configs at 
{0}".format(now)) 116 | start = timer() 117 | jobs_c = JobsClient(client_config) 118 | # log job configs 119 | jobs_c.log_job_configs() 120 | end = timer() 121 | print("Complete Jobs Export Time: " + str(timedelta(seconds=end - start))) 122 | 123 | if args.pause_all_jobs: 124 | print("Pause all current jobs {0}".format(now)) 125 | start = timer() 126 | jobs_c = JobsClient(client_config) 127 | # log job configs 128 | jobs_c.pause_all_jobs() 129 | end = timer() 130 | print("Paused all jobs time: " + str(timedelta(seconds=end - start))) 131 | 132 | if args.unpause_all_jobs: 133 | print("Unpause all current jobs {0}".format(now)) 134 | start = timer() 135 | jobs_c = JobsClient(client_config) 136 | # log job configs 137 | jobs_c.pause_all_jobs(False) 138 | end = timer() 139 | print("Unpaused all jobs time: " + str(timedelta(seconds=end - start))) 140 | 141 | if args.metastore or args.metastore_unicode: 142 | print("Export the metastore configs at {0}".format(now)) 143 | start = timer() 144 | hive_c = HiveClient(client_config) 145 | if args.database is not None: 146 | # export only a single database with a given iam role 147 | database_name = args.database 148 | hive_c.export_database(database_name, args.cluster_name, args.iam, has_unicode=args.metastore_unicode) 149 | else: 150 | # export all of the metastore 151 | hive_c.export_hive_metastore(cluster_name=args.cluster_name, has_unicode=args.metastore_unicode) 152 | end = timer() 153 | print("Complete Metastore Export Time: " + str(timedelta(seconds=end - start))) 154 | 155 | if args.secrets: 156 | if not args.cluster_name: 157 | print("Please provide an existing cluster name w/ --cluster-name option\n") 158 | return 159 | print("Export the secret scopes configs at {0}".format(now)) 160 | start = timer() 161 | sc = SecretsClient(client_config) 162 | # log job configs 163 | sc.log_all_secrets(args.cluster_name) 164 | sc.log_all_secrets_acls() 165 | end = timer() 166 | print("Complete Secrets Export Time: " + str(timedelta(seconds=end - start))) 167 | 168 | if args.mounts: 169 | print("Export the mount configs at {0}".format(now)) 170 | start = timer() 171 | dbfs_c = DbfsClient(client_config) 172 | # log job configs 173 | dbfs_c.export_dbfs_mounts() 174 | end = timer() 175 | print("Complete Mounts Export Time: " + str(timedelta(seconds=end - start))) 176 | 177 | if args.update_account_id and args.old_account_id: 178 | print("Updating old account id to new account at {0}".format(now)) 179 | start = timer() 180 | client = dbclient(client_config) 181 | client.update_account_id(args.update_account_id, args.old_account_id) 182 | end = timer() 183 | print("Complete account id update time: " + str(timedelta(seconds=end - start))) 184 | 185 | if args.replace_old_email and args.update_new_email: 186 | print("Updating old email to new email address at {0}".format(now)) 187 | start = timer() 188 | client = dbclient(client_config) 189 | client.update_email_addresses(args.replace_old_email, args.update_new_email) 190 | end = timer() 191 | print("Complete email update time: " + str(timedelta(seconds=end - start))) 192 | 193 | if args.replace_email: 194 | print("Updating old email(s) to new email(s)) at {0}".format(now)) 195 | start = timer() 196 | client = dbclient(client_config) 197 | #parse list list of e-mail mapping pairs. 
Format is: old1@email.com:new1@e-mail.com,old2email.com:new2@email.com 198 | emailpairs = args.replace_email.split(',') 199 | print(str(len(emailpairs)) +' emails found to replace') 200 | for emailpair in emailpairs: 201 | if len(emailpair.split(':')) < 2: 202 | print('Syntax error in e-mail '+emailpair+'. Old e-mail address and new e-mail address new to be separated by a :') 203 | else: 204 | old_email=emailpair.split(':')[0] 205 | new_email=emailpair.split(':')[1] 206 | print('Replacing old e-mail: '+old_email+' with new e-mail '+new_email) 207 | client.update_email_addresses(old_email, new_email) 208 | end = timer() 209 | print("Complete email update time: " + str(timedelta(seconds=end - start))) 210 | 211 | if args.single_user: 212 | user_email = args.single_user 213 | print(f"Export user {user_email} at {now}") 214 | scim_c = ScimClient(client_config) 215 | start = timer() 216 | # log all users 217 | scim_c.log_single_user(user_email) 218 | end = timer() 219 | print("Complete single user export: " + str(timedelta(seconds=end - start))) 220 | 221 | if args.workspace_top_level_only: 222 | print("Export top level workspace objects at {0}".format(now)) 223 | ws_c = WorkspaceClient(client_config) 224 | start = timer() 225 | # log notebooks and directory acls 226 | ws_c.export_top_level_folders() 227 | end = timer() 228 | print("Complete Workspace Top Level Notebooks Export Time: " + str(timedelta(seconds=end - start))) 229 | 230 | if args.export_home: 231 | username = args.export_home 232 | print("Exporting home directory: {0}".format(username)) 233 | ws_c = WorkspaceClient(client_config) 234 | start = timer() 235 | # log notebooks and libraries 236 | ws_c.export_user_home(username, 'user_exports') 237 | end = timer() 238 | print("Complete User Export Time: " + str(timedelta(seconds=end - start))) 239 | 240 | if args.export_groups: 241 | group_name_list = convert_args_to_list(args.export_groups) 242 | print("Exporting Groups: {0}".format(group_name_list)) 243 | start = timer() 244 | scim_c = ScimClient(client_config) 245 | # log notebooks and libraries 246 | user_names = scim_c.log_groups_from_list(group_name_list) 247 | print('Export users notebooks:', user_names) 248 | ws_c = WorkspaceClient(client_config) 249 | for username in user_names: 250 | is_user_home_empty = ws_c.is_user_home_empty(username) 251 | if not is_user_home_empty: 252 | ws_c.export_user_home(username, 'user_exports') 253 | print('Exporting users jobs:') 254 | jobs_c = JobsClient(client_config) 255 | jobs_c.log_job_configs(users_list=user_names) 256 | end = timer() 257 | print("Complete User Export Time: " + str(timedelta(seconds=end - start))) 258 | 259 | if args.reset_exports: 260 | print('Request to clean up old export directory') 261 | start = timer() 262 | client = dbclient(client_config) 263 | export_dir = client.get_export_dir() 264 | response = prompt_for_input(f'\nPlease confirm that you would like to delete all the logs from {export_dir}' 265 | f' [yes/no]:') 266 | if response: 267 | print('Deleting old export directory and logs ...') 268 | try: 269 | shutil.rmtree(export_dir) 270 | except OSError as e: 271 | print("Error: %s - %s." 
% (e.filename, e.strerror)) 272 | end = timer() 273 | print("Completed cleanup: " + str(timedelta(seconds=end - start))) 274 | 275 | 276 | if __name__ == '__main__': 277 | main() 278 | -------------------------------------------------------------------------------- /dbclient/dbclient.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import requests 4 | import fileinput 5 | import requests.packages.urllib3 6 | 7 | global pprint_j 8 | 9 | requests.packages.urllib3.disable_warnings() 10 | 11 | 12 | # Helper to pretty print json 13 | def pprint_j(i): 14 | print(json.dumps(i, indent=4, sort_keys=True)) 15 | 16 | 17 | class dbclient: 18 | """ 19 | Rest API Wrapper for Databricks APIs 20 | """ 21 | # set of http error codes to throw an exception if hit. Handles client and auth errors 22 | http_error_codes = (401, 403) 23 | 24 | def __init__(self, configs): 25 | self._token = {'Authorization': 'Bearer {0}'.format(configs['token'])} 26 | self._url = configs['url'].rstrip("/") 27 | self._export_dir = configs['export_dir'] 28 | self._is_aws = configs['is_aws'] 29 | self._skip_failed = configs['skip_failed'] 30 | self._is_verbose = configs['verbose'] 31 | self._verify_ssl = configs['verify_ssl'] 32 | self._file_format = configs['file_format'] 33 | if self._verify_ssl: 34 | # set these env variables if skip SSL verification is enabled 35 | os.environ['REQUESTS_CA_BUNDLE'] = "" 36 | os.environ['CURL_CA_BUNDLE'] = "" 37 | os.makedirs(self._export_dir, exist_ok=True) 38 | 39 | def is_aws(self): 40 | return self._is_aws 41 | 42 | def is_verbose(self): 43 | return self._is_verbose 44 | 45 | def is_skip_failed(self): 46 | return self._skip_failed 47 | 48 | def get_file_format(self): 49 | return self._file_format 50 | 51 | def is_source_file_format(self): 52 | if self._file_format == 'SOURCE': 53 | return True 54 | return False 55 | 56 | def test_connection(self): 57 | # verify the proper url settings to configure this client 58 | if self._url[-4:] != '.com' and self._url[-4:] != '.net': 59 | print("Hostname should end in '.com'") 60 | return -1 61 | results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, 62 | verify=self._verify_ssl) 63 | http_status_code = results.status_code 64 | if http_status_code != 200: 65 | print("Error. Either the credentials have expired or the credentials don't have proper permissions.") 66 | print("If you have a ~/.netrc file, check those credentials. 
Those take precedence over passed input.") 67 | print(results.text) 68 | return -1 69 | return 0 70 | 71 | @staticmethod 72 | def delete_dir_if_empty(local_dir): 73 | if len(os.listdir(local_dir)) == 0: 74 | os.rmdir(local_dir) 75 | 76 | def get(self, endpoint, json_params=None, version='2.0', print_json=False): 77 | if version: 78 | ver = version 79 | full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint 80 | if self.is_verbose(): 81 | print("Get: {0}".format(full_endpoint)) 82 | if json_params: 83 | raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) 84 | http_status_code = raw_results.status_code 85 | if http_status_code in dbclient.http_error_codes: 86 | raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) 87 | results = raw_results.json() 88 | else: 89 | raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) 90 | http_status_code = raw_results.status_code 91 | if http_status_code in dbclient.http_error_codes: 92 | raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) 93 | results = raw_results.json() 94 | if print_json: 95 | print(json.dumps(results, indent=4, sort_keys=True)) 96 | if type(results) == list: 97 | results = {'elements': results} 98 | results['http_status_code'] = raw_results.status_code 99 | return results 100 | 101 | def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): 102 | if version: 103 | ver = version 104 | full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint 105 | if self.is_verbose(): 106 | print("{0}: {1}".format(http_type, full_endpoint)) 107 | if json_params: 108 | if http_type == 'post': 109 | if files_json: 110 | raw_results = requests.post(full_endpoint, headers=self._token, 111 | data=json_params, files=files_json, verify=self._verify_ssl) 112 | else: 113 | raw_results = requests.post(full_endpoint, headers=self._token, 114 | json=json_params, verify=self._verify_ssl) 115 | if http_type == 'put': 116 | raw_results = requests.put(full_endpoint, headers=self._token, 117 | json=json_params, verify=self._verify_ssl) 118 | if http_type == 'patch': 119 | raw_results = requests.patch(full_endpoint, headers=self._token, 120 | json=json_params, verify=self._verify_ssl) 121 | http_status_code = raw_results.status_code 122 | if http_status_code in dbclient.http_error_codes: 123 | raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, 124 | http_status_code, 125 | raw_results.text)) 126 | results = raw_results.json() 127 | else: 128 | print("Must have a payload in json_args param.") 129 | return {} 130 | if print_json: 131 | print(json.dumps(results, indent=4, sort_keys=True)) 132 | # if results are empty, let's return the return status 133 | if results: 134 | results['http_status_code'] = raw_results.status_code 135 | return results 136 | else: 137 | return {'http_status_code': raw_results.status_code} 138 | 139 | def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): 140 | return self.http_req('post', endpoint, json_params, version, print_json, files_json) 141 | 142 | def put(self, endpoint, json_params, version='2.0', print_json=False): 143 | return self.http_req('put', endpoint, json_params, version, print_json) 144 | 145 | def patch(self, endpoint, json_params, version='2.0', print_json=False): 146 | return self.http_req('patch', endpoint, 
json_params, version, print_json) 147 | 148 | @staticmethod 149 | def get_num_lines(filename): 150 | i = 0 151 | with open(filename) as f: 152 | for i, l in enumerate(f): 153 | pass 154 | return i + 1 155 | 156 | @staticmethod 157 | def get_key(http_resp, key_name): 158 | value = http_resp.get(key_name, None) 159 | if value is None: 160 | print(http_resp) 161 | raise ValueError('Unable to find key') 162 | return value 163 | 164 | @staticmethod 165 | def my_map(F, items): 166 | to_return = [] 167 | for elem in items: 168 | to_return.append(F(elem)) 169 | return to_return 170 | 171 | def whoami(self): 172 | """ 173 | get current user userName from SCIM API 174 | :return: username string 175 | """ 176 | user_name = self.get('/preview/scim/v2/Me').get('userName') 177 | return user_name 178 | 179 | def build_acl_args(self, full_acl_list, is_jobs=False): 180 | """ 181 | Take the ACL json and return a json that corresponds to the proper input with permission level one level higher 182 | { 'acl': [ { (user_name, group_name): {'permission_level': '*'}, ... ] } 183 | for job ACLs, we need to reset the OWNER, so set the admin as CAN_MANAGE instead 184 | :param full_acl_list: 185 | :return: 186 | """ 187 | acls_list = [] 188 | current_owner = '' 189 | for member in full_acl_list: 190 | permissions = member.get('all_permissions')[0].get('permission_level') 191 | if 'user_name' in member: 192 | acls_list.append({'user_name': member.get('user_name'), 193 | 'permission_level': permissions}) 194 | if permissions == 'IS_OWNER': 195 | current_owner = member.get('user_name') 196 | else: 197 | if member.get('group_name') != 'admins': 198 | acls_list.append({'group_name': member.get('group_name'), 199 | 'permission_level': permissions}) 200 | if permissions == 'IS_OWNER': 201 | current_owner = member.get('group_name') 202 | 203 | if is_jobs: 204 | me = self.whoami() 205 | if current_owner != me: 206 | update_admin = {'user_name': self.whoami(), 207 | 'permission_level': 'CAN_MANAGE'} 208 | acls_list.append(update_admin) 209 | return acls_list 210 | 211 | def set_export_dir(self, dir_location): 212 | self._export_dir = dir_location 213 | 214 | def get_export_dir(self): 215 | return self._export_dir 216 | 217 | def get_url(self): 218 | return self._url 219 | 220 | def get_latest_spark_version(self): 221 | versions = self.get('/clusters/spark-versions')['versions'] 222 | v_sorted = sorted(versions, key=lambda i: i['key'], reverse=True) 223 | for x in v_sorted: 224 | img_type = x['key'].split('-')[1][0:5] 225 | if img_type == 'scala': 226 | return x 227 | 228 | def replace_file_contents(self, old_str, new_str, filename): 229 | """ 230 | regex replace all occurrences of a string with a new value 231 | :param old_str: old value to replace, e.g. account id, old email, etc. 
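(note: the substitution is a literal str.replace applied to every line of the logfile, not a regex match)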
232 | :param new_str: new value 233 | :param filename: logfile path relative to the export dir 234 | :return: 235 | """ 236 | log_dir = self.get_export_dir() 237 | update_filename = log_dir + filename 238 | with fileinput.FileInput(update_filename, inplace=True, backup='.bak') as fp: 239 | for line in fp: 240 | print(line.replace(old_str, new_str), end='') 241 | # cleanup old backup file once completed 242 | f_backup = log_dir + filename + '.bak' 243 | os.remove(f_backup) 244 | 245 | def update_account_id(self, new_aws_account_id, old_account_id): 246 | log_dir = self.get_export_dir() 247 | logs_to_update = ['users.log', 248 | 'instance_profiles.log', 'clusters.log', 'cluster_policies.log', 249 | 'jobs.log'] 250 | # update individual logs first 251 | for log_name in logs_to_update: 252 | if os.path.exists(log_dir + log_name): 253 | self.replace_file_contents(old_account_id, new_aws_account_id, log_name) 254 | # # update group logs 255 | group_dir = log_dir + 'groups/' 256 | groups = os.listdir(group_dir) 257 | for group_name in groups: 258 | group_file = 'groups/' + group_name 259 | if os.path.exists(log_dir + group_file): 260 | self.replace_file_contents(old_account_id, new_aws_account_id, group_file) 261 | 262 | def update_email_addresses(self, old_email_address, new_email_address): 263 | """ 264 | :param old_email_address: 265 | :param new_email_address: 266 | :return: 267 | """ 268 | log_dir = self.get_export_dir() 269 | logs_to_update = ['users.log', 270 | 'acl_jobs.log', 271 | 'acl_clusters.log', 'acl_cluster_policies.log', 272 | 'acl_notebooks.log', 'acl_directories.log'] 273 | for logfile in logs_to_update: 274 | if os.path.exists(log_dir + logfile): 275 | self.replace_file_contents(old_email_address, new_email_address, logfile) 276 | # update the path for user notebooks in bulk export mode 277 | bulk_export_dir = log_dir + 'artifacts/Users/' 278 | old_bulk_export_dir = bulk_export_dir + old_email_address 279 | new_bulk_export_dir = bulk_export_dir + new_email_address 280 | if os.path.exists(old_bulk_export_dir): 281 | os.rename(old_bulk_export_dir, new_bulk_export_dir) 282 | # update the path for user notebooks in single user export mode 283 | single_user_dir = log_dir + 'user_exports/' 284 | old_single_user_dir = single_user_dir + old_email_address 285 | new_single_user_dir = single_user_dir + new_email_address 286 | if os.path.exists(old_single_user_dir): 287 | os.rename(old_single_user_dir, new_single_user_dir) 288 | old_single_user_nbs_dir = new_single_user_dir + '/user_artifacts/Users/' + old_email_address 289 | new_single_user_nbs_dir = new_single_user_dir + '/user_artifacts/Users/' + new_email_address 290 | if os.path.exists(old_single_user_nbs_dir): 291 | os.rename(old_single_user_nbs_dir, new_single_user_nbs_dir) 292 | print("Update email address complete") 293 | -------------------------------------------------------------------------------- /dbclient/parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import configparser 3 | import re 4 | from enum import Enum 5 | from os import path 6 | 7 | auth_key = ['host', 8 | 'username', 9 | 'token'] 10 | 11 | 12 | class NotebookFormat(Enum): 13 | dbc = 'DBC' 14 | source = 'SOURCE' 15 | html = 'HTML' 16 | # jupyter is only supported for python notebooks. 
consider adding this back if there's demand 17 | # jupyter = 'JUPYTER' 18 | 19 | def __str__(self): 20 | return self.value 21 | 22 | 23 | def is_azure_creds(creds): 24 | if 'azuredatabricks.net' in creds['host']: 25 | return True 26 | return False 27 | 28 | 29 | def convert_args_to_list(arg_str): 30 | arg_list = map(lambda x: x.lstrip().rstrip(), arg_str.split(',')) 31 | return list(arg_list) 32 | 33 | 34 | def get_login_credentials(creds_path='~/.databrickscfg', profile='DEFAULT'): 35 | config = configparser.ConfigParser() 36 | abs_creds_path = path.expanduser(creds_path) 37 | config.read(abs_creds_path) 38 | try: 39 | current_profile = dict(config[profile]) 40 | return current_profile 41 | except KeyError: 42 | raise ValueError('Unable to find credentials to load for profile. Profile only supports tokens.') 43 | 44 | 45 | def get_export_user_parser(): 46 | # export workspace items 47 | parser = argparse.ArgumentParser(description='Export user(s) workspace artifacts from Databricks') 48 | 49 | parser.add_argument('--profile', action='store', default='DEFAULT', 50 | help='Profile to parse the credentials') 51 | 52 | parser.add_argument('--azure', action='store_true', default=False, 53 | help='Run on Azure. (Default is AWS)') 54 | 55 | parser.add_argument('--skip-failed', action='store_true', default=False, 56 | help='Skip retries for any failed hive metastore exports.') 57 | 58 | parser.add_argument('--silent', action='store_true', default=False, 59 | help='Silent all logging of export operations.') 60 | # Don't verify ssl 61 | parser.add_argument('--no-ssl-verification', action='store_true', 62 | help='Set Verify=False when making http requests.') 63 | 64 | parser.add_argument('--debug', action='store_true', 65 | help='Enable debug logging') 66 | 67 | parser.add_argument('--set-export-dir', action='store', 68 | help='Set the base directory to export artifacts') 69 | 70 | parser.add_argument('--users', action='store', 71 | help='Download user(s) artifacts such as notebooks, cluster specs, jobs. ' 72 | 'Provide a list of user ids / emails to export') 73 | 74 | return parser 75 | 76 | 77 | def get_export_parser(): 78 | # export workspace items 79 | parser = argparse.ArgumentParser(description='Export full workspace artifacts from Databricks') 80 | 81 | # export all users and groups 82 | parser.add_argument('--users', action='store_true', 83 | help='Download all the users and groups in the workspace') 84 | 85 | # log all user workspace paths 86 | parser.add_argument('--workspace', action='store_true', 87 | help='Log all the notebook paths in the workspace. 
(metadata only)') 88 | 89 | parser.add_argument('--notebook-format', type=NotebookFormat, 90 | choices=list(NotebookFormat), default=NotebookFormat.dbc, 91 | help='Choose the file format to download the notebooks (default: DBC)') 92 | 93 | # download all user workspace notebooks 94 | parser.add_argument('--download', action='store_true', 95 | help='Download all notebooks for the environment') 96 | 97 | # add all lib configs 98 | parser.add_argument('--libs', action='store_true', 99 | help='Log all the libs for the environment') 100 | 101 | # add all clusters configs 102 | parser.add_argument('--clusters', action='store_true', 103 | help='Log all the clusters for the environment') 104 | 105 | # get all job configs 106 | parser.add_argument('--jobs', action='store_true', 107 | help='Log all the job configs for the environment') 108 | # get all metastore 109 | parser.add_argument('--metastore', action='store_true', 110 | help='log all the metastore table definitions') 111 | 112 | # get all secret scopes 113 | parser.add_argument('--secrets', action='store_true', 114 | help='log all the secret scopes') 115 | 116 | # get all metastore 117 | parser.add_argument('--metastore-unicode', action='store_true', 118 | help='log all the metastore table definitions including unicode characters') 119 | 120 | # cluster name used to export the metastore 121 | parser.add_argument('--cluster-name', action='store', 122 | help='Cluster name to export the metastore to a specific cluster. Cluster will be started.') 123 | 124 | # get database to export for metastore 125 | parser.add_argument('--database', action='store', 126 | help='Database name to export for the metastore. Single database name supported') 127 | 128 | # iam role used to export the metastore 129 | parser.add_argument('--iam', action='store', 130 | help='IAM Instance Profile to export metastore entires') 131 | 132 | # skip failures 133 | parser.add_argument('--skip-failed', action='store_true', default=False, 134 | help='Skip retries for any failed hive metastore exports.') 135 | 136 | # get mount points 137 | parser.add_argument('--mounts', action='store_true', default=False, 138 | help='Log all mount points.') 139 | # get azure logs 140 | parser.add_argument('--azure', action='store_true', default=False, 141 | help='Run on Azure. (Default is AWS)') 142 | # 143 | parser.add_argument('--profile', action='store', default='DEFAULT', 144 | help='Profile to parse the credentials') 145 | 146 | parser.add_argument('--single-user', action='store', 147 | help='User\'s email to export their user identity and entitlements') 148 | 149 | parser.add_argument('--export-home', action='store', 150 | help='User workspace name to export, typically the users email address') 151 | 152 | parser.add_argument('--export-groups', action='store', 153 | help='Group names to export as a set. 
Includes group, users, and notebooks.') 154 | 155 | parser.add_argument('--workspace-acls', action='store_true', 156 | help='Permissions for workspace objects to export') 157 | 158 | parser.add_argument('--workspace-top-level-only', action='store_true', 159 | help='Download only top level notebook directories') 160 | 161 | parser.add_argument('--silent', action='store_true', default=False, 162 | help='Silent all logging of export operations.') 163 | # Don't verify ssl 164 | parser.add_argument('--no-ssl-verification', action='store_true', 165 | help='Set Verify=False when making http requests.') 166 | 167 | parser.add_argument('--debug', action='store_true', 168 | help='Enable debug logging') 169 | 170 | parser.add_argument('--reset-exports', action='store_true', 171 | help='Clear export directory') 172 | 173 | parser.add_argument('--set-export-dir', action='store', 174 | help='Set the base directory to export artifacts') 175 | 176 | parser.add_argument('--pause-all-jobs', action='store_true', 177 | help='Pause all scheduled jobs') 178 | 179 | parser.add_argument('--unpause-all-jobs', action='store_true', 180 | help='Unpause all scheduled jobs') 181 | 182 | parser.add_argument('--update-account-id', action='store', 183 | help='Set the account id for instance profiles to a new account id') 184 | 185 | parser.add_argument('--old-account-id', action='store', 186 | help='Old account ID to filter on') 187 | 188 | parser.add_argument('--replace-old-email', action='store', 189 | help='Old email address to update from logs') 190 | 191 | parser.add_argument('--update-new-email', action='store', 192 | help='New email address to replace the logs') 193 | 194 | parser.add_argument('--replace-email', action='store', 195 | help='Update old emails with new e-mails. NOTE: Similar to replace-old-email but capable of using multiple e-mails. Format old1@email:new1@email.com,old2@email.com:new2@email.com') 196 | 197 | parser.add_argument('--bypass-windows-check', action='store_true', 198 | help='By-pass windows os checker') 199 | return parser 200 | 201 | 202 | def get_import_parser(): 203 | # import workspace items parser 204 | parser = argparse.ArgumentParser(description='Import full workspace artifacts into Databricks') 205 | 206 | # import all users and groups 207 | parser.add_argument('--users', action='store_true', 208 | help='Import all the users and groups from the logfile.') 209 | 210 | # import all notebooks 211 | parser.add_argument('--workspace', action='store_true', 212 | help='Import all notebooks from export dir into the workspace.') 213 | 214 | # skip previous successful imports 215 | parser.add_argument('--restart-from-checkpoint', action='store_true', 216 | help='Restart the workspace import and skip previously successful imports. ' 217 | 'Only works with --workspace option') 218 | 219 | parser.add_argument('--workspace-top-level', action='store_true', 220 | help='Import all top level notebooks from export dir into the workspace. 
Excluding Users dirs') 221 | 222 | parser.add_argument('--workspace-acls', action='store_true', 223 | help='Permissions for workspace objects to import') 224 | 225 | parser.add_argument('--notebook-format', type=NotebookFormat, 226 | choices=list(NotebookFormat), default=NotebookFormat.dbc, 227 | help='Choose the file format of the notebook to import (default: DBC)') 228 | 229 | parser.add_argument('--import-home', action='store', 230 | help='User workspace name to import, typically the users email address') 231 | 232 | parser.add_argument('--import-groups', action='store_true', 233 | help='Groups to import into a new workspace. Includes group creation and user notebooks.') 234 | 235 | # import all notebooks 236 | parser.add_argument('--archive-missing', action='store_true', 237 | help='Import all missing users into the top level /Archive/ directory.') 238 | 239 | # import all lib configs 240 | parser.add_argument('--libs', action='store_true', 241 | help='Import all the libs from the logfile into the workspace.') 242 | 243 | # import all clusters configs 244 | parser.add_argument('--clusters', action='store_true', 245 | help='Import all the cluster configs for the environment') 246 | 247 | # import all job configs 248 | parser.add_argument('--jobs', action='store_true', 249 | help='Import all job configurations to the environment.') 250 | 251 | # import all metastore 252 | parser.add_argument('--metastore', action='store_true', 253 | help='Import the metastore to the workspace.') 254 | 255 | # import all metastore including defns with unicode 256 | parser.add_argument('--metastore-unicode', action='store_true', 257 | help='Import all the metastore table definitions with unicode characters') 258 | 259 | parser.add_argument('--get-repair-log', action='store_true', 260 | help='Report on current tables requiring repairs') 261 | 262 | # cluster name used to import the metastore 263 | parser.add_argument('--cluster-name', action='store', 264 | help='Cluster name to import the metastore to a specific cluster. Cluster will be started.') 265 | # skip failures 266 | parser.add_argument('--skip-failed', action='store_true', default=False, 267 | help='Skip missing users that do not exist when importing user notebooks') 268 | 269 | # get azure logs 270 | parser.add_argument('--azure', action='store_true', 271 | help='Run on Azure. 
(Default is AWS)') 272 | # 273 | parser.add_argument('--profile', action='store', default='DEFAULT', 274 | help='Profile to parse the credentials') 275 | 276 | parser.add_argument('--single-user', action='store', 277 | help='User\'s email to export their user identity and entitlements') 278 | 279 | # Don't verify ssl 280 | parser.add_argument('--no-ssl-verification', action='store_true', 281 | help='Set Verify=False when making http requests.') 282 | 283 | parser.add_argument('--silent', action='store_true', 284 | help='Silent all logging of import operations.') 285 | 286 | parser.add_argument('--debug', action='store_true', 287 | help='Enable debug logging') 288 | 289 | parser.add_argument('--set-export-dir', action='store', 290 | help='Set the base directory to import artifacts if the export dir was a customized') 291 | 292 | parser.add_argument('--pause-all-jobs', action='store_true', 293 | help='Pause all scheduled jobs') 294 | 295 | parser.add_argument('--unpause-all-jobs', action='store_true', 296 | help='Unpause all scheduled jobs') 297 | 298 | parser.add_argument('--delete-all-jobs', action='store_true', 299 | help='Delete all jobs') 300 | return parser 301 | 302 | 303 | def prompt_for_input(message): 304 | import sys 305 | # raw_input returns the empty string for "enter", therefore default is no 306 | yes = {'yes','y', 'ye'} 307 | no = {'no','n', ''} 308 | 309 | choice = input(message + '\n').lower() 310 | if choice in yes: 311 | return True 312 | elif choice in no: 313 | return False 314 | else: 315 | sys.stdout.write("Please respond with 'yes' or 'no'") 316 | 317 | 318 | def url_validation(url): 319 | if '/?o=' in url: 320 | # if the workspace_id exists, lets remove it from the URL 321 | new_url = re.sub("\/\?o=.*", '', url) 322 | return new_url 323 | elif 'net/' == url[-4:]: 324 | return url[:-1] 325 | elif 'com/' == url[-4:]: 326 | return url[:-1] 327 | return url 328 | 329 | 330 | def build_client_config(url, token, args): 331 | # cant use netrc credentials because requests module tries to load the credentials into http basic auth headers 332 | # aws is the default 333 | config = {'url': url_validation(url), 334 | 'token': token, 335 | 'is_aws': (not args.azure), 336 | 'verbose': (not args.silent), 337 | 'verify_ssl': (not args.no_ssl_verification), 338 | 'skip_failed': args.skip_failed, 339 | 'debug': args.debug, 340 | 'file_format': str(args.notebook_format) 341 | } 342 | if args.set_export_dir: 343 | if args.set_export_dir.rstrip()[-1] != '/': 344 | config['export_dir'] = args.set_export_dir + '/' 345 | else: 346 | config['export_dir'] = args.set_export_dir 347 | elif config['is_aws']: 348 | config['export_dir'] = 'logs/' 349 | else: 350 | config['export_dir'] = 'azure_logs/' 351 | return config 352 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Deprecated] Databricks Migration Tool 2 | 3 | ## This project is deprecated and official support is moving to the Databricks Labs project: [migrate](https://github.com/databrickslabs/migrate) 4 | 5 | This is a migration package to log all Databricks resources for backup and/or migrating to another Databricks workspace. 6 | Migration allows a Databricks organization to move resources between Databricks Workspaces, 7 | to move between different cloud providers, or to move to different regions / accounts. 8 | 9 | Packaged is based on python 3.6 and DBR 6.x and 7.x releases. 
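As a quick sanity check before exporting anything, you can install the two runtime dependencies and run the bundled connection test; `DEMO` below is a placeholder for whatever Databricks CLI profile you have configured:

```bash
# install runtime dependencies, then verify API connectivity for a CLI profile
pip install cron-descriptor requests
python test_connection.py --profile DEMO
```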
10 | 11 | **Note:** Tools does not support windows currently since path resolution is different than mac / linux. 12 | Support for Windows is work in progress to update all paths to use pathlib resolution. 13 | 14 | This package uses credentials from the 15 | [Databricks CLI](https://docs.databricks.com/user-guide/dev-tools/databricks-cli.html) 16 | 17 | Support Matrix for Import and Export Operations: 18 | 19 | | Component | Export | Import | 20 | | ----------------- | ------------ | ------------ | 21 | | Users / Groups | Supported | Supported | 22 | | Clusters (w/ ACLs)| Supported | Supported | 23 | | Notebooks | Supported | Supported | 24 | | Notebooks ACLs | Supported | Supported | 25 | | Metastore | Supported | Supported | 26 | | Jobs (w/ ACLs) | Supported | Supported | 27 | | Libraries | Supported | Unsupported | 28 | | Secrets | Unsupported | Unsupported | 29 | | ML Models | Unsupported | Unsupported | 30 | | Table ACLs | Unsupported | Unsupported | 31 | 32 | **DBFS Data Migration:** 33 | * DBFS is a protected object storage location on AWS and Azure. 34 | * Data within the DBFS bucket can be very large, and the Databricks support team will need to help here. 35 | * The Databricks support team has a tool available to help with DBFS migrations between AWS workspaces today. 36 | * Azure DBFS migrations is work in progress. 37 | 38 | **Note:** MLFlow objects cannot be exported / imported with this tool. 39 | For more details, please look [here](https://github.com/amesar/mlflow-tools/) 40 | 41 | ## Workspace Analysis 42 | Import this [notebook](data/workspace_migration_analysis.py) to do an analysis of the number of objects within the 43 | current workspace. The last cell will print: 44 | 1. Number of users 45 | 2. Number of groups 46 | 3. Approximate number of notebooks 47 | 4. Number of internal jobs defined 48 | 5. Number of external jobs executed (from external API invocations) 49 | 6. Number of databases 50 | 7. Number of tables 51 | 52 | ## Order of Operations 53 | 1. Export users and groups 54 | 2. Export cluster templates 55 | 3. Export notebook metadata (listing of all notebooks) 56 | 4. Export notebook content 57 | 5. Export job templates 58 | 6. Export Hive Metastore data 59 | 60 | **Note:** During user / group import, users will be notified of the new workspace and account. This is required 61 | for them to set up their credentials to access the new workspace. We need the user to exist before loading their 62 | artifacts like notebooks, clusters, etc. 63 | 64 | By default, artifacts are stored in the `logs/` directory, and `azure_logs/` for Azure artifacts. 65 | This is configurable with the `--set-export-dir` flag to specify the log directory. 66 | 67 | While exporting Libraries is supported, we do not have an implementation to import library definitions. 68 | ## Table of Contents 69 | - [Users and Groups](#users-and-groups) 70 | - [Clusters](#Clusters) 71 | - [Notebooks](#Notebooks) 72 | - [Jobs](#Jobs) 73 | - [Export Help Text](#export-help-text) 74 | - [Import Help Text](#import-help-text) 75 | 76 | ### Users and Groups 77 | This section uses the [SCIM API](https://docs.databricks.com/dev-tools/api/latest/scim/index.html) to export / import 78 | user and groups. 79 | [Instance Profiles API](https://docs.databricks.com/dev-tools/api/latest/instance-profiles.html) used 80 | to export instance profiles that are tied to user/group entitlements. 81 | For AWS users, this section will log the instance profiles used for IAM access to resources. 
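Exported identities are written beneath the export directory (`logs/` by default, `azure_logs/` on Azure): users are logged one JSON record per line to `users.log`, and each group is written to a file under `groups/` named after its display name. A minimal way to inspect the output, assuming the default export directory:

```bash
head -n 1 logs/users.log   # one SCIM user record per line
ls logs/groups/            # one file per group, keyed by display name
```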
82 | 83 | To export users / groups, use the following: 84 | ```bash 85 | python export_db.py --profile DEMO --users 86 | ``` 87 | 88 | To import these users: 89 | ```bash 90 | python import_db.py --profile NEW_DEMO --users 91 | ``` 92 | 93 | If you plan to use this tool to export multiple workspaces, you can set the `--set-export-dir` directory to log 94 | artifacts into separate logging directories. 95 | 96 | 97 | ### Clusters 98 | The section uses the [Clusters APIs](https://docs.databricks.com/dev-tools/api/latest/clusters.html) 99 | 100 | ```bash 101 | python export_db.py --profile DEMO --clusters 102 | ``` 103 | This will export the following: 104 | 1. Cluster templates + ACLs 105 | 2. Instance pool definitions 106 | 3. Cluster policies + ACLs 107 | 108 | ```bash 109 | python import_db.py --profile NEW_DEMO --clusters 110 | ``` 111 | 112 | ### Notebooks 113 | This section uses the [Workspace API](https://docs.databricks.com/dev-tools/api/latest/workspace.html) 114 | 115 | This part is a 3 part process. 116 | 1. Download all notebook locations and paths 117 | 2. Download all notebook contents for every path 118 | 3. Download all workspace ACLs 119 | 120 | ```bash 121 | python export_db.py --profile DEMO --workspace 122 | python export_db.py --profile DEMO --download 123 | python export_db.py --profile DEMO --workspace-acls 124 | ``` 125 | 126 | To import into a new workspace: 127 | ```bash 128 | python import_db.py --profile NEW_DEMO --workspace [--archive-missing] 129 | python import_db.py --profile NEW_DEMO --workspace-acls 130 | ``` 131 | If users have left your organization, their artifacts (notebooks / job templates) still exists. However, their user 132 | object no longer exists. During the migration, we can keep the old users notebooks into the top level 133 | directory `/Archive/{username}@domain.com` 134 | Use the `--archive-missing` option to put these artifacts in the archive folder. 135 | 136 | **Single User Export/Import** 137 | The tool supports exporting single user workspaces using the following command: 138 | ```bash 139 | # export a single users workspace 140 | python export_db.py --profile DEMO --export-home example@foobar.com 141 | ``` 142 | 143 | The corollary is the `--import-home` option: 144 | ```bash 145 | python import_db.py --profile NEW_DEMO --import-home example@foobar.com 146 | ``` 147 | 148 | ### Jobs 149 | This section uses the [Jobs API](https://docs.databricks.com/dev-tools/api/latest/jobs.html) 150 | Job ACLs are exported and imported with this option. 151 | 152 | ```bash 153 | python export_db.py --profile DEMO --jobs 154 | ``` 155 | If we're unable to find old cluster ids that are no longer available, we'll reset the job template 156 | to use a new default cluster. 157 | 158 | ```bash 159 | python import_db.py --profile NEW_DEMO --jobs 160 | ``` 161 | 162 | Imported jobs into the new workspace are paused by default. We do not want to have 2 jobs run simultaneously. 
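If a jobs import needs to be redone from scratch, the import tool also exposes a `--delete-all-jobs` flag that deletes every job in the target workspace; treat it as destructive and double-check the profile (shown here as a placeholder) before running it:

```bash
python import_db.py --profile NEW_DEMO --delete-all-jobs
```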
163 | Admins must pause their jobs with Databricks defined schedules using the following option: 164 | ```bash 165 | python export_db.py --profile DEMO --pause-all-jobs 166 | ``` 167 | 168 | Un-pause all jobs in the new workspace: 169 | ```bash 170 | python import_db.py --profile NEW_DEMO --unpause-all-jobs 171 | ``` 172 | 173 | ### Hive Metastore 174 | This section uses an API to remotely run Spark commands on a cluster, this API is called 175 | [Execution Context](https://docs.databricks.com/dev-tools/api/1.2/index.html#execution-context) 176 | 177 | By default, this will launch an small cluster in the `data/` folder to export the Hive Metastore data. 178 | If you need a specific IAM role to export the metastore, use the `--cluster-name` option to connect to 179 | a specific cluster. 180 | 181 | By default, we will edit the cluster for every defined IAM role to loop through all failed exports in case the tool was 182 | missing IAM permissions. To disable looping through all failed exports, you can pass in `--skip-failed` 183 | 184 | ```bash 185 | # export all metastore entries and brute force loop through all instance profiles / IAM roles 186 | python export_db.py --profile DEMO --metastore 187 | 188 | # export all metastore entries on the default cluster without retries 189 | python export_db.py --profile DEMO --metastore --skip-failed 190 | 191 | # export all metastore entries on a specific cluster 192 | python export_db.py --profile DEMO --metastore --cluster-name "Test" 193 | 194 | # export all tables within a specific database 195 | python export_db.py --profile DEMO --metastore --cluster-name "Test" --database "my_db" 196 | ``` 197 | 198 | To find legacy Hive tables that need to be repaired after a successful import, run the following: 199 | ``` 200 | python import_db.py --profile DST --get-repair-log 201 | ``` 202 | Once completed, it will upload a log to the destination location. 203 | Use this [repair notebook](data/repair_tables_for_migration.py) to import into the destination environment to repair 204 | all tables. 205 | 206 | ### Export Groups by Name 207 | This functionality exports group(s), their members, and corresponding notebooks. 208 | This assumes an empty export directory to simplify the number of operations needed. 209 | This does **not** include IAM roles as those likely change while moving across workspaces. 210 | 211 | ```bash 212 | # reset the export directory and export a set of groups 213 | python export_db.py --reset-export && python export_db.py --profile SRC --export-groups 'groupA,groupB' 214 | 215 | # import the groups that were exported 216 | python import_db.py --profile DST --import-groups 217 | ``` 218 | 219 | ### Export / Import Top Level Notebooks 220 | This will export all notebooks that are not a part of the `/Users/` directories to help migrate notebooks that are 221 | outside of personal workspace directories. 
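For orientation, a sketch of which paths this mode covers — the paths below are made-up examples; only content under `/Users/` is excluded:

```bash
# /Shared/etl_pipeline          -> exported (top level, outside /Users/)
# /Team-Analytics/reporting     -> exported
# /Users/someone@corp.com/wip   -> skipped by --workspace-top-level-only
```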
222 | ```bash 223 | # reset the export directory and export the top level directories / notebooks 224 | python export_db.py --reset-export && python export_db.py --profile SRC --workspace-top-level-only 225 | # if ACLs are enabled, export the ACLs as well 226 | python export_db.py --profile SRC --workspace-acls 227 | 228 | # import the groups that were exported 229 | python import_db.py --profile DST --workspace-top-level 230 | # apply acls if needed 231 | python import_db.py --profile DST --workspace-acls 232 | ``` 233 | 234 | #### Export Help Text 235 | ``` 236 | $ python export_db.py --help 237 | usage: export_db.py [-h] [--users] [--workspace] 238 | [--notebook-format {DBC,SOURCE,HTML}] [--download] 239 | [--libs] [--clusters] [--jobs] [--metastore] [--secrets] 240 | [--metastore-unicode] [--cluster-name CLUSTER_NAME] 241 | [--database DATABASE] [--iam IAM] [--skip-failed] 242 | [--mounts] [--azure] [--profile PROFILE] 243 | [--single-user SINGLE_USER] [--export-home EXPORT_HOME] 244 | [--export-groups EXPORT_GROUPS] [--workspace-acls] 245 | [--workspace-top-level-only] [--silent] 246 | [--no-ssl-verification] [--debug] [--reset-exports] 247 | [--set-export-dir SET_EXPORT_DIR] [--pause-all-jobs] 248 | [--unpause-all-jobs] 249 | [--update-account-id UPDATE_ACCOUNT_ID] 250 | [--old-account-id OLD_ACCOUNT_ID] 251 | [--replace-old-email REPLACE_OLD_EMAIL] 252 | [--update-new-email UPDATE_NEW_EMAIL] 253 | [--bypass-windows-check] 254 | 255 | Export full workspace artifacts from Databricks 256 | 257 | optional arguments: 258 | -h, --help show this help message and exit 259 | --users Download all the users and groups in the workspace 260 | --workspace Log all the notebook paths in the workspace. (metadata 261 | only) 262 | --notebook-format {DBC,SOURCE,HTML} 263 | Choose the file format to download the notebooks 264 | (default: DBC) 265 | --download Download all notebooks for the environment 266 | --libs Log all the libs for the environment 267 | --clusters Log all the clusters for the environment 268 | --jobs Log all the job configs for the environment 269 | --metastore log all the metastore table definitions 270 | --metastore-unicode log all the metastore table definitions including 271 | unicode characters 272 | --cluster-name CLUSTER_NAME 273 | Cluster name to export the metastore to a specific 274 | cluster. Cluster will be started. 275 | --database DATABASE Database name to export for the metastore. Single 276 | database name supported 277 | --iam IAM IAM Instance Profile to export metastore entires 278 | --skip-failed Skip retries for any failed hive metastore exports. 279 | --mounts Log all mount points. 280 | --azure Run on Azure. (Default is AWS) 281 | --profile PROFILE Profile to parse the credentials 282 | --export-home EXPORT_HOME 283 | User workspace name to export, typically the users 284 | email address 285 | --export-groups EXPORT_GROUPS 286 | Group names to export as a set. Includes group, users, 287 | and notebooks. 288 | --workspace-acls Permissions for workspace objects to export 289 | --workspace-top-level-only 290 | Download only top level notebook directories 291 | --silent Silent all logging of export operations. 292 | --no-ssl-verification 293 | Set Verify=False when making http requests. 
294 | --debug Enable debug logging 295 | --reset-exports Clear export directory 296 | --set-export-dir SET_EXPORT_DIR 297 | Set the base directory to export artifacts 298 | --pause-all-jobs Pause all scheduled jobs 299 | --unpause-all-jobs Unpause all scheduled jobs 300 | --update-account-id UPDATE_ACCOUNT_ID 301 | Set the account id for instance profiles to a new 302 | account id 303 | --old-account-id OLD_ACCOUNT_ID 304 | Old account ID to filter on 305 | --replace-old-email REPLACE_OLD_EMAIL 306 | Old email address to update from logs 307 | --update-new-email UPDATE_NEW_EMAIL 308 | New email address to replace the logs 309 | ``` 310 | 311 | #### Import Help Text 312 | ``` 313 | $ python import_db.py --help 314 | usage: import_db.py [-h] [--users] [--workspace] [--workspace-top-level] 315 | [--workspace-acls] [--notebook-format {DBC,SOURCE,HTML}] 316 | [--import-home IMPORT_HOME] [--import-groups] 317 | [--archive-missing] [--libs] [--clusters] [--jobs] 318 | [--metastore] [--metastore-unicode] [--get-repair-log] 319 | [--cluster-name CLUSTER_NAME] [--skip-failed] [--azure] 320 | [--profile PROFILE] [--single-user SINGLE_USER] 321 | [--no-ssl-verification] [--silent] [--debug] 322 | [--set-export-dir SET_EXPORT_DIR] [--pause-all-jobs] 323 | [--unpause-all-jobs] [--delete-all-jobs] 324 | 325 | Import full workspace artifacts into Databricks 326 | 327 | optional arguments: 328 | -h, --help show this help message and exit 329 | --users Import all the users and groups from the logfile. 330 | --workspace Import all notebooks from export dir into the 331 | workspace. 332 | --workspace-top-level 333 | Import all top level notebooks from export dir into 334 | the workspace. Excluding Users dirs 335 | --notebook-format {DBC,SOURCE,HTML} 336 | Choose the file format of the notebook to import 337 | (default: DBC) 338 | --workspace-acls Permissions for workspace objects to import 339 | --import-home IMPORT_HOME 340 | User workspace name to import, typically the users 341 | email address 342 | --import-groups Groups to import into a new workspace. Includes group 343 | creation and user notebooks. 344 | --archive-missing Import all missing users into the top level /Archive/ 345 | directory. 346 | --libs Import all the libs from the logfile into the 347 | workspace. 348 | --clusters Import all the cluster configs for the environment 349 | --jobs Import all job configurations to the environment. 350 | --metastore Import the metastore to the workspace. 351 | --metastore-unicode Import all the metastore table definitions with 352 | unicode characters 353 | --get-repair-log Report on current tables requiring repairs 354 | --cluster-name CLUSTER_NAME 355 | Cluster name to import the metastore to a specific 356 | cluster. Cluster will be started. 357 | --skip-failed Skip missing users that do not exist when importing 358 | user notebooks 359 | --azure Run on Azure. (Default is AWS) 360 | --profile PROFILE Profile to parse the credentials 361 | --no-ssl-verification 362 | Set Verify=False when making http requests. 363 | --silent Silent all logging of import operations. 364 | --debug Enable debug logging 365 | --set-export-dir SET_EXPORT_DIR 366 | Set the base directory to import artifacts if the 367 | export dir was a customized 368 | --pause-all-jobs Pause all scheduled jobs 369 | --unpause-all-jobs Unpause all scheduled jobs 370 | --delete-all-jobs Delete all jobs 371 | ``` 372 | 373 | #### FAQs / Limitations 374 | **Note**: To disable ssl verification pass the flag `--no-ssl-verification`. 
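For example (the profile and export option here are placeholders for your own invocation):

```bash
python export_db.py --profile DEMO --workspace --no-ssl-verification
```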
375 | If still getting SSL Error add the following to your current bash shell - 376 | ``` 377 | export REQUESTS_CA_BUNDLE="" 378 | export CURL_CA_BUNDLE="" 379 | ``` 380 | 381 | Limitations: 382 | * Instance profiles (AWS only): Group access to instance profiles will take precedence. If a user is added to the role 383 | directly, and has access via a group, only the group access will be granted during a migration. 384 | * Clusters: Cluster creator will be seen as the single admin user who migrated all the clusters. (Relevant for billing 385 | purposes) 386 | * Cluster creator tags cannot be updated. Added a custom tag named `OriginalCreator` with the original cluster creator 387 | for DBU tracking. 388 | * Jobs: Job owners will be seen as the single admin user who migrate the job configurations. (Relevant for billing 389 | purposes) 390 | * Jobs with existing clusters that no longer exist will be reset to the default cluster type 391 | * Jobs with older legacy instances will fail with unsupported DBR or instance types. See release notes for the latest 392 | supported releases. 393 | 394 | -------------------------------------------------------------------------------- /dbclient/ScimClient.py: -------------------------------------------------------------------------------- 1 | from dbclient import * 2 | import os 3 | import json 4 | 5 | 6 | class ScimClient(dbclient): 7 | 8 | def get_active_users(self): 9 | users = self.get('/preview/scim/v2/Users').get('Resources', None) 10 | return users if users else None 11 | 12 | def log_all_users(self, log_file='users.log'): 13 | user_log = self.get_export_dir() + log_file 14 | users = self.get('/preview/scim/v2/Users').get('Resources', None) 15 | if users: 16 | with open(user_log, "w") as fp: 17 | for x in users: 18 | fullname = x.get('name', None) 19 | if fullname: 20 | given_name = fullname.get('givenName', None) 21 | # if user is an admin, skip this user entry 22 | if x['userName'] == 'admin' and given_name == 'Administrator': 23 | continue 24 | fp.write(json.dumps(x) + '\n') 25 | else: 26 | print("Users returned an empty object") 27 | 28 | def log_single_user(self, user_email, log_file='single_user.log'): 29 | single_user_log = self.get_export_dir() + log_file 30 | users = self.get_active_users() 31 | found_user = False 32 | for user in users: 33 | current_email = user['emails'][0]['value'] 34 | if user_email == current_email: 35 | found_user = True 36 | print(user) 37 | with open(single_user_log, 'w') as fp: 38 | fp.write(json.dumps(user) + '\n') 39 | if not found_user: 40 | print("User not found. Emails are case sensitive. 
Please verify email address") 41 | 42 | def import_single_user(self, user_email, log_file='single_user.log'): 43 | single_user_log = self.get_export_dir() + log_file 44 | resp = self.import_users(single_user_log) 45 | 46 | def get_users_from_log(self, users_log='users.log'): 47 | """ 48 | fetch a list of user names from the users log file 49 | meant to be used during group exports where the user list is a subset of users 50 | :param users_log: 51 | :return: a list of usernames that help identify their workspace paths 52 | """ 53 | user_logfile = self.get_export_dir() + users_log 54 | username_list = [] 55 | with open(user_logfile, 'r') as fp: 56 | for u in fp: 57 | user_json = json.loads(u) 58 | username_list.append(user_json.get('userName')) 59 | return username_list 60 | 61 | @staticmethod 62 | def is_member_a_user(member_json): 63 | if 'Users/' in member_json['$ref']: 64 | return True 65 | return False 66 | 67 | @staticmethod 68 | def is_member_a_group(member_json): 69 | if 'Groups/' in member_json['$ref']: 70 | return True 71 | return False 72 | 73 | @staticmethod 74 | def is_member_a_service_principal(member_json): 75 | if 'ServicePrincipals/' in member_json['$ref']: 76 | return True 77 | return False 78 | 79 | def add_username_to_group(self, group_json): 80 | # add the userName field to json since ids across environments may not match 81 | members = group_json.get('members', []) 82 | new_members = [] 83 | for m in members: 84 | m_id = m['value'] 85 | if self.is_member_a_user(m): 86 | user_resp = self.get('/preview/scim/v2/Users/{0}'.format(m_id)) 87 | m['userName'] = user_resp['userName'] 88 | m['type'] = 'user' 89 | elif self.is_member_a_group(m): 90 | m['type'] = 'group' 91 | elif self.is_member_a_service_principal(m): 92 | m['type'] = 'service-principal' 93 | else: 94 | m['type'] = 'unknown' 95 | new_members.append(m) 96 | group_json['members'] = new_members 97 | return group_json 98 | 99 | def log_all_groups(self, group_log_dir='groups/'): 100 | group_dir = self.get_export_dir() + group_log_dir 101 | os.makedirs(group_dir, exist_ok=True) 102 | group_list = self.get("/preview/scim/v2/Groups").get('Resources', []) 103 | for x in group_list: 104 | group_name = x['displayName'] 105 | with open(group_dir + group_name, "w") as fp: 106 | fp.write(json.dumps(self.add_username_to_group(x))) 107 | 108 | @staticmethod 109 | def build_group_dict(group_list): 110 | group_dict = {} 111 | for group in group_list: 112 | group_dict[group.get('displayName')] = group 113 | return group_dict 114 | 115 | def log_groups_from_list(self, group_name_list, group_log_dir='groups/', users_logfile='users.log'): 116 | """ 117 | take a list of groups and log all the members 118 | :param group_name_list: a list obj of group names 119 | :param group_log_dir: 120 | :param users_logfile: logfile to store the user log data 121 | :return: return a list of userNames to export their notebooks for the next api call 122 | """ 123 | group_dir = self.get_export_dir() + group_log_dir 124 | os.makedirs(group_dir, exist_ok=True) 125 | group_list = self.get("/preview/scim/v2/Groups").get('Resources', []) 126 | group_dict = self.build_group_dict(group_list) 127 | member_id_list = [] 128 | for group_name in group_name_list: 129 | group_details = group_dict[group_name] 130 | members_list = group_details.get('members', []) 131 | filtered_users = list(filter(lambda y: 'Users' in y.get('$ref', None), members_list)) 132 | filtered_sub_groups = list(filter(lambda y: 'Groups' in y.get('$ref', None), members_list)) 133 | if 
filtered_sub_groups: 134 | sub_group_names = list(map(lambda z: z.get('display'), filtered_sub_groups)) 135 | group_name_list.extend(sub_group_names) 136 | member_id_list.extend(list(map(lambda y: y['value'], filtered_users))) 137 | with open(group_dir + group_name, "w") as fp: 138 | group_details.pop('roles', None) # removing the roles field from the groups arg 139 | fp.write(json.dumps(self.add_username_to_group(group_details))) 140 | users_log = self.get_export_dir() + users_logfile 141 | user_names_list = [] 142 | with open(users_log, 'w') as u_fp: 143 | for mid in member_id_list: 144 | print('Exporting', mid) 145 | api = f'/preview/scim/v2/Users/{mid}' 146 | user_resp = self.get(api) 147 | user_resp.pop('roles', None) # remove roles since those can change during the migration 148 | user_resp.pop('http_status_code', None) # remove unnecessary params 149 | user_names_list.append(user_resp.get('userName')) 150 | u_fp.write(json.dumps(user_resp) + '\n') 151 | return user_names_list 152 | 153 | def get_user_id_mapping(self): 154 | # return a dict of the userName to id mapping of the new env 155 | user_list = self.get('/preview/scim/v2/Users').get('Resources', None) 156 | if user_list: 157 | user_id_dict = {} 158 | for user in user_list: 159 | user_id_dict[user['userName']] = user['id'] 160 | return user_id_dict 161 | return None 162 | 163 | @staticmethod 164 | def assign_roles_args(roles_list): 165 | # roles list passed from file, which is in proper patch arg format already 166 | # this method is used to patch the group IAM roles 167 | assign_args = {"schemas": ["urn:ietf:params:scim:api:messages:2.0:PatchOp"], 168 | "Operations": [{"op": "add", 169 | "path": "roles", 170 | "value": roles_list}]} 171 | return assign_args 172 | 173 | @staticmethod 174 | def assign_entitlements_args(entitlements_list): 175 | # roles list passed from file, which is in proper patch arg format already 176 | # this method is used to patch the group IAM roles 177 | assign_args = {"schemas": ["urn:ietf:params:scim:api:messages:2.0:PatchOp"], 178 | "Operations": [{"op": "add", 179 | "path": "entitlements", 180 | "value": entitlements_list}]} 181 | return assign_args 182 | 183 | def assign_group_entitlements(self, group_dir): 184 | # assign group role ACLs, which are only available via SCIM apis 185 | group_ids = self.get_current_group_ids() 186 | if not os.path.exists(group_dir): 187 | print("No groups defined. Skipping group entitlement assignment") 188 | return 189 | groups = os.listdir(group_dir) 190 | for group_name in groups: 191 | with open(group_dir + group_name, 'r') as fp: 192 | group_data = json.loads(fp.read()) 193 | entitlements = group_data.get('entitlements', None) 194 | if entitlements: 195 | g_id = group_ids[group_name] 196 | update_entitlements = self.assign_entitlements_args(entitlements) 197 | up_resp = self.patch(f'/preview/scim/v2/Groups/{g_id}', update_entitlements) 198 | print(up_resp) 199 | 200 | def assign_group_roles(self, group_dir): 201 | # assign group role ACLs, which are only available via SCIM apis 202 | group_ids = self.get_current_group_ids() 203 | if not os.path.exists(group_dir): 204 | print("No groups defined. 
Skipping group entitlement assignment") 205 | return 206 | groups = os.listdir(group_dir) 207 | for group_name in groups: 208 | with open(group_dir + group_name, 'r') as fp: 209 | group_data = json.loads(fp.read()) 210 | roles = group_data.get('roles', None) 211 | if roles: 212 | g_id = group_ids[group_name] 213 | update_roles = self.assign_roles_args(roles) 214 | up_resp = self.patch(f'/preview/scim/v2/Groups/{g_id}', update_roles) 215 | print(up_resp) 216 | entitlements = group_data.get('entitlements', None) 217 | if entitlements: 218 | g_id = group_ids[group_name] 219 | update_entitlements = self.assign_entitlements_args(entitlements) 220 | up_resp = self.patch(f'/preview/scim/v2/Groups/{g_id}', update_entitlements) 221 | print(up_resp) 222 | 223 | def get_current_user_ids(self): 224 | # return a dict of user email to user id mappings 225 | users = self.get('/preview/scim/v2/Users')['Resources'] 226 | user_id = {} 227 | for user in users: 228 | user_id[user['emails'][0]['value']] = user['id'] 229 | return user_id 230 | 231 | def get_old_user_emails(self, users_logfile='users.log'): 232 | # return a dictionary of { old_id : email } from the users log 233 | users_log = self.get_export_dir() + users_logfile 234 | email_dict = {} 235 | with open(users_log, 'r') as fp: 236 | for x in fp: 237 | user = json.loads(x) 238 | email_dict[user['id']] = user['emails'][0]['value'] 239 | return email_dict 240 | 241 | def get_current_group_ids(self): 242 | # return a dict of group displayName and id mappings 243 | groups = self.get('/preview/scim/v2/Groups').get('Resources', None) 244 | group_ids = {} 245 | for group in groups: 246 | group_ids[group['displayName']] = group['id'] 247 | return group_ids 248 | 249 | @staticmethod 250 | def add_roles_arg(roles_list): 251 | # this builds the args from a list of IAM roles. diff built from user logfile 252 | role_values = [{'value': x} for x in roles_list] 253 | patch_roles_arg = { 254 | "schemas": ["urn:ietf:params:scim:api:messages:2.0:PatchOp"], 255 | "Operations": [ 256 | { 257 | "op": "add", 258 | "path": "roles", 259 | "value": role_values 260 | } 261 | ] 262 | } 263 | return patch_roles_arg 264 | 265 | def assign_user_entitlements(self, user_log_file='users.log'): 266 | """ 267 | assign user entitlements to allow cluster create, job create, sql analytics etc 268 | :param user_log_file: 269 | :return: 270 | """ 271 | user_log = self.get_export_dir() + user_log_file 272 | if not os.path.exists(user_log): 273 | print("Skipping user entitlement assignment. Logfile does not exist") 274 | return 275 | user_ids = self.get_user_id_mapping() 276 | with open(user_log, 'r') as fp: 277 | # loop through each user in the file 278 | for line in fp: 279 | user = json.loads(line) 280 | # add the users entitlements 281 | user_entitlements = user.get('entitlements', None) 282 | # get the current registered user id 283 | user_id = user_ids[user['userName']] 284 | if user_entitlements: 285 | entitlements_args = self.assign_entitlements_args(user_entitlements) 286 | update_resp = self.patch(f'/preview/scim/v2/Users/{user_id}', entitlements_args) 287 | 288 | def assign_user_roles(self, user_log_file='users.log'): 289 | """ 290 | assign user roles that are missing after adding group assignment 291 | Note: There is a limitation in the exposed API. If a user is assigned a role permission & the permission 292 | is granted via a group, we can't distinguish the difference. Only group assignment will be migrated. 
293 | :param user_log_file: logfile of all user properties 294 | :return: 295 | """ 296 | user_log = self.get_export_dir() + user_log_file 297 | if not os.path.exists(user_log): 298 | print("Skipping user entitlement assignment. Logfile does not exist") 299 | return 300 | # keys to filter from the user log to get the user / role mapping 301 | old_role_keys = ('userName', 'roles') 302 | cur_role_keys = ('schemas', 'userName', 'entitlements', 'roles', 'groups') 303 | # get current user id of the new environment, k,v = email, id 304 | user_ids = self.get_user_id_mapping() 305 | with open(user_log, 'r') as fp: 306 | # loop through each user in the file 307 | for line in fp: 308 | user = json.loads(line) 309 | user_roles = {k: user[k] for k in old_role_keys if k in user} 310 | # get the current registered user id 311 | user_id = user_ids[user['userName']] 312 | # get the current users settings 313 | cur_user = self.get('/preview/scim/v2/Users/{0}'.format(user_id)) 314 | # get the current users IAM roles 315 | current_roles = cur_user.get('roles', None) 316 | if current_roles: 317 | cur_role_values = set([x['value'] for x in current_roles]) 318 | else: 319 | cur_role_values = set() 320 | # get the users saved IAM roles from the export 321 | saved_roles = user_roles.get('roles', None) 322 | if saved_roles: 323 | saved_role_values = set([y['value'] for y in saved_roles]) 324 | else: 325 | saved_role_values = set() 326 | roles_needed = list(saved_role_values - cur_role_values) 327 | if roles_needed: 328 | # get the json to add the roles to the user profile 329 | patch_roles = self.add_roles_arg(roles_needed) 330 | update_resp = self.patch(f'/preview/scim/v2/Users/{user_id}', patch_roles) 331 | 332 | @staticmethod 333 | def get_member_args(member_id_list): 334 | """ 335 | helper function to form the json args to the patch request to update group memberships 336 | :param member_id_list: member ids to add to a specific group 337 | :return: dict args for the patch operation 338 | """ 339 | member_id_list_json = [] 340 | for m_id in member_id_list: 341 | member_id_list_json.append({'value': '{0}'.format(m_id)}) 342 | 343 | add_members_args = { 344 | "schemas": ["urn:ietf:params:scim:api:messages:2.0:PatchOp"], 345 | "Operations": [{ 346 | "op": "add", 347 | "value": {"members": member_id_list_json} 348 | } 349 | ] 350 | } 351 | return add_members_args 352 | 353 | @staticmethod 354 | def is_user(member_json): 355 | # currently a workaround to get whether the member is a user or group 356 | # check the ref instead of the type field 357 | # once fixed, the type should be `user` or `group` in lowercase 358 | if 'Users/' in member_json['$ref']: 359 | return True 360 | return False 361 | 362 | @staticmethod 363 | def is_group(member_json): 364 | # currently a workaround to get whether the member is a user or group 365 | # check the ref instead of the type field 366 | # once fixed, the type should be `user` or `group` in lowercase 367 | if 'Groups/' in member_json['$ref']: 368 | return True 369 | return False 370 | 371 | def import_groups(self, group_dir): 372 | # list all the groups and create groups first 373 | if not os.path.exists(group_dir): 374 | print("No groups to import.") 375 | return 376 | groups = os.listdir(group_dir) 377 | create_args = { 378 | "schemas": ["urn:ietf:params:scim:schemas:core:2.0:Group"], 379 | "displayName": "default" 380 | } 381 | for x in groups: 382 | print('Creating group: {0}'.format(x)) 383 | # set the create args displayName property aka group name 384 | 
create_args['displayName'] = x 385 | group_resp = self.post('/preview/scim/v2/Groups', create_args) 386 | 387 | # dict of { group_name : group_id } 388 | current_group_ids = self.get_current_group_ids() 389 | # dict of { email : current_user_id } 390 | current_user_ids = self.get_current_user_ids() 391 | # dict of { old_user_id : email } 392 | old_user_emails = self.get_old_user_emails() 393 | for group_name in groups: 394 | with open(group_dir + group_name, 'r') as fp: 395 | members = json.loads(fp.read()).get('members', None) 396 | if members: 397 | # grab a list of ids to add either groups or users to this current group 398 | member_id_list = [] 399 | for m in members: 400 | if self.is_user(m): 401 | old_email = old_user_emails[m['value']] 402 | this_user_id = current_user_ids.get(old_email, '') 403 | if not this_user_id: 404 | raise ValueError(f'Unable to find user {old_email} in the new workspace. ' 405 | f'This users email case has changed and needs to be updated with ' 406 | f'the --replace-old-email and --update-new-email options') 407 | member_id_list.append(this_user_id) 408 | elif self.is_group(m): 409 | this_group_id = current_group_ids.get(m['display']) 410 | member_id_list.append(this_group_id) 411 | else: 412 | print("Skipping service principal members and other identities not within users/groups") 413 | add_members_json = self.get_member_args(member_id_list) 414 | group_id = current_group_ids[group_name] 415 | add_resp = self.patch('/preview/scim/v2/Groups/{0}'.format(group_id), add_members_json) 416 | 417 | def import_users(self, user_log): 418 | # first create the user identities with the required fields 419 | create_keys = ('emails', 'entitlements', 'displayName', 'name', 'userName') 420 | if not os.path.exists(user_log): 421 | print("No users to import.") 422 | return 423 | with open(user_log, 'r') as fp: 424 | for x in fp: 425 | user = json.loads(x) 426 | print("Creating user: {0}".format(user['userName'])) 427 | user_create = {k: user[k] for k in create_keys if k in user} 428 | create_resp = self.post('/preview/scim/v2/Users', user_create) 429 | 430 | def import_all_users_and_groups(self, user_log_file='users.log', group_log_dir='groups/'): 431 | user_log = self.get_export_dir() + user_log_file 432 | group_dir = self.get_export_dir() + group_log_dir 433 | 434 | self.import_users(user_log) 435 | self.import_groups(group_dir) 436 | # assign the users to IAM roles if on AWS 437 | if self.is_aws(): 438 | print("Update group role assignments") 439 | self.assign_group_roles(group_dir) 440 | print("Update user role assignments") 441 | self.assign_user_roles(user_log_file) 442 | print("Done") 443 | # need to separate role assignment and entitlements to support Azure 444 | print("Updating groups entitlements") 445 | self.assign_group_entitlements(group_dir) 446 | print("Updating users entitlements") 447 | self.assign_user_entitlements(user_log_file) 448 | -------------------------------------------------------------------------------- /dbclient/ClustersClient.py: -------------------------------------------------------------------------------- 1 | import os, re, time 2 | 3 | from dbclient import * 4 | 5 | 6 | class ClustersClient(dbclient): 7 | create_configs = {'num_workers', 8 | 'autoscale', 9 | 'cluster_name', 10 | 'spark_version', 11 | 'spark_conf', 12 | 'aws_attributes', 13 | 'node_type_id', 14 | 'driver_node_type_id', 15 | 'ssh_public_keys', 16 | 'custom_tags', 17 | 'cluster_log_conf', 18 | 'init_scripts', 19 | 'docker_image', 20 | 'spark_env_vars', 21 | 
'autotermination_minutes',
22 |                       'enable_elastic_disk',
23 |                       'instance_pool_id',
24 |                       'policy_id',
25 |                       'pinned_by_user_name',
26 |                       'creator_user_name',
27 |                       'cluster_id'}
28 | 
29 |     def cleanup_cluster_pool_configs(self, cluster_json, cluster_creator, is_job_cluster=False):
30 |         """
31 |         Pass in cluster json and cluster_creator to update fields that are not needed for clusters submitted to pools
32 |         :param cluster_json: cluster configuration read from the export logs
33 |         :param cluster_creator: original creator of the cluster, used for the OriginalCreator tag
34 |         :param is_job_cluster: flag to skip adding the OriginalCreator tag for job clusters, since they do not
35 |         behave like interactive clusters
36 |         :return: cluster json updated for submission against an instance pool
37 |         """
38 |         pool_id_dict = self.get_instance_pool_id_mapping()
39 |         # if pool id exists, remove instance types
40 |         cluster_json.pop('node_type_id', None)
41 |         cluster_json.pop('driver_node_type_id', None)
42 |         cluster_json.pop('enable_elastic_disk', None)
43 |         if not is_job_cluster:
44 |             # add custom tag for original cluster creator for cost tracking
45 |             if 'custom_tags' in cluster_json:
46 |                 tags = cluster_json['custom_tags']
47 |                 tags['OriginalCreator'] = cluster_creator
48 |                 cluster_json['custom_tags'] = tags
49 |             else:
50 |                 cluster_json['custom_tags'] = {'OriginalCreator': cluster_creator}
51 |         # remove all aws_attr except for the IAM role if it exists
52 |         if 'aws_attributes' in cluster_json:
53 |             aws_conf = cluster_json.pop('aws_attributes')
54 |             iam_role = aws_conf.get('instance_profile_arn', None)
55 |             if iam_role:
56 |                 cluster_json['aws_attributes'] = {'instance_profile_arn': iam_role}
57 |         # map old pool ids to new pool ids
58 |         old_pool_id = cluster_json['instance_pool_id']
59 |         cluster_json['instance_pool_id'] = pool_id_dict[old_pool_id]
60 |         return cluster_json
61 | 
62 |     def delete_all_clusters(self):
63 |         cl = self.get_cluster_list(False)
64 |         for x in cl:
65 |             self.post('/clusters/unpin', {'cluster_id': x['cluster_id']})
66 |             self.post('/clusters/permanent-delete', {'cluster_id': x['cluster_id']})
67 | 
68 |     def edit_cluster(self, cid, iam_role):
69 |         """Edits the existing metastore cluster
70 |         Returns cluster_id"""
71 |         version = self.get_latest_spark_version()
72 |         import os
73 |         real_path = os.path.dirname(os.path.realpath(__file__))
74 |         if self.is_aws():
75 |             print("Updating cluster with: " + iam_role)
76 |             current_cluster_json = self.get(f'/clusters/get?cluster_id={cid}')
77 |             run_properties = set(list(current_cluster_json.keys())) - self.create_configs
78 |             for p in run_properties:
79 |                 del current_cluster_json[p]
80 |             if 'aws_attributes' in current_cluster_json:
81 |                 aws_conf = current_cluster_json.pop('aws_attributes')
82 |                 aws_conf['instance_profile_arn'] = iam_role
83 |             else:
84 |                 aws_conf = {'instance_profile_arn': iam_role}
85 |             current_cluster_json['aws_attributes'] = aws_conf
86 |             resp = self.post('/clusters/edit', current_cluster_json)
87 |             print(resp)
88 |             new_cid = self.wait_for_cluster(cid)
89 |             return new_cid
90 |         else:
91 |             return False
92 | 
93 |     def get_cluster_acls(self, cluster_id, cluster_name):
94 |         """
95 |         Export all cluster permissions for a specific cluster id
96 |         :return: permissions json with the cluster_name added
97 |         """
98 |         perms = self.get(f'/preview/permissions/clusters/{cluster_id}/')
99 |         perms['cluster_name'] = cluster_name
100 |         return perms
101 | 
102 |     def get_cluster_id_by_name(self, cname, running_only=False):
103 |         cluster_list = self.get('/clusters/list').get('clusters', [])
104 |         if running_only:
105 |             running = list(filter(lambda x: x['state'] == "RUNNING", cluster_list))
106 |             for x in running:
107 |                 if cname == x['cluster_name']:
108 |                     return x['cluster_id']
109 | 
else: 110 | for x in cluster_list: 111 | if cname == x['cluster_name']: 112 | return x['cluster_id'] 113 | return None 114 | 115 | def get_cluster_list(self, alive=True): 116 | """ 117 | Returns an array of json objects for the running clusters. 118 | Grab the cluster_name or cluster_id 119 | """ 120 | clusters_list = self.get("/clusters/list", print_json=False).get('clusters', []) 121 | if alive and clusters_list: 122 | running = filter(lambda x: x['state'] == "RUNNING", clusters_list) 123 | return list(running) 124 | else: 125 | return clusters_list 126 | 127 | def get_execution_context(self, cid): 128 | print("Creating remote Spark Session") 129 | time.sleep(5) 130 | ec_payload = {"language": "python", 131 | "clusterId": cid} 132 | ec = self.post('/contexts/create', json_params=ec_payload, version="1.2") 133 | # Grab the execution context ID 134 | ec_id = ec.get('id', None) 135 | if not ec_id: 136 | print('Unable to establish remote session') 137 | print(ec) 138 | raise Exception("Remote session error") 139 | return ec_id 140 | 141 | def get_global_init_scripts(self): 142 | """ return a list of global init scripts. Currently not logged """ 143 | ls = self.get('/dbfs/list', {'path': '/databricks/init/'}).get('files', None) 144 | if ls is None: 145 | return [] 146 | else: 147 | global_scripts = [{'path': x['path']} for x in ls if x['is_dir'] == False] 148 | return global_scripts 149 | 150 | def get_instance_pool_id_mapping(self, log_file='instance_pools.log'): 151 | pool_log = self.get_export_dir() + log_file 152 | current_pools = self.get('/instance-pools/list').get('instance_pools', None) 153 | if not current_pools: 154 | return None 155 | new_pools = {} 156 | # build dict of pool name and id mapping 157 | for p in current_pools: 158 | new_pools[p['instance_pool_name']] = p['instance_pool_id'] 159 | # mapping id from old_pool_id to new_pool_id 160 | pool_mapping_dict = {} 161 | with open(pool_log, 'r') as fp: 162 | for line in fp: 163 | pool_conf = json.loads(line) 164 | old_pool_id = pool_conf['instance_pool_id'] 165 | pool_name = pool_conf['instance_pool_name'] 166 | new_pool_id = new_pools[pool_name] 167 | pool_mapping_dict[old_pool_id] = new_pool_id 168 | return pool_mapping_dict 169 | 170 | def get_policy_id_by_name_dict(self): 171 | name_id_dict = {} 172 | resp = self.get('/policies/clusters/list').get('policies', []) 173 | for policy in resp: 174 | name_id_dict[policy['name']] = policy['policy_id'] 175 | return name_id_dict 176 | 177 | def get_spark_versions(self): 178 | return self.get("/clusters/spark-versions", print_json=True) 179 | 180 | def get_instance_profiles_list(self): 181 | if self.is_aws(): 182 | ip_json_list = self.get('/instance-profiles/list').get('instance_profiles', []) 183 | iam_roles_list = list(map(lambda x: x.get('instance_profile_arn'), ip_json_list)) 184 | return iam_roles_list 185 | return [] 186 | 187 | def get_iam_role_by_cid(self, cid): 188 | if self.is_aws(): 189 | cluster_resp = self.get(f'/clusters/get?cluster_id={cid}') 190 | return cluster_resp.get('aws_attributes').get('instance_profile_arn', None) 191 | return None 192 | 193 | def get_new_policy_id_dict(self, policy_file='cluster_policies.log'): 194 | """ 195 | mapping function to get the new policy ids. 
ids change when migrating to a new workspace 196 | read the log file and map the old id to the new id 197 | :param old_policy_id: str of the old id 198 | :return: str of new policy id 199 | """ 200 | policy_log = self.get_export_dir() + policy_file 201 | current_policies = self.get('/policies/clusters/list').get('policies', []) 202 | current_policies_dict = {} # name : current policy id 203 | for policy in current_policies: 204 | current_name = policy['name'] 205 | current_id = policy['policy_id'] 206 | current_policies_dict[current_name] = current_id 207 | policy_id_dict = {} 208 | with open(policy_log, 'r') as fp: 209 | for line in fp: 210 | policy_conf = json.loads(line) 211 | policy_name = policy_conf['name'] 212 | old_policy_id = policy_conf['policy_id'] 213 | policy_id_dict[old_policy_id] = current_policies_dict[policy_name] # old_id : new_id 214 | return policy_id_dict 215 | 216 | def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None): 217 | """ 218 | Import cluster configs and update appropriate properties / tags in the new env 219 | :param log_file: 220 | :return: 221 | """ 222 | cluster_log = self.get_export_dir() + log_file 223 | acl_cluster_log = self.get_export_dir() + acl_log_file 224 | if not os.path.exists(cluster_log): 225 | print("No clusters to import.") 226 | return 227 | current_cluster_names = set([x.get('cluster_name', None) for x in self.get_cluster_list(False)]) 228 | old_2_new_policy_ids = self.get_new_policy_id_dict() # dict of {old_id : new_id} 229 | # get instance pool id mappings 230 | with open(cluster_log, 'r') as fp: 231 | for line in fp: 232 | cluster_conf = json.loads(line) 233 | cluster_name = cluster_conf['cluster_name'] 234 | if cluster_name in current_cluster_names: 235 | print("Cluster already exists, skipping: {0}".format(cluster_name)) 236 | continue 237 | cluster_creator = cluster_conf.pop('creator_user_name') 238 | if 'policy_id' in cluster_conf: 239 | old_policy_id = cluster_conf['policy_id'] 240 | cluster_conf['policy_id'] = old_2_new_policy_ids[old_policy_id] 241 | # check for instance pools and modify cluster attributes 242 | if 'instance_pool_id' in cluster_conf: 243 | new_cluster_conf = self.cleanup_cluster_pool_configs(cluster_conf, cluster_creator) 244 | else: 245 | # update cluster configs for non-pool clusters 246 | # add original creator tag to help with DBU tracking 247 | if 'custom_tags' in cluster_conf: 248 | tags = cluster_conf['custom_tags'] 249 | tags['OriginalCreator'] = cluster_creator 250 | cluster_conf['custom_tags'] = tags 251 | else: 252 | cluster_conf['custom_tags'] = {'OriginalCreator': cluster_creator} 253 | new_cluster_conf = cluster_conf 254 | print("Creating cluster: {0}".format(new_cluster_conf['cluster_name'])) 255 | cluster_resp = self.post('/clusters/create', new_cluster_conf) 256 | if cluster_resp['http_status_code'] == 200: 257 | stop_resp = self.post('/clusters/delete', {'cluster_id': cluster_resp['cluster_id']}) 258 | if 'pinned_by_user_name' in cluster_conf: 259 | pin_resp = self.post('/clusters/pin', {'cluster_id': cluster_resp['cluster_id']}) 260 | else: 261 | print(cluster_resp) 262 | # add cluster ACLs 263 | # loop through and reapply cluster ACLs 264 | with open(acl_cluster_log, 'r') as acl_fp: 265 | for x in acl_fp: 266 | data = json.loads(x) 267 | cluster_name = data['cluster_name'] 268 | print(f'Applying acl for {cluster_name}') 269 | acl_args = {'access_control_list' : self.build_acl_args(data['access_control_list'])} 270 | cid = 
self.get_cluster_id_by_name(cluster_name) 271 | if cid is None: 272 | raise ValueError('Cluster id must exist in new env. Re-import cluster configs.') 273 | api = f'/preview/permissions/clusters/{cid}' 274 | resp = self.put(api, acl_args) 275 | print(resp) 276 | 277 | def import_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='acl_cluster_policies.log'): 278 | policies_log = self.get_export_dir() + log_file 279 | acl_policies_log = self.get_export_dir() + acl_log_file 280 | # create the policies 281 | if os.path.exists(policies_log): 282 | with open(policies_log, 'r') as policy_fp: 283 | for p in policy_fp: 284 | policy_conf = json.loads(p) 285 | # when creating the policy, we only need `name` and `definition` fields 286 | create_args = {'name': policy_conf['name'], 287 | 'definition': policy_conf['definition']} 288 | resp = self.post('/policies/clusters/create', create_args) 289 | # ACLs are created by using the `access_control_list` key 290 | with open(acl_policies_log, 'r') as acl_fp: 291 | id_map = self.get_policy_id_by_name_dict() 292 | for x in acl_fp: 293 | p_acl = json.loads(x) 294 | acl_create_args = {'access_control_list': self.build_acl_args(p_acl['access_control_list'])} 295 | policy_id = id_map[p_acl['name']] 296 | api = f'/permissions/cluster-policies/{policy_id}' 297 | resp = self.put(api, acl_create_args) 298 | print(resp) 299 | else: 300 | print('Skipping cluster policies as no log file exists') 301 | 302 | def import_instance_pools(self, log_file='instance_pools.log'): 303 | pool_log = self.get_export_dir() + log_file 304 | if not os.path.exists(pool_log): 305 | print("No instance pools to import.") 306 | return 307 | with open(pool_log, 'r') as fp: 308 | for line in fp: 309 | pool_conf = json.loads(line) 310 | pool_resp = self.post('/instance-pools/create', pool_conf) 311 | 312 | def import_instance_profiles(self, log_file='instance_profiles.log'): 313 | # currently an AWS only operation 314 | ip_log = self.get_export_dir() + log_file 315 | if not os.path.exists(ip_log): 316 | print("No instance profiles to import.") 317 | return 318 | # check current profiles and skip if the profile already exists 319 | ip_list = self.get('/instance-profiles/list').get('instance_profiles', None) 320 | if ip_list: 321 | list_of_profiles = [x['instance_profile_arn'] for x in ip_list] 322 | else: 323 | list_of_profiles = [] 324 | list_of_profiles = [] 325 | import_profiles_count = 0 326 | with open(ip_log, "r") as fp: 327 | for line in fp: 328 | ip_arn = json.loads(line).get('instance_profile_arn', None) 329 | if ip_arn not in list_of_profiles: 330 | print("Importing arn: {0}".format(ip_arn)) 331 | resp = self.post('/instance-profiles/add', {'instance_profile_arn': ip_arn}) 332 | if 'error_code' in resp: 333 | print("Error") 334 | else: 335 | import_profiles_count += 1 336 | print(resp) 337 | else: 338 | print("Skipping since profile exists: {0}".format(ip_arn)) 339 | return import_profiles_count 340 | 341 | def is_spark_3(self, cid): 342 | spark_version = self.get(f'/clusters/get?cluster_id={cid}').get('spark_version', "") 343 | if spark_version[0] >= '7': 344 | return True 345 | else: 346 | return False 347 | 348 | def launch_cluster(self, iam_role=None): 349 | """ Launches a cluster to get DDL statements. 
350 | Returns a cluster_id """ 351 | # removed for now as Spark 3.0 will have backwards incompatible changes 352 | # version = self.get_latest_spark_version() 353 | import os 354 | real_path = os.path.dirname(os.path.realpath(__file__)) 355 | if self.is_aws(): 356 | with open(real_path + '/../data/aws_cluster.json', 'r') as fp: 357 | cluster_json = json.loads(fp.read()) 358 | if iam_role: 359 | aws_attr = cluster_json['aws_attributes'] 360 | print("Creating cluster with: " + iam_role) 361 | aws_attr['instance_profile_arn'] = iam_role 362 | cluster_json['aws_attributes'] = aws_attr 363 | else: 364 | with open(real_path + '/../data/azure_cluster.json', 'r') as fp: 365 | cluster_json = json.loads(fp.read()) 366 | # set the latest spark release regardless of defined cluster json 367 | # cluster_json['spark_version'] = version['key'] 368 | cluster_name = cluster_json['cluster_name'] 369 | existing_cid = self.get_cluster_id_by_name(cluster_name) 370 | if existing_cid: 371 | # if the cluster id exists, then a cluster exists in a terminated state. let's start it 372 | cid = self.start_cluster_by_name(cluster_name) 373 | return cid 374 | else: 375 | print("Starting cluster with name: {0} ".format(cluster_name)) 376 | c_info = self.post('/clusters/create', cluster_json) 377 | if c_info['http_status_code'] != 200: 378 | raise Exception("Could not launch cluster. Verify that the --azure flag or cluster config is correct.") 379 | self.wait_for_cluster(c_info['cluster_id']) 380 | return c_info['cluster_id'] 381 | 382 | def log_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None): 383 | """ 384 | Log the current cluster configs in json file 385 | :param log_file: log the cluster configs 386 | :param acl_log_file: log the ACL definitions 387 | :param filter_user: user name to filter and log the cluster config 388 | :return: 389 | """ 390 | cluster_log = self.get_export_dir() + log_file 391 | acl_cluster_log = self.get_export_dir() + acl_log_file 392 | # pinned by cluster_user is a flag per cluster 393 | cl_raw = self.get_cluster_list(False) 394 | cluster_list = self.remove_automated_clusters(cl_raw) 395 | ip_list = self.get('/instance-profiles/list').get('instance_profiles', []) 396 | nonempty_ip_list = [] 397 | if ip_list: 398 | # filter none if we hit a profile w/ a none object 399 | # generate list of registered instance profiles to check cluster configs against 400 | nonempty_ip_list = list(filter(None, [x.get('instance_profile_arn', None) for x in ip_list])) 401 | 402 | # filter on these items as MVP of the cluster configs 403 | # https://docs.databricks.com/api/latest/clusters.html#request-structure 404 | with open(cluster_log, 'w') as log_fp, open(acl_cluster_log, 'w') as acl_log_fp: 405 | for cluster_json in cluster_list: 406 | run_properties = set(list(cluster_json.keys())) - self.create_configs 407 | for p in run_properties: 408 | del cluster_json[p] 409 | if 'aws_attributes' in cluster_json: 410 | aws_conf = cluster_json.pop('aws_attributes') 411 | iam_role = aws_conf.get('instance_profile_arn', None) 412 | if iam_role and ip_list: 413 | if iam_role not in nonempty_ip_list: 414 | print("Skipping log of default IAM role: " + iam_role) 415 | del aws_conf['instance_profile_arn'] 416 | cluster_json['aws_attributes'] = aws_conf 417 | cluster_json['aws_attributes'] = aws_conf 418 | cluster_perms = self.get_cluster_acls(cluster_json['cluster_id'], cluster_json['cluster_name']) 419 | acl_log_fp.write(json.dumps(cluster_perms) + '\n') 420 | if filter_user: 
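                # Note: the filter_user branch below writes only clusters created by that user to
                # clusters.log, while the ACL entries above are still logged for every cluster kept
                # after remove_automated_clusters(). Illustrative call only, assuming a client built
                # from the same config used elsewhere in these scripts (hypothetical email value):
                #   ClustersClient(client_config).log_cluster_configs(filter_user='data.engineer@example.com')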
421 | if cluster_json['creator_user_name'] == filter_user: 422 | log_fp.write(json.dumps(cluster_json) + '\n') 423 | else: 424 | log_fp.write(json.dumps(cluster_json) + '\n') 425 | 426 | def log_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='acl_cluster_policies.log'): 427 | policies_log = self.get_export_dir() + log_file 428 | acl_policies_log = self.get_export_dir() + acl_log_file 429 | # log all cluster policy definitions 430 | policy_ids = {} 431 | policies_list = self.get('/policies/clusters/list').get('policies', []) 432 | with open(policies_log, 'w') as fp: 433 | for x in policies_list: 434 | policy_ids[x.get('policy_id')] = x.get('name') 435 | fp.write(json.dumps(x) + '\n') 436 | # log cluster policy ACLs, which takes a policy id as arguments 437 | with open(acl_policies_log, 'w') as acl_fp: 438 | for pid in policy_ids: 439 | api = f'/preview/permissions/cluster-policies/{pid}' 440 | perms = self.get(api) 441 | perms['name'] = policy_ids[pid] 442 | acl_fp.write(json.dumps(perms) + '\n') 443 | 444 | def log_instance_pools(self, log_file='instance_pools.log'): 445 | pool_log = self.get_export_dir() + log_file 446 | pools = self.get('/instance-pools/list').get('instance_pools', None) 447 | if pools: 448 | with open(pool_log, "w") as fp: 449 | for x in pools: 450 | fp.write(json.dumps(x) + '\n') 451 | 452 | def log_instance_profiles(self, log_file='instance_profiles.log'): 453 | ip_log = self.get_export_dir() + log_file 454 | ips = self.get('/instance-profiles/list').get('instance_profiles', None) 455 | if ips: 456 | with open(ip_log, "w") as fp: 457 | for x in ips: 458 | fp.write(json.dumps(x) + '\n') 459 | 460 | def remove_automated_clusters(self, cluster_list, log_file='skipped_clusters.log'): 461 | """ 462 | Automated clusters like job clusters or model endpoints should be excluded 463 | :param cluster_list: list of cluster configurations 464 | :return: cleaned list with automated clusters removed 465 | """ 466 | # model endpoint clusters start with the following 467 | ml_model_pattern = "mlflow-model-" 468 | # job clusters have specific format, job-JOBID-run-RUNID 469 | re_expr = re.compile("job-\d+-run-\d+$") 470 | clean_cluster_list = [] 471 | with open(self.get_export_dir() + log_file, 'w') as log_fp: 472 | for cluster in cluster_list: 473 | cluster_name = cluster['cluster_name'] 474 | if re_expr.match(cluster_name) or cluster_name.startswith(ml_model_pattern): 475 | log_fp.write(json.dumps(cluster) + '\n') 476 | else: 477 | clean_cluster_list.append(cluster) 478 | return clean_cluster_list 479 | 480 | def start_cluster_by_name(self, cluster_name): 481 | cid = self.get_cluster_id_by_name(cluster_name) 482 | if cid is None: 483 | raise Exception('Error: Cluster name does not exist') 484 | print("Starting {0} with id {1}".format(cluster_name, cid)) 485 | resp = self.post('/clusters/start', {'cluster_id': cid}) 486 | if 'error_code' in resp: 487 | if resp.get('error_code', None) == 'INVALID_STATE': 488 | print('Error: {0}'.format(resp.get('message', None))) 489 | else: 490 | raise Exception('Error: cluster does not exist, or is in a state that is unexpected. ' 491 | 'Cluster should either be terminated state, or already running.') 492 | self.wait_for_cluster(cid) 493 | return cid 494 | 495 | def submit_command(self, cid, ec_id, cmd): 496 | # This launches spark commands and print the results. 
We can pull out the text results from the API 497 | command_payload = {'language': 'python', 498 | 'contextId': ec_id, 499 | 'clusterId': cid, 500 | 'command': cmd} 501 | command = self.post('/commands/execute', 502 | json_params=command_payload, 503 | version="1.2") 504 | 505 | com_id = command.get('id', None) 506 | if not com_id: 507 | print("ERROR: ") 508 | print(command) 509 | # print('command_id : ' + com_id) 510 | result_payload = {'clusterId': cid, 'contextId': ec_id, 'commandId': com_id} 511 | 512 | resp = self.get('/commands/status', json_params=result_payload, version="1.2") 513 | is_running = self.get_key(resp, 'status') 514 | 515 | # loop through the status api to check for the 'running' state call and sleep 1 second 516 | while (is_running == "Running") or (is_running == 'Queued'): 517 | resp = self.get('/commands/status', json_params=result_payload, version="1.2") 518 | is_running = self.get_key(resp, 'status') 519 | time.sleep(1) 520 | end_result_status = self.get_key(resp, 'status') 521 | end_results = self.get_key(resp, 'results') 522 | if end_results.get('resultType', None) == 'error': 523 | print("ERROR: ") 524 | print(end_results.get('summary', None)) 525 | return end_results 526 | 527 | def wait_for_cluster(self, cid): 528 | c_state = self.get('/clusters/get', {'cluster_id': cid}) 529 | while c_state['state'] != 'RUNNING' and c_state['state'] != 'TERMINATED': 530 | c_state = self.get('/clusters/get', {'cluster_id': cid}) 531 | print('Cluster state: {0}'.format(c_state['state'])) 532 | time.sleep(2) 533 | if c_state['state'] == 'TERMINATED': 534 | raise RuntimeError("Cluster is terminated. Please check EVENT history for details") 535 | return cid 536 | 537 | -------------------------------------------------------------------------------- /dbclient/WorkspaceClient.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from dbclient import * 3 | from timeit import default_timer as timer 4 | from datetime import timedelta 5 | import os 6 | 7 | WS_LIST = "/workspace/list" 8 | WS_STATUS = "/workspace/get-status" 9 | WS_MKDIRS = "/workspace/mkdirs" 10 | WS_IMPORT = "/workspace/import" 11 | WS_EXPORT = "/workspace/export" 12 | LS_ZONES = "/clusters/list-zones" 13 | 14 | 15 | class WorkspaceClient(ScimClient): 16 | _languages = {'.py': 'PYTHON', 17 | '.scala': 'SCALA', 18 | '.r': 'R', 19 | '.sql': 'SQL'} 20 | 21 | def get_language(self, file_ext): 22 | return self._languages[file_ext] 23 | 24 | def get_top_level_folders(self): 25 | # get top level folders excluding the /Users path 26 | supported_types = ('NOTEBOOK', 'DIRECTORY') 27 | root_items = self.get(WS_LIST, {'path': '/'}).get('objects', []) 28 | # filter out Projects and Users folders 29 | non_users_dir = list(filter(lambda x: (x.get('path') != '/Users' and x.get('path') != '/Projects'), 30 | root_items)) 31 | dirs_and_nbs = list(filter(lambda x: (x.get('object_type') in supported_types), 32 | non_users_dir)) 33 | return dirs_and_nbs 34 | 35 | def export_top_level_folders(self): 36 | ls_tld = self.get_top_level_folders() 37 | logged_nb_count = 0 38 | for tld_obj in ls_tld: 39 | # obj has 3 keys, object_type, path, object_id 40 | tld_path = tld_obj.get('path') 41 | log_count = self.log_all_workspace_items(ws_path=tld_path) 42 | logged_nb_count += log_count 43 | dl_nb_count = self.download_notebooks() 44 | print(f'Total logged notebooks: {logged_nb_count}') 45 | print(f'Total Downloaded notebooks: {dl_nb_count}') 46 | 47 | def get_user_import_args(self, full_local_path, 
nb_full_path): 48 | """ 49 | helper function to define the import parameters to upload a notebook object 50 | :param full_local_path: full local path of the notebook to read 51 | :param nb_full_path: full destination path, e.g. /Users/foo@db.com/bar.dbc . Includes extension / type 52 | :return: return the full input args to upload to the destination system 53 | """ 54 | is_source_format = self.is_source_file_format() 55 | fp = open(full_local_path, "rb") 56 | (nb_path_dest, nb_type) = os.path.splitext(nb_full_path) 57 | in_args = { 58 | "content": base64.encodebytes(fp.read()).decode('utf-8'), 59 | "path": nb_path_dest, 60 | "format": self.get_file_format() 61 | } 62 | if is_source_format: 63 | in_args['language'] = self.get_language(nb_type) 64 | in_args['object_type'] = 'NOTEBOOK' 65 | return in_args 66 | 67 | @staticmethod 68 | def build_ws_lookup_table(success_ws_logfile): 69 | ws_hashmap = set() 70 | with open(success_ws_logfile, 'r') as fp: 71 | for line in fp: 72 | ws_hashmap.add(line.rstrip()) 73 | return ws_hashmap 74 | 75 | @staticmethod 76 | def is_user_ws_item(ws_dir): 77 | """ 78 | Checks if this is a user artifact / notebook. 79 | We can't create user home folders, hence we need to identify user items 80 | """ 81 | path_list = [x for x in ws_dir.split('/') if x] 82 | if len(path_list) >= 2 and path_list[0] == 'Users': 83 | return True 84 | return False 85 | 86 | @staticmethod 87 | def is_user_ws_root(ws_dir): 88 | """ 89 | Check if we're at the users home folder to skip folder creation 90 | """ 91 | if ws_dir == '/Users/' or ws_dir == '/Users': 92 | return True 93 | path_list = [x for x in ws_dir.split('/') if x] 94 | if len(path_list) == 2 and path_list[0] == 'Users': 95 | return True 96 | return False 97 | 98 | @staticmethod 99 | def get_user(ws_dir): 100 | """ 101 | returns the username of the workspace / folder path 102 | """ 103 | path_list = [x for x in ws_dir.split('/') if x] 104 | if len(path_list) < 2: 105 | raise ValueError("Error: Not a users workspace directory") 106 | return path_list[1] 107 | 108 | @staticmethod 109 | def is_user_trash(ws_path): 110 | """ 111 | checks if this is the users home folder trash directory, which is a special dir 112 | """ 113 | path_list = ws_path.split('/') 114 | if len(path_list) == 4: 115 | if path_list[1] == 'Users' and path_list[3] == 'Trash': 116 | return True 117 | return False 118 | 119 | def is_user_home_empty(self, username): 120 | user_root = '/Users/' + username.rstrip().lstrip() 121 | get_args = {'path': user_root} 122 | items = self.get(WS_LIST, get_args).get('objects', None) 123 | if items: 124 | folders = self.filter_workspace_items(items, 'DIRECTORY') 125 | notebooks = self.filter_workspace_items(items, 'NOTEBOOK') 126 | # if both notebooks and directories are empty, return true 127 | if not folders and not notebooks: 128 | return True 129 | return False 130 | return True 131 | 132 | @staticmethod 133 | def get_num_of_saved_users(export_dir): 134 | """ 135 | returns the number of exported user items to check against number of created users in the new workspace 136 | this helps identify if the new workspace is ready for the import, or if we should skip / archive failed imports 137 | """ 138 | # get current number of saved workspaces 139 | user_home_dir = export_dir + 'Users' 140 | num_of_users = 0 141 | if os.path.exists(user_home_dir): 142 | ls = os.listdir(user_home_dir) 143 | for x in ls: 144 | if os.path.isdir(user_home_dir + '/' + x): 145 | num_of_users += 1 146 | return num_of_users 147 | 148 | def 
export_user_home(self, username, local_export_dir): 149 | """ 150 | Export the provided user's home directory 151 | :param username: user's home directory to export 152 | :param local_export_dir: folder location to do single user exports 153 | :return: None 154 | """ 155 | original_export_dir = self.get_export_dir() 156 | user_export_dir = self.get_export_dir() + local_export_dir 157 | user_root = '/Users/' + username.rstrip().lstrip() 158 | self.set_export_dir(user_export_dir + '/{0}/'.format(username)) 159 | print("Export path: {0}".format(self.get_export_dir())) 160 | num_of_nbs = self.log_all_workspace_items(ws_path=user_root) 161 | if num_of_nbs == 0: 162 | raise ValueError('User does not have any notebooks in this path. Please verify the case of the email') 163 | num_of_nbs_dl = self.download_notebooks(ws_dir='user_artifacts/') 164 | print(f"Total notebooks logged: {num_of_nbs}") 165 | print(f"Total notebooks downloaded: {num_of_nbs_dl}") 166 | if num_of_nbs != num_of_nbs_dl: 167 | print(f"Notebooks logged != downloaded. Check the failed download file at: {user_export_dir}") 168 | # reset the original export dir for other calls to this method using the same client 169 | self.set_export_dir(original_export_dir) 170 | 171 | def import_user_home(self, username, local_export_dir): 172 | """ 173 | Import the provided user's home directory 174 | logs/user_exports/{{USERNAME}}/ stores the log files to understand what was exported 175 | logs/user_exports/{{USERNAME}}/user_artifacts/ stores the notebook contents 176 | :param username: user's home directory to export 177 | :param local_export_dir: the log directory for this users workspace items 178 | :return: None 179 | """ 180 | original_export_dir = self.get_export_dir() 181 | user_import_dir = self.get_export_dir() + local_export_dir 182 | if self.does_user_exist(username): 183 | print("Yes, we can upload since the user exists") 184 | else: 185 | print("User must exist before we upload the notebook contents. 
Please add the user to the platform first") 186 | user_root = '/Users/' + username.rstrip().lstrip() 187 | self.set_export_dir(user_import_dir + '/{0}/'.format(username)) 188 | print("Import local path: {0}".format(self.get_export_dir())) 189 | notebook_dir = self.get_export_dir() + 'user_artifacts/' 190 | for root, subdirs, files in os.walk(notebook_dir): 191 | upload_dir = '/' + root.replace(notebook_dir, '') 192 | # if the upload dir is the 2 root directories, skip and continue 193 | if upload_dir == '/' or upload_dir == '/Users': 194 | continue 195 | if not self.is_user_ws_root(upload_dir): 196 | # if it is not the /Users/example@example.com/ root path, don't create the folder 197 | resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) 198 | print(resp_mkdirs) 199 | for f in files: 200 | # get full path for the local notebook file 201 | local_file_path = os.path.join(root, f) 202 | # create upload path and remove file format extension 203 | ws_file_path = upload_dir + '/' + f 204 | # generate json args with binary data for notebook to upload to the workspace path 205 | nb_input_args = self.get_user_import_args(local_file_path, ws_file_path) 206 | # call import to the workspace 207 | if self.is_verbose(): 208 | print("Path: {0}".format(nb_input_args['path'])) 209 | resp_upload = self.post(WS_IMPORT, nb_input_args) 210 | if self.is_verbose(): 211 | print(resp_upload) 212 | self.set_export_dir(original_export_dir) 213 | 214 | def download_notebooks(self, ws_log_file='user_workspace.log', ws_dir='artifacts/'): 215 | """ 216 | Loop through all notebook paths in the logfile and download individual notebooks 217 | :param ws_log_file: logfile for all notebook paths in the workspace 218 | :param ws_dir: export directory to store all notebooks 219 | :return: None 220 | """ 221 | ws_log = self.get_export_dir() + ws_log_file 222 | num_notebooks = 0 223 | if not os.path.exists(ws_log): 224 | raise Exception("Run --workspace first to download full log of all notebooks.") 225 | with open(ws_log, "r") as fp: 226 | # notebook log metadata file now contains object_id to help w/ ACL exports 227 | # pull the path from the data to download the individual notebook contents 228 | for notebook_data in fp: 229 | notebook_path = json.loads(notebook_data).get('path', None).rstrip() 230 | dl_resp = self.download_notebook_helper(notebook_path, export_dir=self.get_export_dir() + ws_dir) 231 | if 'error_code' not in dl_resp: 232 | num_notebooks += 1 233 | return num_notebooks 234 | 235 | def download_notebook_helper(self, notebook_path, export_dir='artifacts/'): 236 | """ 237 | Helper function to download an individual notebook, or log the failure in the failure logfile 238 | :param notebook_path: an individual notebook path 239 | :param export_dir: directory to store all notebooks 240 | :return: return the notebook path that's successfully downloaded 241 | """ 242 | get_args = {'path': notebook_path, 'format': self.get_file_format()} 243 | if self.is_verbose(): 244 | print("Downloading: {0}".format(get_args['path'])) 245 | resp = self.get(WS_EXPORT, get_args) 246 | with open(self.get_export_dir() + 'failed_notebooks.log', 'a') as err_log: 247 | if resp.get('error_code', None): 248 | err_msg = {'error_code': resp.get('error_code'), 'path': notebook_path} 249 | err_log.write(json.dumps(err_msg) + '\n') 250 | return err_msg 251 | nb_path = os.path.dirname(notebook_path) 252 | if nb_path != '/': 253 | # path is NOT empty, remove the trailing slash from export_dir 254 | save_path = export_dir[:-1] + nb_path + '/' 255 
| else: 256 | save_path = export_dir 257 | save_filename = save_path + os.path.basename(notebook_path) + '.' + resp.get('file_type') 258 | # If the local path doesn't exist,we create it before we save the contents 259 | if not os.path.exists(save_path) and save_path: 260 | os.makedirs(save_path, exist_ok=True) 261 | with open(save_filename, "wb") as f: 262 | f.write(base64.b64decode(resp['content'])) 263 | return {'path': notebook_path} 264 | 265 | def filter_workspace_items(self, item_list, item_type): 266 | """ 267 | Helper function to filter on different workspace types. 268 | :param item_list: iterable of workspace items 269 | :param item_type: DIRECTORY, NOTEBOOK, LIBRARY 270 | :return: list of items filtered by type 271 | """ 272 | supported_types = {'DIRECTORY', 'NOTEBOOK', 'LIBRARY'} 273 | if item_type not in supported_types: 274 | raise ValueError('Unsupported type provided: {0}.\n. Supported types: {1}'.format(item_type, 275 | str(supported_types))) 276 | filtered_list = list(self.my_map(lambda y: {'path': y.get('path', None), 277 | 'object_id': y.get('object_id', None)}, 278 | filter(lambda x: x.get('object_type', None) == item_type, item_list))) 279 | return filtered_list 280 | 281 | def init_workspace_logfiles(self, workspace_log_file='user_workspace.log', 282 | libs_log_file='libraries.log', workspace_dir_log_file='user_dirs.log'): 283 | """ 284 | initialize the logfile locations since we run a recursive function to download notebooks 285 | """ 286 | workspace_log = self.get_export_dir() + workspace_log_file 287 | libs_log = self.get_export_dir() + libs_log_file 288 | workspace_dir_log = self.get_export_dir() + workspace_dir_log_file 289 | if os.path.exists(workspace_log): 290 | os.remove(workspace_log) 291 | if os.path.exists(workspace_dir_log): 292 | os.remove(workspace_dir_log) 293 | if os.path.exists(libs_log): 294 | os.remove(libs_log) 295 | 296 | def log_all_workspace_items(self, ws_path='/', workspace_log_file='user_workspace.log', 297 | libs_log_file='libraries.log', dir_log_file='user_dirs.log'): 298 | """ 299 | Loop and log all workspace items to download them at a later time 300 | :param ws_path: root path to log all the items of the notebook workspace 301 | :param workspace_log_file: logfile to store all the paths of the notebooks 302 | :param libs_log_file: library logfile to store workspace libraries 303 | :param dir_log_file: log directory for users 304 | :return: 305 | """ 306 | # define log file names for notebooks, folders, and libraries 307 | workspace_log = self.get_export_dir() + workspace_log_file 308 | workspace_dir_log = self.get_export_dir() + dir_log_file 309 | libs_log = self.get_export_dir() + libs_log_file 310 | if ws_path == '/': 311 | # default is the root path 312 | get_args = {'path': '/'} 313 | else: 314 | get_args = {'path': ws_path} 315 | 316 | if not os.path.exists(self.get_export_dir()): 317 | os.makedirs(self.get_export_dir(), exist_ok=True) 318 | items = self.get(WS_LIST, get_args).get('objects', None) 319 | num_nbs = 0 320 | if self.is_verbose(): 321 | print("Listing: {0}".format(get_args['path'])) 322 | if items is not None: 323 | # list all the users folders only 324 | folders = self.filter_workspace_items(items, 'DIRECTORY') 325 | # should be no notebooks, but lets filter and can check later 326 | notebooks = self.filter_workspace_items(items, 'NOTEBOOK') 327 | libraries = self.filter_workspace_items(items, 'LIBRARY') 328 | with open(workspace_log, "a") as ws_fp, open(libs_log, "a") as libs_fp: 329 | for x in notebooks: 330 | # 
notebook objects has path and object_id 331 | if self.is_verbose(): 332 | print("Saving path: {0}".format(x.get('path'))) 333 | ws_fp.write(json.dumps(x) + '\n') 334 | num_nbs += 1 335 | for y in libraries: 336 | libs_fp.write(json.dumps(y) + '\n') 337 | # log all directories to export permissions 338 | if folders: 339 | with open(workspace_dir_log, "a") as dir_fp: 340 | for f in folders: 341 | dir_path = f.get('path', None) 342 | if not WorkspaceClient.is_user_trash(dir_path): 343 | dir_fp.write(json.dumps(f) + '\n') 344 | num_nbs += self.log_all_workspace_items(ws_path=dir_path, 345 | workspace_log_file=workspace_log_file, 346 | libs_log_file=libs_log_file) 347 | return num_nbs 348 | 349 | def get_obj_id_by_path(self, input_path): 350 | resp = self.get(WS_STATUS, {'path': input_path}) 351 | obj_id = resp.get('object_id', None) 352 | return obj_id 353 | 354 | def log_acl_to_file(self, artifact_type, read_log_filename, write_log_filename, failed_log_filename): 355 | """ 356 | generic function to log the notebook/directory ACLs to specific file names 357 | :param artifact_type: set('notebooks', 'directories') ACLs to be logged 358 | :param read_log_filename: the list of the notebook paths / object ids 359 | :param write_log_filename: output file to store object_id acls 360 | :param failed_log_filename: failed acl logs for resources, should be empty 361 | """ 362 | read_log_path = self.get_export_dir() + read_log_filename 363 | write_log_path = self.get_export_dir() + write_log_filename 364 | failed_log_path = self.get_export_dir() + failed_log_filename 365 | with open(read_log_path, 'r') as read_fp, open(write_log_path, 'w') as write_fp, \ 366 | open(failed_log_path, 'w') as failed_fp: 367 | for x in read_fp: 368 | data = json.loads(x) 369 | obj_id = data.get('object_id', None) 370 | api_endpoint = '/permissions/{0}/{1}'.format(artifact_type, obj_id) 371 | acl_resp = self.get(api_endpoint) 372 | acl_resp['path'] = data.get('path') 373 | if 'error_code' in acl_resp: 374 | failed_fp.write(json.dumps(acl_resp) + '\n') 375 | continue 376 | acl_resp.pop('http_status_code') 377 | write_fp.write(json.dumps(acl_resp) + '\n') 378 | 379 | def log_all_workspace_acls(self, workspace_log_file='user_workspace.log', 380 | dir_log_file='user_dirs.log'): 381 | """ 382 | loop through all notebooks and directories to store their associated ACLs 383 | :param workspace_log_file: input file for user notebook listing 384 | :param dir_log_file: input file for user directory listing 385 | """ 386 | # define log file names for notebooks, folders, and libraries 387 | print("Exporting the notebook permissions") 388 | start = timer() 389 | self.log_acl_to_file('notebooks', workspace_log_file, 'acl_notebooks.log', 'failed_acl_notebooks.log') 390 | end = timer() 391 | print("Complete Notebook ACLs Export Time: " + str(timedelta(seconds=end - start))) 392 | print("Exporting the directories permissions") 393 | start = timer() 394 | self.log_acl_to_file('directories', dir_log_file, 'acl_directories.log', 'failed_acl_directories.log') 395 | end = timer() 396 | print("Complete Directories ACLs Export Time: " + str(timedelta(seconds=end - start))) 397 | 398 | def apply_acl_on_object(self, acl_str): 399 | """ 400 | apply the acl definition to the workspace object 401 | object_id comes from the export data which contains '/type/id' format for this key 402 | the object_id contains the {{/type/object_id}} format which helps craft the api endpoint 403 | setting acl definitions uses the patch rest api verb 404 | :param acl_str: the 
complete string from the logfile. contains object defn and acl lists 405 | """ 406 | object_acl = json.loads(acl_str) 407 | # the object_type 408 | object_type = object_acl.get('object_type', None) 409 | obj_path = object_acl['path'] 410 | obj_status = self.get(WS_STATUS, {'path': obj_path}) 411 | print("ws-stat: ", obj_status) 412 | current_obj_id = obj_status.get('object_id', None) 413 | if not current_obj_id: 414 | print('Object id missing from destination workspace', obj_path) 415 | return 416 | if object_type == 'directory': 417 | object_id_with_type = f'/directories/{current_obj_id}' 418 | elif object_type == 'notebook': 419 | object_id_with_type = f'/notebooks/{current_obj_id}' 420 | else: 421 | raise ValueError('Object for Workspace ACLs is Undefined') 422 | api_path = '/permissions' + object_id_with_type 423 | acl_list = object_acl.get('access_control_list', None) 424 | api_args = {'access_control_list': self.build_acl_args(acl_list)} 425 | resp = self.patch(api_path, api_args) 426 | print(resp) 427 | return resp 428 | 429 | def import_workspace_acls(self, workspace_log_file='acl_notebooks.log', 430 | dir_log_file='acl_directories.log'): 431 | """ 432 | import the notebook and directory acls by looping over notebook and dir logfiles 433 | """ 434 | dir_acl_logs = self.get_export_dir() + dir_log_file 435 | notebook_acl_logs = self.get_export_dir() + workspace_log_file 436 | with open(notebook_acl_logs) as nb_acls_fp: 437 | for nb_acl_str in nb_acls_fp: 438 | self.apply_acl_on_object(nb_acl_str) 439 | with open(dir_acl_logs) as dir_acls_fp: 440 | for dir_acl_str in dir_acls_fp: 441 | self.apply_acl_on_object(dir_acl_str) 442 | print("Completed import ACLs of Notebooks and Directories") 443 | 444 | def get_current_users(self): 445 | """ 446 | get the num of defined user home directories in the new workspace 447 | if this is 0, we must create the users before importing the notebooks over. 
448 | we cannot create the users home directory since its a special type of directory 449 | """ 450 | ws_users = self.get(WS_LIST, {'path': '/Users/'}).get('objects', None) 451 | if ws_users: 452 | return len(ws_users) 453 | else: 454 | return 0 455 | 456 | def does_user_exist(self, username): 457 | """ 458 | check if the users home dir exists 459 | """ 460 | stat = self.get(WS_STATUS, {'path': '/Users/{0}'.format(username)}) 461 | if stat.get('object_type', None) == 'DIRECTORY': 462 | return True 463 | return False 464 | 465 | def does_path_exist(self, dir_path): 466 | status_resp = self.get(WS_STATUS, {'path': dir_path}) 467 | if 'error_code' in status_resp: 468 | if status_resp.get('error_code') == 'RESOURCE_DOES_NOT_EXIST': 469 | return False 470 | else: 471 | print('Failure:' + json.dumps(status_resp)) 472 | return False 473 | return True 474 | 475 | def import_current_workspace_items(self,artifact_dir='artifacts/'): 476 | src_dir = self.get_export_dir() + artifact_dir 477 | for root, subdirs, files in os.walk(src_dir): 478 | # replace the local directory with empty string to get the notebook workspace directory 479 | nb_dir = '/' + root.replace(src_dir, '') 480 | upload_dir = nb_dir 481 | if not nb_dir == '/': 482 | upload_dir = nb_dir + '/' 483 | if not self.does_path_exist(upload_dir): 484 | resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) 485 | for f in files: 486 | print("Uploading: {0}".format(f)) 487 | # create the local file path to load the DBC file 488 | local_file_path = os.path.join(root, f) 489 | # create the ws full file path including filename 490 | ws_file_path = upload_dir + f 491 | # generate json args with binary data for notebook to upload to the workspace path 492 | nb_input_args = self.get_user_import_args(local_file_path, ws_file_path) 493 | # call import to the workspace 494 | if self.is_verbose(): 495 | print("Path: {0}".format(nb_input_args['path'])) 496 | resp_upload = self.post(WS_IMPORT, nb_input_args) 497 | 498 | def import_all_workspace_items(self, artifact_dir='artifacts/', success_log='success_ws_import.log', 499 | failed_log='failed_ws_import.log', archive_missing=False, restart_from_last=False): 500 | """ 501 | import all notebooks into a new workspace 502 | :param artifact_dir: notebook download directory 503 | :param success_log: success log to allow recovery from last successful upload 504 | :param failed_log: failed import log 505 | :param archive_missing: whether to put missing users into a /Archive/ top level directory 506 | :param restart_from_last: flag to restart import and skip past successful imports 507 | """ 508 | src_dir = self.get_export_dir() + artifact_dir 509 | success_logfile = self.get_export_dir() + success_log 510 | failed_logfile = self.get_export_dir() + failed_log 511 | overwrite_or_append = 'w' 512 | if restart_from_last: 513 | # if we're restarting from checkpoint, append to the successful logfile 514 | overwrite_or_append = 'a' 515 | uploaded_hashmap = self.build_ws_lookup_table(success_logfile) 516 | print(uploaded_hashmap) 517 | else: 518 | # delete the log if we start from the beginning 519 | if os.path.exists(success_logfile): 520 | os.remove(success_logfile) 521 | num_exported_users = self.get_num_of_saved_users(src_dir) 522 | num_current_users = self.get_current_users() 523 | if num_current_users == 0: 524 | print("No registered users in existing environment. 
Please import users / groups first.") 525 | raise ValueError("No registered users in the current environment") 526 | if (num_current_users < num_exported_users) and (not archive_missing): 527 | print("Exported number of user workspaces: {0}".format(num_exported_users)) 528 | print("Current number of user workspaces: {0}".format(num_current_users)) 529 | print("Re-run with the `--archive-missing` flag to load missing users into a separate directory") 530 | raise ValueError("Current number of users is less than number of user workspaces to import.") 531 | archive_users = set() 532 | with open(success_logfile, overwrite_or_append) as success_fp, open(failed_logfile, 'w') as failed_fp: 533 | for root, subdirs, files in os.walk(src_dir): 534 | # replace the local directory with empty string to get the notebook workspace directory 535 | nb_dir = '/' + root.replace(src_dir, '') 536 | upload_dir = nb_dir 537 | if not nb_dir == '/': 538 | upload_dir = nb_dir + '/' 539 | if self.is_user_ws_item(upload_dir): 540 | ws_user = self.get_user(upload_dir) 541 | if archive_missing: 542 | if ws_user in archive_users: 543 | upload_dir = upload_dir.replace('Users', 'Archive', 1) 544 | elif not self.does_user_exist(ws_user): 545 | # add the user to the cache / set of missing users 546 | print("User workspace does not exist, adding to archive cache: {0}".format(ws_user)) 547 | archive_users.add(ws_user) 548 | # append the archive path to the upload directory 549 | upload_dir = upload_dir.replace('Users', 'Archive', 1) 550 | else: 551 | print("User workspace exists: {0}".format(ws_user)) 552 | elif not self.does_user_exist(ws_user): 553 | print("User {0} is missing. " 554 | "Please re-run with --archive-missing flag " 555 | "or first verify all users exist in the new workspace".format(ws_user)) 556 | return 557 | else: 558 | print("Uploading for user: {0}".format(ws_user)) 559 | # make the top level folder before uploading files within the loop 560 | if not self.is_user_ws_root(upload_dir): 561 | # if it is not the /Users/example@example.com/ root path, don't create the folder 562 | resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) 563 | for f in files: 564 | print("Uploading: {0}".format(f)) 565 | # create the local file path to load the DBC file 566 | local_file_path = os.path.join(root, f) 567 | # create the ws full file path including filename 568 | ws_file_path = upload_dir + f 569 | if restart_from_last: 570 | if ws_file_path in uploaded_hashmap: 571 | print(f"Skipping upload as file has already been uploaded: {ws_file_path}") 572 | continue 573 | # generate json args with binary data for notebook to upload to the workspace path 574 | nb_input_args = self.get_user_import_args(local_file_path, ws_file_path) 575 | # call import to the workspace 576 | if self.is_verbose(): 577 | print("Path: {0}".format(nb_input_args['path'])) 578 | resp_upload = self.post(WS_IMPORT, nb_input_args) 579 | if 'error_code' in resp_upload: 580 | # log this path to a success logfile 581 | print(f'Error uploading file: {ws_file_path}') 582 | failed_fp.write(json.dumps(resp_upload) + '\n') 583 | else: 584 | success_fp.write(ws_file_path + '\n') 585 | -------------------------------------------------------------------------------- /dbclient/HiveClient.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | import time 4 | import base64 5 | import re 6 | from datetime import timedelta 7 | from timeit import default_timer as timer 8 | from dbclient import * 9 | 10 | 11 
| class HiveClient(ClustersClient): 12 | 13 | @staticmethod 14 | def is_delta_table(local_path): 15 | with open(local_path, 'r') as fp: 16 | for line in fp: 17 | lower_line = line.lower() 18 | if lower_line.startswith('using delta'): 19 | return True 20 | return False 21 | 22 | @staticmethod 23 | def get_ddl_by_keyword_group(local_path): 24 | """ 25 | return a list of DDL strings that are grouped by keyword arguments and their parameters 26 | """ 27 | ddl_statement = [] 28 | parameter_group = [] 29 | with open(local_path, 'r') as fp: 30 | for line in fp: 31 | raw = line.rstrip() 32 | if not raw: 33 | # make sure it's not an empty line, continue if empty 34 | continue 35 | if raw[0] == ' ' or raw[0] == ')': 36 | parameter_group.append(raw) 37 | else: 38 | if parameter_group: 39 | ddl_statement.append(''.join(parameter_group)) 40 | parameter_group = [raw] 41 | ddl_statement.append(''.join(parameter_group)) 42 | return ddl_statement 43 | 44 | @staticmethod 45 | def get_path_option_if_available(stmt): 46 | # parse the OPTIONS keyword and pull out the `path` parameter if it exists 47 | params = re.search(r'\((.*?)\)', stmt).group(1) 48 | params_list = list(map(lambda p: p.lstrip().rstrip(), params.split(','))) 49 | for x in params_list: 50 | if x.startswith('path'): 51 | return f'OPTIONS ( {x} )' 52 | return '' 53 | 54 | def is_table_location_defined(self, local_table_path): 55 | """ check if LOCATION or OPTIONS(path ..) are defined for the table 56 | """ 57 | ddl_statement = self.get_ddl_by_keyword_group(local_table_path) 58 | for keyword_param in ddl_statement: 59 | if keyword_param.startswith('OPTIONS'): 60 | options_param = self.get_path_option_if_available(keyword_param) 61 | if options_param: 62 | # if the return is not empty, the path option is provided which means its an external table 63 | return True 64 | elif keyword_param.startswith('LOCATION'): 65 | # if LOCATION is defined, we know the external table location 66 | return True 67 | return False 68 | 69 | def get_local_tmp_ddl_if_applicable(self, current_local_ddl_path): 70 | """ 71 | method to identify if we should update the current DDL if OPTIONS or TBLPROPERTIES keywords exist 72 | """ 73 | ddl_statement = self.get_ddl_by_keyword_group(current_local_ddl_path) 74 | tmp_ddl_path = self.get_export_dir() + 'tmp_ddl.txt' 75 | return_tmp_file = False 76 | with open(tmp_ddl_path, 'w') as fp: 77 | for keyword_param in ddl_statement: 78 | if keyword_param.startswith('OPTIONS'): 79 | return_tmp_file = True 80 | options_param = self.get_path_option_if_available(keyword_param) 81 | if options_param: 82 | fp.write(options_param + ' ') 83 | continue 84 | elif keyword_param.startswith('TBLPROPERTIES'): 85 | return_tmp_file = True 86 | continue 87 | fp.write(keyword_param + ' ') 88 | if return_tmp_file: 89 | return tmp_ddl_path 90 | else: 91 | os.remove(tmp_ddl_path) 92 | return current_local_ddl_path 93 | 94 | def update_table_ddl(self, local_table_path, db_path): 95 | # check if the database location / path is the default DBFS path 96 | table_name = os.path.basename(local_table_path) 97 | is_db_default_path = db_path.startswith('dbfs:/user/hive/warehouse') 98 | if (not is_db_default_path) and (not self.is_table_location_defined(local_table_path)): 99 | # the LOCATION attribute is not defined and the Database has a custom location defined 100 | # therefore we need to add it to the DDL, e.g. 
dbfs:/db_path/table_name 101 | table_path = db_path + '/' + table_name 102 | location_stmt = f"\nLOCATION '{table_path}'" 103 | with open(local_table_path, 'a') as fp: 104 | fp.write(location_stmt) 105 | return True 106 | return False 107 | 108 | def apply_table_ddl(self, local_table_path, ec_id, cid, db_path, has_unicode=False): 109 | """ 110 | Run DDL command on destination workspace 111 | :param local_table_path: local file path to the table DDL 112 | :param ec_id: execution context id to run remote commands 113 | :param cid: cluster id to connect to 114 | :param db_path: database S3 / Blob Storage / ADLS path for the Database 115 | :param has_unicode: Whether the table definitions have unicode characters. 116 | :return: rest api response 117 | """ 118 | # get file size in bytes 119 | updated_table_status = self.update_table_ddl(local_table_path, db_path) 120 | # update local table ddl to a new temp file with OPTIONS and TBLPROPERTIES removed from the DDL for delta tables 121 | if self.is_delta_table(local_table_path): 122 | local_table_path = self.get_local_tmp_ddl_if_applicable(local_table_path) 123 | 124 | f_size_bytes = os.path.getsize(local_table_path) 125 | if f_size_bytes > 1024 or has_unicode: 126 | # upload first to tmp DBFS path and apply 127 | dbfs_path = '/tmp/migration/tmp_import_ddl.txt' 128 | path_args = {'path': dbfs_path} 129 | del_resp = self.post('/dbfs/delete', path_args) 130 | if self.is_verbose(): 131 | print(del_resp) 132 | file_content_json = {'files': open(local_table_path, 'r')} 133 | put_resp = self.post('/dbfs/put', path_args, files_json=file_content_json) 134 | if self.is_verbose(): 135 | print(put_resp) 136 | spark_big_ddl_cmd = f'with open("/dbfs{dbfs_path}", "r") as fp: tmp_ddl = fp.read(); spark.sql(tmp_ddl)' 137 | ddl_results = self.submit_command(cid, ec_id, spark_big_ddl_cmd) 138 | return ddl_results 139 | else: 140 | with open(local_table_path, "r") as fp: 141 | ddl_statement = fp.read() 142 | spark_ddl_statement = self.get_spark_ddl(ddl_statement) 143 | ddl_results = self.submit_command(cid, ec_id, spark_ddl_statement) 144 | return ddl_results 145 | 146 | def check_if_instance_profiles_exists(self, log_file='instance_profiles.log'): 147 | ip_log = self.get_export_dir() + log_file 148 | ips = self.get('/instance-profiles/list').get('instance_profiles', None) 149 | if ips: 150 | with open(ip_log, "w") as fp: 151 | for x in ips: 152 | fp.write(json.dumps(x) + '\n') 153 | return True 154 | return False 155 | 156 | def create_database_db(self, db_name, ec_id, cid, db_attributes): 157 | location = db_attributes.get('Location', '') 158 | if not location.startswith('dbfs:/user/hive/warehouse/'): 159 | create_stmt = f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION '{location}'" 160 | else: 161 | create_stmt = f"CREATE DATABASE IF NOT EXISTS {db_name}" 162 | create_db_sql = f'spark.sql("{create_stmt}")' 163 | db_results = self.submit_command(cid, ec_id, create_db_sql) 164 | return db_results 165 | 166 | def get_database_detail_dict(self, db_log='database_details.log'): 167 | db_logfile = self.get_export_dir() + db_log 168 | all_db_json = {} 169 | with open(db_logfile, 'r') as fp: 170 | for x in fp: 171 | db_json = json.loads(x) 172 | db_name = db_json.pop('Database Name') 173 | all_db_json[db_name] = db_json 174 | return all_db_json 175 | 176 | def set_desc_database_helper(self, cid, ec_id): 177 | """ 178 | define the helper function on the cluster 179 | :param cid: cluster id to run against 180 | :param ec_id: execution id, aka spark session id 181 | 
:return: api response object 182 | """ 183 | # replacement strings 184 | helper_func_cmd1 = """def get_db_json(db_name): import json; rows = spark.sql(f"DESC DATABASE EXTENDED \ 185 | {db_name}").toJSON().collect(); return list(map(lambda x: json.loads(x), rows))""" 186 | helper_func_cmd2 = """def format_db_json(db_list): return dict(list(map(lambda x: \ 187 | (x.get('database_description_item'), x.get('database_description_value')), db_list)))""" 188 | helper_func_cmd3 = "def get_db_details(db_name): return format_db_json(get_db_json(db_name))" 189 | resp1 = self.submit_command(cid, ec_id, helper_func_cmd1) 190 | resp2 = self.submit_command(cid, ec_id, helper_func_cmd2) 191 | resp3 = self.submit_command(cid, ec_id, helper_func_cmd3) 192 | return resp3 193 | 194 | def get_desc_database_details(self, db_name, cid, ec_id): 195 | """ 196 | Returns a dict object of the `desc database extended {db_name}` command to include location, comment, etc fields 197 | :param db_name: database name to fetch 198 | :param cid: cluster id 199 | :param ec_id: execution id aka spark context id 200 | :return: database json object 201 | """ 202 | desc_database_cmd = f'print(get_db_details(\"{db_name}\"))' 203 | results = self.submit_command(cid, ec_id, desc_database_cmd) 204 | if results['resultType'] != 'text': 205 | print(json.dumps(results) + '\n') 206 | raise ValueError("Desc database extended failure") 207 | db_json = ast.literal_eval(results['data']) 208 | return db_json 209 | 210 | def export_database(self, db_name, cluster_name=None, iam_role=None, metastore_dir='metastore/', 211 | fail_log='failed_metastore.log', success_log='success_metastore.log', 212 | has_unicode=False, db_log='database_details.log'): 213 | """ 214 | :param db_name: database name 215 | :param cluster_name: cluster to run against if provided 216 | :param iam_role: iam role to launch the cluster with 217 | :param metastore_dir: directory to store all the metadata 218 | :param has_unicode: whether the metadata has unicode characters to export 219 | :param db_log: specific database properties logfile 220 | :return: 221 | """ 222 | # check if instance profile exists, ask users to use --users first or enter yes to proceed. 
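        # For reference, each line later written to database_details.log is the dict returned by
        # get_desc_database_details(), i.e. the `DESC DATABASE EXTENDED` rows reshaped into
        # item/value pairs. Hypothetical example line (illustrative values only):
        #   {"Database Name": "sales_db", "Location": "dbfs:/mnt/lake/sales_db.db", ...}
        # create_database_db() reads the 'Location' key on import to decide whether the database
        # must be re-created with an explicit LOCATION clause.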
    def export_database(self, db_name, cluster_name=None, iam_role=None, metastore_dir='metastore/',
                        fail_log='failed_metastore.log', success_log='success_metastore.log',
                        has_unicode=False, db_log='database_details.log'):
        """
        Export a single database's properties and table DDL
        :param db_name: database name
        :param cluster_name: cluster to run against if provided
        :param iam_role: iam role to launch the cluster with
        :param metastore_dir: directory to store all the metadata
        :param fail_log: logfile of tables that failed to export
        :param success_log: logfile of tables exported successfully
        :param has_unicode: whether the metadata has unicode characters to export
        :param db_log: specific database properties logfile
        :return: None; writes the database details and table DDL to the export directory
        """
        # launch a cluster (or reuse the named one) and capture its IAM role to tag successful exports
        start = timer()
        if cluster_name:
            cid = self.start_cluster_by_name(cluster_name)
            current_iam = self.get_iam_role_by_cid(cid)
        else:
            current_iam = iam_role
            cid = self.launch_cluster(current_iam)
        end = timer()
        print("Cluster creation time: " + str(timedelta(seconds=end - start)))
        time.sleep(5)
        ec_id = self.get_execution_context(cid)
        # if the metastore failed / success log paths exist, clean up before re-running
        failed_metastore_log_path = self.get_export_dir() + fail_log
        success_metastore_log_path = self.get_export_dir() + success_log
        if os.path.exists(failed_metastore_log_path):
            os.remove(failed_metastore_log_path)
        if os.path.exists(success_metastore_log_path):
            os.remove(success_metastore_log_path)
        database_logfile = self.get_export_dir() + db_log
        resp = self.set_desc_database_helper(cid, ec_id)
        if self.is_verbose():
            print(resp)
        with open(database_logfile, 'w') as fp:
            db_json = self.get_desc_database_details(db_name, cid, ec_id)
            fp.write(json.dumps(db_json) + '\n')
        os.makedirs(self.get_export_dir() + metastore_dir + db_name, exist_ok=True)
        self.log_all_tables(db_name, cid, ec_id, metastore_dir, failed_metastore_log_path,
                            success_metastore_log_path, current_iam, has_unicode)
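    # Illustrative only (table and role names are hypothetical): export_database() above
    # and export_hive_metastore() below share the same log layout in the export
    # directory. A successful export of sales_db.orders under an instance profile would
    # append a line like
    #
    #   {"table": "sales_db.orders", "iam": "arn:aws:iam::123456789012:instance-profile/migration"}
    #
    # to success_metastore.log, while failures land in failed_metastore.log as the raw
    # command response plus a "table" field, which retry_failed_metastore_export()
    # re-reads to retry each table under the remaining registered IAM roles.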
    def export_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', db_log='database_details.log',
                              success_log='success_metastore.log', fail_log='failed_metastore.log',
                              has_unicode=False):
        """
        Export the database properties and table DDL for every database in the hive metastore
        """
        start = timer()
        instance_profiles = self.get_instance_profiles_list()
        if cluster_name:
            cid = self.start_cluster_by_name(cluster_name)
            current_iam_role = self.get_iam_role_by_cid(cid)
        elif instance_profiles:
            # if any instance profile exists, start with it on the first cluster launched for the export
            current_iam_role = instance_profiles[0]
            cid = self.launch_cluster(iam_role=current_iam_role)
        else:
            current_iam_role = None
            cid = self.launch_cluster()
        end = timer()
        print("Cluster creation time: " + str(timedelta(seconds=end - start)))
        time.sleep(5)
        ec_id = self.get_execution_context(cid)
        # if the metastore failed / success log paths exist, clean up before re-running
        failed_metastore_log_path = self.get_export_dir() + fail_log
        success_metastore_log_path = self.get_export_dir() + success_log
        database_logfile = self.get_export_dir() + db_log
        if os.path.exists(failed_metastore_log_path):
            os.remove(failed_metastore_log_path)
        if os.path.exists(success_metastore_log_path):
            os.remove(success_metastore_log_path)
        all_dbs = self.get_all_databases(cid, ec_id)
        resp = self.set_desc_database_helper(cid, ec_id)
        if self.is_verbose():
            print(resp)
        with open(database_logfile, 'w') as fp:
            for db_name in all_dbs:
                os.makedirs(self.get_export_dir() + metastore_dir + db_name, exist_ok=True)
                db_json = self.get_desc_database_details(db_name, cid, ec_id)
                fp.write(json.dumps(db_json) + '\n')
                self.log_all_tables(db_name, cid, ec_id, metastore_dir, failed_metastore_log_path,
                                    success_metastore_log_path, current_iam_role, has_unicode)

        total_failed_entries = self.get_num_of_lines(failed_metastore_log_path)
        if (not self.is_skip_failed()) and self.is_aws() and total_failed_entries > 0:
            print("Retrying failed metastore export with registered IAM roles")
            remaining_iam_roles = instance_profiles[1:]
            self.retry_failed_metastore_export(cid, failed_metastore_log_path, remaining_iam_roles,
                                               success_metastore_log_path, has_unicode)
            print("Failed count before retry: " + str(total_failed_entries))
            print("Total Databases attempted export: " + str(len(all_dbs)))
        else:
            print("Failed count: " + str(total_failed_entries))
            print("Total Databases attempted export: " + str(len(all_dbs)))

    @staticmethod
    def get_num_of_lines(filename):
        if not os.path.exists(filename):
            return 0
        else:
            i = 0
            with open(filename) as fp:
                for line in fp:
                    i += 1
            return i

    @staticmethod
    def get_spark_ddl(table_ddl):
        """
        Formats the provided DDL into a spark.sql() command to run remotely
        """
        spark_ddl = 'spark.sql(""" {0} """)'.format(table_ddl)
        return spark_ddl

    @staticmethod
    def is_ddl_a_view(ddl_list):
        first_statement = ddl_list[0]
        if first_statement.startswith('CREATE VIEW'):
            return True
        return False

    def move_table_view(self, db_name, tbl_name, local_table_ddl, views_dir='metastore_views/'):
        metastore_view_dir = self.get_export_dir() + views_dir
        ddl_statement = self.get_ddl_by_keyword_group(local_table_ddl)
        if self.is_ddl_a_view(ddl_statement):
            dst_local_ddl = metastore_view_dir + db_name + '/' + tbl_name
            os.rename(local_table_ddl, dst_local_ddl)
            return True
        return False
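    # The import flow below applies table DDL in a first pass and defers anything
    # move_table_view() flags as a view (its exported DDL starts with "CREATE VIEW")
    # into metastore_views/<db>/, then re-applies those files in a second pass,
    # presumably because a view can only be created once the tables it references
    # already exist in the destination metastore.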
    def import_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', views_dir='metastore_views/',
                              has_unicode=False):
        """
        Apply the exported database and table DDL to the destination workspace
        """
        metastore_local_dir = self.get_export_dir() + metastore_dir
        metastore_view_dir = self.get_export_dir() + views_dir
        os.makedirs(metastore_view_dir, exist_ok=True)
        if cluster_name:
            cid = self.start_cluster_by_name(cluster_name)
        else:
            cid = self.launch_cluster()
        time.sleep(2)
        ec_id = self.get_execution_context(cid)
        # get local databases
        db_list = os.listdir(metastore_local_dir)
        # make a directory in the DBFS root bucket path for tmp data
        resp = self.post('/dbfs/mkdirs', {'path': '/tmp/migration/'})
        # iterate over the databases saved locally
        all_db_details_json = self.get_database_detail_dict()
        for db_name in db_list:
            # create a dir to host the view ddl if we find any
            os.makedirs(metastore_view_dir + db_name, exist_ok=True)
            # get the local database path to list tables
            local_db_path = metastore_local_dir + db_name
            # get a dict of the database attributes
            database_attributes = all_db_details_json.get(db_name, '')
            if not database_attributes:
                print(all_db_details_json)
                raise ValueError('Missing Database Attributes Log. Re-run metastore export')
            create_db_resp = self.create_database_db(db_name, ec_id, cid, database_attributes)
            db_path = database_attributes.get('Location')
            if os.path.isdir(local_db_path):
                # all databases should be directories, no files at this level
                # list all the tables in the database local dir
                tables = os.listdir(local_db_path)
                for tbl_name in tables:
                    # build the path for the table where the ddl is stored
                    print("Importing table {0}.{1}".format(db_name, tbl_name))
                    local_table_ddl = metastore_local_dir + db_name + '/' + tbl_name
                    if not self.move_table_view(db_name, tbl_name, local_table_ddl):
                        # we hit a table ddl here, so we apply the ddl
                        is_successful = self.apply_table_ddl(local_table_ddl, ec_id, cid, db_path, has_unicode)
                        print(is_successful)
                    else:
                        print(f'Moving view ddl to re-apply later: {db_name}.{tbl_name}')
            else:
                print("Error: Only databases should exist at this level: {0}".format(db_name))
            self.delete_dir_if_empty(metastore_view_dir + db_name)
        views_db_list = os.listdir(metastore_view_dir)
        for db_name in views_db_list:
            local_view_db_path = metastore_view_dir + db_name
            database_attributes = all_db_details_json.get(db_name, '')
            db_path = database_attributes.get('Location')
            if os.path.isdir(local_view_db_path):
                views = os.listdir(local_view_db_path)
                for view_name in views:
                    print("Importing view {0}.{1}".format(db_name, view_name))
                    local_view_ddl = metastore_view_dir + db_name + '/' + view_name
                    is_successful = self.apply_table_ddl(local_view_ddl, ec_id, cid, db_path, has_unicode)
                    print(is_successful)

    def get_all_databases(self, cid, ec_id):
        # submit the first command to find the number of databases
        # DBR 7.0 changes databaseName to namespace for the return value of show databases
        all_dbs_cmd = 'all_dbs = [x.databaseName for x in spark.sql("show databases").collect()]; print(len(all_dbs))'
        results = self.submit_command(cid, ec_id, all_dbs_cmd)
        if results['resultType'] != 'text':
            print(json.dumps(results) + '\n')
            raise ValueError("Cannot identify number of databases due to the above error")
        num_of_dbs = ast.literal_eval(results['data'])
        batch_size = 100  # batch size to iterate over databases
        num_of_buckets = (num_of_dbs // batch_size) + 1  # number of slices of the list to take

        all_dbs = []
        for m in range(0, num_of_buckets):
            db_slice = 'print(all_dbs[{0}:{1}])'.format(batch_size * m, batch_size * (m + 1))
            results = self.submit_command(cid, ec_id, db_slice)
            db_names = ast.literal_eval(results['data'])
            for db in db_names:
                all_dbs.append(db)
                print("Database: {0}".format(db))
        return all_dbs
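    # get_all_databases() above and log_all_tables() below share the same pattern for
    # pulling large lists back through the REST 1.2 command API, which only returns the
    # printed output of a remote command: bind the list to a variable on the cluster,
    # then fetch it in slices of 100 and parse each printed slice locally with
    # ast.literal_eval. The generated slice commands look like:
    #
    #   print(all_dbs[0:100])
    #   print(all_dbs[100:200])
    #
    # so a large metastore never has to fit into a single command result.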
    def log_all_tables(self, db_name, cid, ec_id, metastore_dir, err_log_path, success_log_path, iam,
                       has_unicode=False):
        all_tables_cmd = 'all_tables = [x.tableName for x in spark.sql("show tables in {0}").collect()]'.format(db_name)
        results = self.submit_command(cid, ec_id, all_tables_cmd)
        results = self.submit_command(cid, ec_id, 'print(len(all_tables))')
        num_of_tables = ast.literal_eval(results['data'])

        batch_size = 100  # batch size to iterate over tables
        num_of_buckets = (num_of_tables // batch_size) + 1  # number of slices of the list to take

        all_tables = []
        with open(success_log_path, 'a') as sfp:
            for m in range(0, num_of_buckets):
                tables_slice = 'print(all_tables[{0}:{1}])'.format(batch_size * m, batch_size * (m + 1))
                results = self.submit_command(cid, ec_id, tables_slice)
                table_names = ast.literal_eval(results['data'])
                for table_name in table_names:
                    print("Table: {0}".format(table_name))
                    is_successful = self.log_table_ddl(cid, ec_id, db_name, table_name, metastore_dir,
                                                       err_log_path, has_unicode)
                    if is_successful == 0:
                        print(f"Exported {db_name}.{table_name}")
                        success_item = {'table': f'{db_name}.{table_name}', 'iam': iam}
                        sfp.write(json.dumps(success_item))
                        sfp.write('\n')
                    else:
                        print("Logging failure")
        return True

    def log_table_ddl(self, cid, ec_id, db_name, table_name, metastore_dir, err_log_path, has_unicode):
        """
        Log the table DDL to handle large DDL text
        :param cid: cluster id
        :param ec_id: execution context id (rest api 1.2)
        :param db_name: database name
        :param table_name: table name
        :param metastore_dir: metastore export directory name
        :param err_log_path: log for errors
        :param has_unicode: export to a file if this flag is true
        :return: 0 for success, -1 for error
        """
        set_ddl_str_cmd = f'ddl_str = spark.sql("show create table {db_name}.{table_name}").collect()[0][0]'
        ddl_str_resp = self.submit_command(cid, ec_id, set_ddl_str_cmd)
        with open(err_log_path, 'a') as err_log:
            if ddl_str_resp['resultType'] != 'text':
                ddl_str_resp['table'] = '{0}.{1}'.format(db_name, table_name)
                err_log.write(json.dumps(ddl_str_resp) + '\n')
                return -1
            get_ddl_str_len = 'ddl_len = len(ddl_str); print(ddl_len)'
            len_resp = self.submit_command(cid, ec_id, get_ddl_str_len)
            ddl_len = int(len_resp['data'])
            if ddl_len <= 0:
                len_resp['table'] = '{0}.{1}'.format(db_name, table_name)
                err_log.write(json.dumps(len_resp) + '\n')
                return -1
        # if (len > 2k chars) OR (has unicode chars) then export to file
        table_ddl_path = self.get_export_dir() + metastore_dir + db_name + '/' + table_name
        if ddl_len > 2048 or has_unicode:
            # create the dbfs tmp path for exports / imports. no-op if exists
            resp = self.post('/dbfs/mkdirs', {'path': '/tmp/migration/'})
            # save the ddl to the tmp path on dbfs
            save_ddl_cmd = "with open('/dbfs/tmp/migration/tmp_export_ddl.txt', 'w') as fp: fp.write(ddl_str)"
            save_resp = self.submit_command(cid, ec_id, save_ddl_cmd)
            # read that data using the dbfs rest endpoint which can handle 2MB of text easily
            read_args = {'path': '/tmp/migration/tmp_export_ddl.txt'}
            read_resp = self.get('/dbfs/read', read_args)
            with open(table_ddl_path, "w") as fp:
                fp.write(base64.b64decode(read_resp.get('data')).decode('utf-8'))
            return 0
        else:
            export_ddl_cmd = 'print(ddl_str)'
            ddl_resp = self.submit_command(cid, ec_id, export_ddl_cmd)
            with open(table_ddl_path, "w") as fp:
                fp.write(ddl_resp.get('data'))
            return 0
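    # log_table_ddl() above and apply_table_ddl() earlier are the two halves of the same
    # workaround for command-result size limits. On export, DDL longer than 2048
    # characters (or containing unicode) is written remotely to
    # /dbfs/tmp/migration/tmp_export_ddl.txt and pulled back with GET /dbfs/read, whose
    # "data" field is base64 encoded, hence the b64decode above. On import, DDL larger
    # than 1024 bytes (or containing unicode) is pushed up with POST /dbfs/put and
    # executed remotely by reading the file back and passing it to spark.sql().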
    def retry_failed_metastore_export(self, cid, failed_metastore_log_path, iam_roles_list, success_metastore_log_path,
                                      has_unicode, metastore_dir='metastore/'):
        # instance profiles are AWS-only, so retries are only possible with other registered IAM roles there
        if self.is_aws() and iam_roles_list:
            do_instance_profile_exist = True
        else:
            do_instance_profile_exist = False
        # get total failed entries
        total_failed_entries = self.get_num_of_lines(failed_metastore_log_path)
        if do_instance_profile_exist:
            print("Instance profiles exist, retrying export of failed tables with each instance profile")
            err_log_list = []
            with open(failed_metastore_log_path, 'r') as err_log:
                for table in err_log:
                    err_log_list.append(table)

            with open(success_metastore_log_path, 'a') as sfp:
                for iam_role in iam_roles_list:
                    self.edit_cluster(cid, iam_role)
                    ec_id = self.get_execution_context(cid)
                    # iterate over a copy since successfully retried entries are removed from the list in place
                    for table in list(err_log_list):
                        table_json = json.loads(table)
                        db_name = table_json['table'].split(".")[0]
                        table_name = table_json['table'].split(".")[1]

                        is_successful = self.log_table_ddl(cid, ec_id, db_name, table_name, metastore_dir,
                                                           failed_metastore_log_path, has_unicode)
                        if is_successful == 0:
                            err_log_list.remove(table)
                            print(f"Exported {db_name}.{table_name}")
                            success_item = {'table': f'{db_name}.{table_name}', 'iam': iam_role}
                            sfp.write(json.dumps(success_item))
                            sfp.write('\n')
                        else:
                            print('Failed to get ddl for {0}.{1} with iam role {2}'.format(db_name, table_name,
                                                                                           iam_role))

            os.remove(failed_metastore_log_path)
            with open(failed_metastore_log_path, 'w') as fm:
                for table in err_log_list:
                    fm.write(table)
            failed_count_after_retry = self.get_num_of_lines(failed_metastore_log_path)
            print("Failed count after retry: " + str(failed_count_after_retry))
        else:
            print("No registered instance profiles to retry export")

    def report_legacy_tables_to_fix(self, metastore_dir='metastore/', fix_table_log='repair_tables.log'):
        metastore_local_dir = self.get_export_dir() + metastore_dir
        fix_log = self.get_export_dir() + fix_table_log
        db_list = os.listdir(metastore_local_dir)
        num_of_tables = 0
        with open(fix_log, 'w') as fp:
            for db_name in db_list:
                local_db_path = metastore_local_dir + db_name
                if os.path.isdir(local_db_path):
                    # all databases should be directories, no files at this level
                    # list all the tables in the database local dir
                    tables = os.listdir(local_db_path)
                    for tbl_name in tables:
                        local_table_ddl = local_db_path + '/' + tbl_name
                        if self.is_legacy_table_partitioned(local_table_ddl):
                            num_of_tables += 1
                            print(f'Table needs repair: {db_name}.{tbl_name}')
                            fp.write(f'{db_name}.{tbl_name}\n')
        # once completed, check if the repair log has any entries
        log_size = os.stat(fix_log).st_size
        if log_size > 0:
            # repair log exists, upload to the platform to repair these tables
            print(f"Total number of tables needing repair: {num_of_tables}")
            dbfs_path = '/tmp/migration/repair_ddl.log'
            print(f"Uploading repair log to DBFS: {dbfs_path}")
            path_args = {'path': dbfs_path, 'overwrite': 'true'}
            file_content_json = {'files': open(fix_log, 'r')}
            put_resp = self.post('/dbfs/put', path_args, files_json=file_content_json)
            if self.is_verbose():
                print(put_resp)
        else:
            os.remove(fix_log)
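    # The repair log uploaded above lists one <db>.<table> entry per line. Non-Delta
    # tables that are partitioned lose their partition metadata when re-created from
    # exported DDL, so each listed table presumably needs its partitions recovered on
    # the destination workspace (for example with MSCK REPAIR TABLE) before it is
    # fully queryable.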
    def is_legacy_table_partitioned(self, table_local_path):
        if not self.is_delta_table(table_local_path):
            ddl_group = self.get_ddl_by_keyword_group(table_local_path)
            for kw in ddl_group:
                kw_lower = kw.lower()
                if kw_lower.startswith('partitioned by'):
                    return True
        return False
--------------------------------------------------------------------------------