├── README.md
├── migration
│   ├── aws
│   │   ├── requirements.txt
│   │   ├── settings.sh
│   │   ├── README.md
│   │   ├── migration_copy.py
│   │   ├── tsupload.py
│   │   ├── tsquery.py
│   │   ├── migration_gap.py
│   │   ├── migration_loss.py
│   │   ├── migration_kmubigdata_to_spotrank.py
│   │   └── migration.py
│   ├── gcp
│   │   ├── gcp_spotlake_migration.sh
│   │   ├── gcp_preprocess_rawdata.py
│   │   └── gcp_write_timestream.py
│   └── azure
│       ├── modify_s3_azure_data.py
│       └── azure_data_changed.py
└── LICENSE

/README.md:
--------------------------------------------------------------------------------
1 | # spotlake-migration
--------------------------------------------------------------------------------
/migration/aws/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3==1.24.45
2 | pandas==1.4.3
--------------------------------------------------------------------------------
/migration/gcp/gcp_spotlake_migration.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | aws s3 sync s3://tmp-gcp/rawdata /home/ubuntu/gcp_rawdata
4 | python3 gcp_preprocess_rawdata.py
5 | aws s3 sync s3://spotlake/rawdata/gcp /home/ubuntu/gcp_newrawdata
6 | python3 gcp_write_timestream.py
--------------------------------------------------------------------------------
/migration/aws/settings.sh:
--------------------------------------------------------------------------------
1 | sudo DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y
2 | 
3 | sudo apt-get update
4 | sudo apt-get install awscli -y
5 | sudo apt-get install python3-pip -y
6 | aws --version
7 | pip --version
8 | 
9 | pip install -r requirements.txt
10 | pip install --upgrade awscli
11 | 
12 | aws configure
13 | 
--------------------------------------------------------------------------------
/migration/aws/README.md:
--------------------------------------------------------------------------------
1 | # Spotlake-Migration: AWS
2 | 
3 | This repository contains code that moves spot data from one Timestream database to another.
4 | 
5 | ### How To Use
6 | 1. Clone this repository
7 | 2. Run 'settings.sh'
8 | 3. Run the migration scripts you need
9 | (optional) Put the latest snapshot in this directory as 'latest.csv.gz' before running a migration script
10 | 
11 | ### Migration codes
12 | * migration.py : Migration code with multi-processing
13 | * migration_gap.py : Migration code for the data gap between the migration and the collector
14 | * migration_copy.py : Migration code to copy data from one Timestream DB to another in the same account
15 | * migration_loss.py : Migration code to upload data from an S3 bucket to a Timestream DB
16 | * migration_kmubigdata_to_spotrank.py : Migration code for data from the Timestream DB in the KMUBIGDATA account to the Timestream DB in the SPOTRANK account
17 | 
18 | 
--------------------------------------------------------------------------------
/migration/azure/modify_s3_azure_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | 
5 | dir_path = "./"
6 | 
7 | for (root, directories, files) in os.walk(dir_path):
8 |     for file in files:
9 |         # for every csv.gz file in the subdirectories under the current location
10 |         if '.csv.gz' in file:
11 |             file_path = os.path.join(root, file)
12 |             # decompress the file and load it into a DataFrame
13 |             df = pd.read_csv(f'{file_path}', compression='gzip')
14 |             # drop rows that have no spotPrice
15 |             df = df.dropna(subset=['spotPrice'])
16 |             # drop the unused vendor column
17 |             df = df.drop(columns=['vendor'], axis=1)
18 | 
19 |             # if instanceTier actually holds an instanceType value, move it to instanceType and set instanceTier to NaN
20 |             for index, row in df.iterrows():
21 |                 if not(row['instanceTier'] == 'Standard' or row['instanceTier'] == 'Basic'):
22 |                     df.loc[index, 'instanceType'] = df.loc[index, 'instanceTier']
23 |                     df.loc[index, 'instanceTier'] = np.nan
24 |             # compress the result again and save it in place
25 |             df.to_csv(f'{file_path}', index=False, compression='gzip')
26 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Distributed Data Processing Systems Lab.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/migration/azure/azure_data_changed.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 | import boto3
4 | from compare_data import compare
5 | 
6 | pd.set_option('display.max_columns', None)
7 | WORKLOAD_COLS = ['instanceTier', 'instanceType', 'region']
8 | FEATURE_COLS = ['ondemandPrice', 'spotPrice']
9 | 
10 | # download the rawdata from S3 to the local machine
11 | s3 = boto3.resource('s3', aws_access_key_id='', aws_secret_access_key='')
12 | bucket = s3.Bucket('tmp-azure')
13 | prefix = 'rawdata'
14 | for object in bucket.objects.filter(Prefix = 'rawdata'):
15 |     os.makedirs(os.path.dirname(f'./{object.key}'), exist_ok=True)
16 |     bucket.download_file(object.key, object.key)
17 | 
18 | # store the file paths of the local rawdata in a list
19 | file_list = []
20 | for (path, dir, files) in os.walk("./rawdata"):
21 |     for filename in files:
22 |         ext = os.path.splitext(filename)[-1]
23 |         if ext == '.gz':
24 |             file_list.append("%s/%s" % (path, filename))
25 | 
26 | # sort so the files can be compared in chronological order
27 | file_list.sort()
28 | 
29 | # use the first rawdata as previous_df, take the data from 10 minutes later as current_df, and keep comparing each file with the next one
30 | for i in range(1, len(file_list)):
31 |     previous_df = pd.read_csv(file_list[i-1], compression='gzip')
32 |     current_df = pd.read_csv(file_list[i], compression='gzip')
33 |     try:
34 |         changed_df = compare(previous_df, current_df, WORKLOAD_COLS, FEATURE_COLS)
35 |         if not changed_df.empty:
36 |             print(i)
37 |             print(changed_df)
38 |     except Exception as e:
39 |         print(f"exception{e} : {i}")
40 | 
--------------------------------------------------------------------------------
/migration/aws/migration_copy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import boto3
3 | import tsquery
4 | import tsupload
5 | import pandas as pd
6 | from multiprocessing import Pool
7 | 
8 | import time
9 | import pytz
10 | from datetime import datetime, timedelta
11 | 
12 | 
13 | SAVE_FILENAME = 'latest.csv.gz'
14 | PROFILE_NAME = 'default'
15 | BUCKET_NAME = 'spotlake'
16 | REGION_NAME = "us-west-2"
17 | QUERY_DATABASE_NAME = "spotlake"
18 | QUERY_TABLE_NAME = "temp"
19 | UPLOAD_DATABASE_NAME = 'spotlake'
20 | UPLOAD_TABLE_NAME = 'aws'
21 | NUM_CPUS = 8
22 | if 24 % NUM_CPUS != 0:
23 |     raise Exception('use only 1, 2, 3, 4, 6, 8, 12, 24')
24 | CHUNK_HOUR = 24 / NUM_CPUS
25 | 
26 | start_date = datetime(2022, 1, 1, 0, 0, 0, 0, pytz.UTC)
27 | end_date = datetime(2022, 8, 23, 0, 0, 0, 0, pytz.UTC)
28 | 
29 | tsquery.PROFILE_NAME = PROFILE_NAME # tsquery.PROFILE_NAME must be credential of source database
30 | tsquery.REGION_NAME = REGION_NAME
31 | tsquery.DATABASE_NAME = QUERY_DATABASE_NAME
32 | tsquery.TABLE_NAME = QUERY_TABLE_NAME
33 | tsupload.PROFILE_NAME = PROFILE_NAME
34 | tsupload.REGION_NAME = REGION_NAME
35 | tsupload.DATABASE_NAME = UPLOAD_DATABASE_NAME
36 | tsupload.TABLE_NAME = UPLOAD_TABLE_NAME
37 | 
38 | 
39 | def date_range(start, end):
40 |     delta = end - start
41 |     days = [start + timedelta(days=i) for i in range(delta.days + 1)]
42 |     return days
43 | 
44 | 
45 | def time_format(timestamp):
46 |     return 'T'.join(str(timestamp).split())
47 | 
48 | 
49 | days = date_range(start_date, end_date)
50 | 
51 | perf_start_total = time.time()
52 | for idx in range(len(days)-1):
53 |     perf_start = time.time()
54 |     start_timestamp = days[idx]
55 |     end_timestamp = days[idx+1]
56 | 
57 |     start_end_time_process_list = []
58 |     for i in range(NUM_CPUS):
59 |         start_time_process = start_timestamp + timedelta(hours = CHUNK_HOUR*i)
60 |         end_time_process = start_timestamp + timedelta(hours = CHUNK_HOUR*(i+1))
61 |         start_end_time_process_list.append((time_format(start_time_process), time_format(end_time_process)))
62 | 
63 |     with Pool(NUM_CPUS) as p:
64 |         process_df_list = p.starmap(tsquery.get_timestream, start_end_time_process_list)
65 | 
66 |     day_df = pd.concat(process_df_list, axis=0, ignore_index=True)
67 |     day_df['SPS'] = day_df['SPS'].astype(int)
68 |     day_df['SpotPrice'] = day_df['SpotPrice'].astype(float)
69 |     day_df['SpotPrice'] = day_df['SpotPrice'].round(5)
70 | 
71 |     tsupload.upload_timestream(day_df)
72 |     print(f"elapsed time - single day query: {time.time() - perf_start}")
73 | print(f"elapsed time - total: {time.time() - perf_start_total}")
74 | 
--------------------------------------------------------------------------------
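For reference, the per-day parallel split used by migration_copy.py (above) can be reproduced in isolation. The sketch below only re-implements the NUM_CPUS/CHUNK_HOUR chunking arithmetic and prints the (start, end) pairs that would be handed to tsquery.get_timestream via Pool.starmap; it does not touch Timestream, S3, or any AWS credentials, and the sample day is arbitrary.

```python
# Minimal sketch of migration_copy.py's day chunking, no AWS access involved.
from datetime import datetime, timedelta
import pytz

NUM_CPUS = 8                  # must divide 24 evenly, as the script enforces
CHUNK_HOUR = 24 / NUM_CPUS    # hours of data handled by each worker process

def time_format(timestamp):
    # same helper as in migration_copy.py: '2022-01-01 00:00:00+00:00' -> '2022-01-01T00:00:00+00:00'
    return 'T'.join(str(timestamp).split())

day_start = datetime(2022, 1, 1, 0, 0, 0, 0, pytz.UTC)   # sample day only

windows = []
for i in range(NUM_CPUS):
    chunk_start = day_start + timedelta(hours=CHUNK_HOUR * i)
    chunk_end = day_start + timedelta(hours=CHUNK_HOUR * (i + 1))
    windows.append((time_format(chunk_start), time_format(chunk_end)))

for w in windows:
    # each tuple becomes one tsquery.get_timestream(start, end) call via Pool.starmap
    print(w)
```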
/migration/aws/tsupload.py:
--------------------------------------------------------------------------------
1 | import time
2 | import boto3
3 | import pandas as pd
4 | from botocore.config import Config
5 | from botocore.exceptions import ClientError
6 | 
7 | 
8 | PROFILE_NAME = 'default'
9 | REGION_NAME = 'us-east-2'
10 | DATABASE_NAME = 'dbname'
11 | TABLE_NAME = 'tablename'
12 | 
13 | 
14 | # Submit Batch To Timestream
15 | def submit_batch(records, counter, recursive, write_client):
16 |     if recursive == 10:
17 |         return
18 |     try:
19 |         result = write_client.write_records(DatabaseName=DATABASE_NAME, TableName = TABLE_NAME, Records=records, CommonAttributes={})
20 |     except write_client.exceptions.RejectedRecordsException as err:
21 |         print(err)
22 |         re_records = []
23 |         for rr in err.response["RejectedRecords"]:
24 |             re_records.append(records[rr["RecordIndex"]])
25 |         submit_batch(re_records, counter, recursive + 1, write_client)
26 |     except Exception as err:
27 |         print(err)
28 |         exit()
29 | 
30 | 
31 | # Build Timestream records from the DataFrame and upload them in batches of 100
32 | def upload_timestream(data):
33 |     session = boto3.Session(profile_name=PROFILE_NAME, region_name=REGION_NAME)
34 |     write_client = session.client('timestream-write', config=Config(read_timeout=20, max_pool_connections=5000, retries={'max_attempts':10}))
35 | 
36 |     records = []
37 |     counter = 0
38 |     for idx, row in data.iterrows():
39 |         time_value = str(row['time']).split('+')[0]
40 |         time_value = time.strptime(time_value, '%Y-%m-%d %H:%M:%S')
41 |         time_value = time.mktime(time_value)
42 |         time_value = str(int(round(time_value * 1000)))
43 | 
44 |         dimensions = []
45 |         for column in data.columns:
46 |             if column in ['InstanceType', 'Region', 'AZ', 'Ceased']:
47 |                 dimensions.append({'Name':column, 'Value': str(row[column])})
48 | 
49 |         measures = []
50 |         for column, types in [('SPS', 'BIGINT'), ('IF', 'DOUBLE'), ('SpotPrice', 'DOUBLE')]:
51 |             measures.append({'Name': column, 'Value': str(row[column]), 'Type': types})
52 | 
53 |         submit_data = {
54 |             'Dimensions': dimensions,
55 |             'MeasureName': 'aws_values',
56 |             'MeasureValues': measures,
57 |             'MeasureValueType': 'MULTI',
58 |             'Time': time_value
59 |         }
60 | 
61 |         records.append(submit_data)
62 |         counter += 1
63 |         if len(records) == 100:
64 |             submit_batch(records, counter, 0, write_client)
65 |             records = []
66 | 
67 |     if len(records) != 0:
68 |         submit_batch(records, counter, 0, write_client)
69 | 
--------------------------------------------------------------------------------
/migration/gcp/gcp_preprocess_rawdata.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | from botocore.config import Config
3 | import os
4 | import pandas as pd
5 | import gzip
6 | from datetime import datetime, timezone
7 | 
8 | ### preprocess rawdata from the S3 temp bucket and save it to the 'spotlake' bucket
9 | ### before running this, sync the rawdata down from the S3 temp bucket (aws s3 sync)
10 | ### check NEW_BUCKET_NAME and the file paths before running this
11 | 
12 | session = boto3.session.Session(region_name='us-west-2')
13 | write_client = session.client('timestream-write', config=Config(read_timeout=20, max_pool_connections=5000, retries={'max_attempts':10}))
14 | client = session.client('timestream-query')
15 | 
16 | NEW_BUCKET_NAME = 'spotlake'
17 | 
18 | FILE_PATH = '/home/ubuntu/gcp_rawdata' # 2022/MM/dd
19 | TEMP_FILE_PATH = './'
20 | NEW_FILE_PATH = '/home/ubuntu/gcp_newrawdata'
21 | workload_cols = ['InstanceType', 'Region']
22 | feature_cols = ['Calculator OnDemand Price', 'Calculator Preemptible 
Price', 'VM Instance OnDemand Price', 'VM Instance Preemptible Price'] 23 | 24 | def save_raw(data, timestamp): 25 | SAVE_FILENAME = f"{TEMP_FILE_PATH}/spotlake_"+f"{timestamp}.csv.gz" 26 | data.to_csv(SAVE_FILENAME, index=False, compression='gzip') 27 | session = boto3.Session() 28 | s3 = session.client('s3') 29 | s3_dir_name = timestamp.strftime("%Y/%m/%d") 30 | s3_obj_name = timestamp.strftime("%H:%M:%S") 31 | with open(SAVE_FILENAME, 'rb') as f: 32 | s3.upload_fileobj( 33 | f, NEW_BUCKET_NAME, f"rawdata/gcp/{s3_dir_name}/{s3_obj_name}.csv.gz") 34 | 35 | for filename in os.listdir(f"{TEMP_FILE_PATH}/"): 36 | if "spotlake_" in filename: 37 | os.remove(f"{TEMP_FILE_PATH}/{filename}") 38 | 39 | 40 | # sort gcp_rawdata folder paths 41 | 42 | paths = [] 43 | for (path, dir, files) in os.walk(FILE_PATH): 44 | for filename in files: 45 | final_path = path + '/' + filename 46 | paths.append(final_path) 47 | paths.sort() 48 | 49 | # remove unnecessary vendor, change nan into -1 and save into tmp-change-bucket 50 | for path in paths: 51 | changed_time = path.split('gcp_rawdata/')[1].split('.csv.gz')[0] 52 | timestamp = datetime.strptime(changed_time, '%Y/%m/%d/%H:%M:%S') 53 | 54 | df_old = pd.DataFrame() 55 | 56 | with gzip.open(path, 'rb') as f: 57 | df_old = pd.read_csv(f) 58 | 59 | # remove Vendor, Calculator Savings, VM Instance Savings 60 | df_new = pd.DataFrame() 61 | try : 62 | df_new = df_old.drop(['Vendor', 'Calculator Savings', 'VM Instance Savings'], axis=1) 63 | except: 64 | df_new = df_old 65 | 66 | # # have to change nan into -1 67 | df_new = df_new.replace(float('nan'), -1) 68 | 69 | # # write to tmp-changed-gcp 70 | save_raw(df_new, timestamp) 71 | print(timestamp) 72 | 73 | -------------------------------------------------------------------------------- /migration/aws/tsquery.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import pandas as pd 3 | 4 | 5 | PROFILE_NAME = "source-profile" 6 | REGION_NAME = "us-west-2" 7 | 8 | 9 | def run_query(query_string): 10 | try: 11 | session = boto3.Session(profile_name=PROFILE_NAME, region_name=REGION_NAME) 12 | query_client = session.client('timestream-query') 13 | paginator = query_client.get_paginator('query') 14 | page_iterator = paginator.paginate(QueryString=query_string) 15 | for page in page_iterator: 16 | _parse_query_result(page) 17 | except Exception as err: 18 | print("Exception while running query:", err) 19 | 20 | 21 | def _parse_query_result(query_result): 22 | query_status = query_result["QueryStatus"] 23 | column_info = query_result['ColumnInfo'] 24 | for row in query_result['Rows']: 25 | _parse_row(column_info, row) 26 | 27 | 28 | def _parse_row(column_info, row): 29 | data = row['Data'] 30 | row_output = [] 31 | for j in range(len(data)): 32 | info = column_info[j] 33 | datum = data[j] 34 | row_output.append(_parse_datum(info, datum)) 35 | return "{%s}" % str(row_output) 36 | 37 | 38 | def _parse_datum(info, datum): 39 | if datum.get('NullValue', False): 40 | return "%s=NULL" % info['Name'], 41 | 42 | column_type = info['Type'] 43 | 44 | # If the column is of TimeSeries Type 45 | if 'TimeSeriesMeasureValueColumnInfo' in column_type: 46 | return _parse_time_series(info, datum) 47 | 48 | # If the column is of Array Type 49 | elif 'ArrayColumnInfo' in column_type: 50 | array_values = datum['ArrayValue'] 51 | return "%s=%s" % (info['Name'], _parse_array(info['Type']['ArrayColumnInfo'], array_values)) 52 | 53 | # If the column is of Row Type 54 | elif 'RowColumnInfo' in 
column_type: 55 | row_column_info = info['Type']['RowColumnInfo'] 56 | row_values = datum['RowValue'] 57 | return _parse_row(row_column_info, row_values) 58 | 59 | # If the column is of Scalar Type 60 | else: 61 | global timestream_data 62 | if info['Name'] == "time": 63 | timestream_data[info['Name']].append(datum['ScalarValue'].split('.')[0]+"+00:00") 64 | elif info['Name'] != "measure_name" and info['Name'] != "measure_value::double": 65 | timestream_data[info['Name']].append(datum['ScalarValue']) 66 | return _parse_column_name(info) + datum['ScalarValue'] 67 | 68 | 69 | def _parse_time_series(info, datum): 70 | time_series_output = [] 71 | for data_point in datum['TimeSeriesValue']: 72 | time_series_output.append("{time=%s, value=%s}" 73 | % (data_point['Time'], 74 | _parse_datum(info['Type']['TimeSeriesMeasureValueColumnInfo'], 75 | data_point['Value']))) 76 | return "[%s]" % str(time_series_output) 77 | 78 | 79 | def _parse_array(array_column_info, array_values): 80 | array_output = [] 81 | for datum in array_values: 82 | array_output.append(_parse_datum(array_column_info, datum)) 83 | 84 | return "[%s]" % str(array_output) 85 | 86 | 87 | def _parse_column_name(info): 88 | if 'Name' in info: 89 | return info['Name'] + "=" 90 | else: 91 | return "" 92 | 93 | 94 | def get_timestream(start_date, end_date): 95 | global timestream_data 96 | timestream_data = {"SpotPrice" : [], "Savings" : [], "SPS" : [], "AZ" : [], "Region" : [], "InstanceType" : [], "IF" : [], "time" : []} 97 | 98 | print(f"Start query ({start_date}~{end_date})") 99 | query_string = f"""SELECT * FROM "spotrank-timestream"."spot-table" WHERE time between from_iso8601_timestamp('{start_date}') and from_iso8601_timestamp('{end_date}') ORDER BY time""" 100 | run_query(query_string) 101 | print(start_date + "~" + end_date + " is end") 102 | timestream_df = pd.DataFrame(timestream_data) 103 | timestream_df.drop_duplicates(inplace=True) 104 | return timestream_df 105 | 106 | 107 | def get_timestamps(start_date, end_date): 108 | global timestream_data 109 | timestream_data = {"time" : []} 110 | 111 | print(f"Start query ({start_date}~{end_date})") 112 | query_string = f"""SELECT DISTINCT time FROM "spotrank-timestream"."spot-table" WHERE time between from_iso8601_date('{start_date}') and from_iso8601_date('{end_date}') ORDER BY time""" 113 | run_query(query_string) 114 | print(start_date + "~" + end_date + " is end") 115 | return timestream_data['time'] 116 | -------------------------------------------------------------------------------- /migration/aws/migration_gap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import tsquery 4 | import tsupload 5 | import pandas as pd 6 | from multiprocessing import Pool 7 | 8 | import time 9 | import pytz 10 | from datetime import datetime, timedelta 11 | 12 | 13 | SAVE_FILENAME = 'latest.csv.gz' 14 | PROFILE_NAME = 'default' 15 | BUCKET_NAME = 'spotlake' 16 | REGION_NAME = "us-west-2" 17 | DATABASE_NAME = 'spotlake' 18 | TABLE_NAME = 'aws' 19 | 20 | start_date = datetime(2022, 8, 23, 0, 0, 0, 0, pytz.UTC) 21 | end_date = datetime(2022, 8, 23, 7, 50, 0, 0, pytz.UTC) 22 | 23 | workload_cols = ['InstanceType', 'Region', 'AZ'] 24 | feature_cols = ['SPS', 'IF', 'SpotPrice'] 25 | 26 | # tsquery.PROFILE_NAME = PROFILE_NAME # tsquery.PROFILE_NAME must be credential of source database 27 | tsquery.REGION_NAME = REGION_NAME 28 | tsupload.PROFILE_NAME = PROFILE_NAME 29 | tsupload.REGION_NAME = REGION_NAME 30 | 
tsupload.DATABASE_NAME = DATABASE_NAME 31 | tsupload.TABLE_NAME = TABLE_NAME 32 | 33 | 34 | # compress data as gzip file, save to local file system, upload file to s3 35 | def save_gz_s3(df, timestamp): 36 | # compress and save to LFS 37 | df.to_csv(SAVE_FILENAME, index=False, compression="gzip") 38 | 39 | # upload compressed file to S3 40 | session = boto3.Session(profile_name=PROFILE_NAME) 41 | s3 = session.client('s3') 42 | s3_dir_name = '/'.join(timestamp.split()[0].split('-')) 43 | s3_obj_name = timestamp.split()[1] 44 | 45 | with open(SAVE_FILENAME, 'rb') as f: 46 | s3.upload_fileobj(f, BUCKET_NAME, f"rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz") 47 | 48 | 49 | def compare_nparray(previous_df, current_df, workload_cols, feature_cols): 50 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 51 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 52 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 53 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 54 | 55 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 56 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 57 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 58 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 59 | 60 | changed_indices = [] 61 | removed_indices = [] 62 | 63 | prev_idx = 0 64 | curr_idx = 0 65 | while True: 66 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 67 | break 68 | elif curr_idx == len(current_indices): 69 | prev_workload = previous_values[prev_idx][0] 70 | if prev_workload not in current_values[:,0]: 71 | removed_indices.append(previous_indices[prev_idx]) 72 | prev_idx += 1 73 | continue 74 | else: 75 | raise Exception('workload error') 76 | break 77 | elif prev_idx == len(previous_indices): 78 | curr_workload = current_values[curr_idx][0] 79 | curr_feature = current_values[curr_idx][1] 80 | if curr_workload not in previous_values[:,0]: 81 | changed_indices.append(current_indices[curr_idx]) 82 | curr_idx += 1 83 | continue 84 | else: 85 | raise Exception('workload error') 86 | break 87 | 88 | prev_workload = previous_values[prev_idx][0] 89 | prev_feature = previous_values[prev_idx][1] 90 | curr_workload = current_values[curr_idx][0] 91 | curr_feature = current_values[curr_idx][1] 92 | 93 | if prev_workload != curr_workload: 94 | if curr_workload not in previous_values[:,0]: 95 | changed_indices.append(current_indices[curr_idx]) 96 | curr_idx += 1 97 | elif prev_workload not in current_values[:,0]: 98 | removed_indices.append(previous_indices[prev_idx]) 99 | prev_idx += 1 100 | else: 101 | raise Exception('workload error') 102 | else: 103 | if prev_feature != curr_feature: 104 | changed_indices.append(current_indices[curr_idx]) 105 | curr_idx += 1 106 | prev_idx += 1 107 | 108 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 109 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 110 | for col in feature_cols: 111 | removed_df[col] = 0 112 | 113 | # removed_df have one more column, 'Ceased' 114 | removed_df['Ceased'] = True 115 | return changed_df, removed_df 116 | 117 | 118 | def 
date_range(start, end): 119 | delta = end - start 120 | days = [start + timedelta(days=i) for i in range(delta.days + 1)] 121 | return days 122 | 123 | 124 | def time_format(timestamp): 125 | return 'T'.join(str(timestamp).split()) 126 | 127 | 128 | perf_start_total = time.time() 129 | perf_start = time.time() 130 | 131 | day_df = tsquery.get_timestream(time_format(start_date), time_format(end_date)) 132 | frequency_map = {'<5%': 3.0, '5-10%': 2.5, '10-15%': 2.0, '15-20%': 1.5, '>20%': 1.0} 133 | day_df = day_df.replace({'IF': frequency_map}) 134 | day_df['SPS'] = day_df['SPS'].astype(int) 135 | day_df['SpotPrice'] = day_df['SpotPrice'].astype(float) 136 | day_df['SpotPrice'] = day_df['SpotPrice'].round(5) 137 | 138 | print(f"elapsed time - single day query: {time.time() - perf_start}") 139 | # day_df['OndemandPrice'] = (100 * day_df['SpotPrice']) / (100 - day_df['Savings']) 140 | 141 | day_timestamps = sorted(list(day_df['time'].unique())) 142 | for timestamp in day_timestamps: 143 | perf_start = time.time() 144 | current_df = day_df[day_df['time'] == timestamp].copy() 145 | print(f"elapsed time - select by time: {time.time() - perf_start}") 146 | if SAVE_FILENAME not in os.listdir('./'): 147 | save_gz_s3(current_df, timestamp) 148 | tsupload.upload_timestream(current_df) 149 | else: 150 | perf_start = time.time() 151 | previous_df = pd.read_csv(SAVE_FILENAME, compression='gzip', header=0, sep=',', quotechar='"') 152 | save_gz_s3(current_df, timestamp) 153 | print(f"elapsed time - read and save: {time.time() - perf_start}") 154 | perf_start = time.time() 155 | changed_df, removed_df = compare_nparray(previous_df, current_df, workload_cols, feature_cols) 156 | print(f"elapsed time - compare: {time.time() - perf_start}") 157 | perf_start = time.time() 158 | # changed_df and removed_df have different shape, because of 'Ceased' column 159 | tsupload.upload_timestream(changed_df) 160 | tsupload.upload_timestream(removed_df) 161 | print(f"elapsed time - upload: {time.time() - perf_start}") 162 | print(f"elapsed time - total single day: {time.time() - perf_start_total}") 163 | -------------------------------------------------------------------------------- /migration/aws/migration_loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import tsupload 4 | import pandas as pd 5 | 6 | import time 7 | import pytz 8 | from datetime import datetime, timedelta 9 | 10 | SAVE_FILENAME = 'latest.csv.gz' 11 | PROFILE_NAME = 'default' 12 | BUCKET_NAME = 'spotlake' 13 | REGION_NAME = "us-west-2" 14 | DATABASE_NAME = 'spotlake' 15 | TABLE_NAME = 'aws' 16 | 17 | start_date = datetime(2022, 4, 13, 0, 0, 0, 0, pytz.UTC) 18 | end_date = datetime(2022, 5, 1, 0, 0, 0, 0, pytz.UTC) 19 | 20 | workload_cols = ['InstanceType', 'Region', 'AZ'] 21 | feature_cols = ['SPS', 'IF', 'SpotPrice'] 22 | 23 | tsupload.PROFILE_NAME = PROFILE_NAME 24 | tsupload.REGION_NAME = REGION_NAME 25 | tsupload.DATABASE_NAME = DATABASE_NAME 26 | tsupload.TABLE_NAME = TABLE_NAME 27 | 28 | # compress data as gzip file, save to local file system, upload file to s3 29 | def save_gz_s3(df, timestamp): 30 | # compress and save to LFS 31 | df.to_csv(SAVE_FILENAME, index=False, compression="gzip") 32 | 33 | # upload compressed file to S3 34 | session = boto3.Session(profile_name=PROFILE_NAME) 35 | s3 = session.client('s3') 36 | s3_dir_name = '/'.join(timestamp.split()[0].split('-')) 37 | s3_obj_name = timestamp.split()[1] 38 | 39 | with open(SAVE_FILENAME, 'rb') as f: 40 | 
s3.upload_fileobj(f, BUCKET_NAME, f"rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz") 41 | 42 | def compare_nparray(previous_df, current_df, workload_cols, feature_cols): 43 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 44 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 45 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 46 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 47 | 48 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 49 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 50 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 51 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 52 | 53 | changed_indices = [] 54 | removed_indices = [] 55 | 56 | prev_idx = 0 57 | curr_idx = 0 58 | while True: 59 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 60 | break 61 | elif curr_idx == len(current_indices): 62 | prev_workload = previous_values[prev_idx][0] 63 | if prev_workload not in current_values[:,0]: 64 | removed_indices.append(previous_indices[prev_idx]) 65 | prev_idx += 1 66 | continue 67 | else: 68 | raise Exception('workload error') 69 | break 70 | elif prev_idx == len(previous_indices): 71 | curr_workload = current_values[curr_idx][0] 72 | curr_feature = current_values[curr_idx][1] 73 | if curr_workload not in previous_values[:,0]: 74 | changed_indices.append(current_indices[curr_idx]) 75 | curr_idx += 1 76 | continue 77 | else: 78 | raise Exception('workload error') 79 | break 80 | 81 | prev_workload = previous_values[prev_idx][0] 82 | prev_feature = previous_values[prev_idx][1] 83 | curr_workload = current_values[curr_idx][0] 84 | curr_feature = current_values[curr_idx][1] 85 | 86 | if prev_workload != curr_workload: 87 | if curr_workload not in previous_values[:,0]: 88 | changed_indices.append(current_indices[curr_idx]) 89 | curr_idx += 1 90 | elif prev_workload not in current_values[:,0]: 91 | removed_indices.append(previous_indices[prev_idx]) 92 | prev_idx += 1 93 | else: 94 | raise Exception('workload error') 95 | else: 96 | if prev_feature != curr_feature: 97 | changed_indices.append(current_indices[curr_idx]) 98 | curr_idx += 1 99 | prev_idx += 1 100 | 101 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 102 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 103 | for col in feature_cols: 104 | removed_df[col] = 0 105 | 106 | # removed_df have one more column, 'Ceased' 107 | removed_df['Ceased'] = True 108 | return changed_df, removed_df 109 | 110 | def date_range(start, end): 111 | delta = end - start 112 | days = [start + timedelta(days=i) for i in range(delta.days + 1)] 113 | return days 114 | 115 | def time_format(timestamp): 116 | return 'T'.join(str(timestamp).split()) 117 | 118 | days = date_range(start_date, end_date) 119 | all_df = pd.read_pickle('./df_0413_0501.pkl') 120 | perf_start_total = time.time() 121 | for day in days: 122 | perf_start = time.time() 123 | day_cond = (str(day) <= all_df['time'] ) & (all_df['time'] < str(day + timedelta(days=1))) 124 | day_df = all_df[day_cond].copy() 125 | frequency_map = {'<5%': 3.0, '5-10%': 
2.5, '10-15%': 2.0, '15-20%': 1.5, '>20%': 1.0} 126 | day_df = day_df.replace({'IF': frequency_map}) 127 | day_df['SPS'] = day_df['SPS'].astype(int) 128 | day_df['SpotPrice'] = day_df['SpotPrice'].astype(float) 129 | day_df['SpotPrice'] = day_df['SpotPrice'].round(5) 130 | 131 | print(f"elapsed time - single day query: {time.time() - perf_start}") 132 | # day_df['OndemandPrice'] = (100 * day_df['SpotPrice']) / (100 - day_df['Savings']) 133 | 134 | day_timestamps = sorted(list(day_df['time'].unique())) 135 | for timestamp in day_timestamps: 136 | perf_start = time.time() 137 | current_df = day_df[day_df['time'] == timestamp].copy() 138 | print(f"elapsed time - select by time: {time.time() - perf_start}") 139 | if SAVE_FILENAME not in os.listdir('./'): 140 | save_gz_s3(current_df, timestamp) 141 | tsupload.upload_timestream(current_df) 142 | else: 143 | perf_start = time.time() 144 | previous_df = pd.read_csv(SAVE_FILENAME, compression='gzip', header=0, sep=',', quotechar='"') 145 | save_gz_s3(current_df, timestamp) 146 | print(f"elapsed time - read and save: {time.time() - perf_start}") 147 | perf_start = time.time() 148 | changed_df, removed_df = compare_nparray(previous_df, current_df, workload_cols, feature_cols) 149 | print(f"elapsed time - compare: {time.time() - perf_start}") 150 | perf_start = time.time() 151 | # changed_df and removed_df have different shape, because of 'Ceased' column 152 | tsupload.upload_timestream(changed_df) 153 | tsupload.upload_timestream(removed_df) 154 | print(f"elapsed time - upload: {time.time() - perf_start}") 155 | print(f"elapsed time - total single day: {time.time() - perf_start_total}") 156 | -------------------------------------------------------------------------------- /migration/aws/migration_kmubigdata_to_spotrank.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import tsquery 4 | import tsupload 5 | import pandas as pd 6 | from multiprocessing import Pool 7 | 8 | import time 9 | import pytz 10 | from datetime import datetime, timedelta 11 | 12 | 13 | SAVE_FILENAME = 'latest.csv.gz' 14 | QUERY_PROFILE_NAME = 'kmubigdata' 15 | UPLOAD_PROFILE_NAME = 'spotrank' 16 | BUCKET_NAME = 'spotlake-test' 17 | REGION_NAME = "us-west-2" 18 | QUERY_DATABASE_NAME = "spotrank-timestream" 19 | QUERY_TABLE_NAME = "spot-table" 20 | UPLOAD_DATABASE_NAME = 'spotlake' 21 | UPLOAD_TABLE_NAME = 'aws' 22 | NUM_CPUS = 8 23 | if 24 % NUM_CPUS != 0: 24 | raise Exception('use only 1, 2, 3, 4, 6, 8, 12, 24') 25 | CHUNK_HOUR = 24 / NUM_CPUS 26 | 27 | start_date = datetime(2022, 9, 28, 16, 20, 0, 0, pytz.UTC) 28 | end_date = datetime(2022, 9, 29, 3, 10, 0, 0, pytz.UTC) 29 | 30 | tsquery.PROFILE_NAME = QUERY_PROFILE_NAME # tsquery.PROFILE_NAME must be credential of source database 31 | tsquery.REGION_NAME = REGION_NAME 32 | tsquery.DATABASE_NAME = QUERY_DATABASE_NAME 33 | tsquery.TABLE_NAME = QUERY_TABLE_NAME 34 | tsupload.PROFILE_NAME = UPLOAD_PROFILE_NAME 35 | tsupload.REGION_NAME = REGION_NAME 36 | tsupload.DATABASE_NAME = UPLOAD_DATABASE_NAME 37 | tsupload.TABLE_NAME = UPLOAD_TABLE_NAME 38 | 39 | workload_cols = ['InstanceType', 'Region', 'AZ'] 40 | feature_cols = ['SPS', 'IF', 'SpotPrice'] 41 | 42 | 43 | # compress data as gzip file, save to local file system, upload file to s3 44 | def save_gz_s3(df, timestamp): 45 | # compress and save to LFS 46 | df.to_csv(SAVE_FILENAME, index=False, compression="gzip") 47 | 48 | # upload compressed file to S3 49 | session = 
boto3.Session(profile_name=UPLOAD_PROFILE_NAME) 50 | s3 = session.client('s3') 51 | s3_dir_name = '/'.join(timestamp.split()[0].split('-')) 52 | s3_obj_name = timestamp.split()[1] 53 | 54 | with open(SAVE_FILENAME, 'rb') as f: 55 | s3.upload_fileobj(f, BUCKET_NAME, f"rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz") 56 | 57 | 58 | def compare_nparray(previous_df, current_df, workload_cols, feature_cols): 59 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 60 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 61 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 62 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 63 | 64 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 65 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 66 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 67 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 68 | 69 | changed_indices = [] 70 | removed_indices = [] 71 | 72 | prev_idx = 0 73 | curr_idx = 0 74 | while True: 75 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 76 | break 77 | elif curr_idx == len(current_indices): 78 | prev_workload = previous_values[prev_idx][0] 79 | if prev_workload not in current_values[:,0]: 80 | removed_indices.append(previous_indices[prev_idx]) 81 | prev_idx += 1 82 | continue 83 | else: 84 | raise Exception('workload error') 85 | break 86 | elif prev_idx == len(previous_indices): 87 | curr_workload = current_values[curr_idx][0] 88 | curr_feature = current_values[curr_idx][1] 89 | if curr_workload not in previous_values[:,0]: 90 | changed_indices.append(current_indices[curr_idx]) 91 | curr_idx += 1 92 | continue 93 | else: 94 | raise Exception('workload error') 95 | break 96 | 97 | prev_workload = previous_values[prev_idx][0] 98 | prev_feature = previous_values[prev_idx][1] 99 | curr_workload = current_values[curr_idx][0] 100 | curr_feature = current_values[curr_idx][1] 101 | 102 | if prev_workload != curr_workload: 103 | if curr_workload not in previous_values[:,0]: 104 | changed_indices.append(current_indices[curr_idx]) 105 | curr_idx += 1 106 | elif prev_workload not in current_values[:,0]: 107 | removed_indices.append(previous_indices[prev_idx]) 108 | prev_idx += 1 109 | else: 110 | raise Exception('workload error') 111 | else: 112 | if prev_feature != curr_feature: 113 | changed_indices.append(current_indices[curr_idx]) 114 | curr_idx += 1 115 | prev_idx += 1 116 | 117 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 118 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 119 | for col in feature_cols: 120 | removed_df[col] = 0 121 | 122 | # removed_df have one more column, 'Ceased' 123 | removed_df['Ceased'] = True 124 | return changed_df 125 | 126 | 127 | def date_range(start, end): 128 | delta = end - start 129 | days = [start + timedelta(minutes=i*10) for i in range(delta.seconds//60//10 + 1)] 130 | return days 131 | 132 | 133 | def time_format(timestamp): 134 | return 'T'.join(str(timestamp).split()) 135 | 136 | 137 | days = date_range(start_date, end_date) 138 | 139 | perf_start_total = time.time() 140 | 
perf_start = time.time() 141 | 142 | start_end_time_process_list = [] 143 | 144 | day_df = tsquery.get_timestream(start_date, end_date) 145 | 146 | frequency_map = {'<5%': 3.0, '5-10%': 2.5, '10-15%': 2.0, '15-20%': 1.5, '>20%': 1.0} 147 | day_df = day_df.replace({'IF': frequency_map}) 148 | day_df['SPS'] = day_df['SPS'].astype(int) 149 | day_df['SpotPrice'] = day_df['SpotPrice'].astype(float) 150 | day_df['SpotPrice'] = day_df['SpotPrice'].round(5) 151 | 152 | print(f"elapsed time - single day query: {time.time() - perf_start}") 153 | day_df['OndemandPrice'] = (100 * day_df['SpotPrice']) / (100 - day_df['Savings']) 154 | day_df['OndemandPrice'] = day_df['OndemandPrice'].astype(float) 155 | day_df['OndemandPrice'] = day_df['OndemandPrice'].round(5) 156 | 157 | day_timestamps = sorted(list(day_df['time'].unique())) 158 | for timestamp in day_timestamps: 159 | perf_start = time.time() 160 | current_df = day_df[day_df['time'] == timestamp].copy() 161 | print(f"elapsed time - select by time: {time.time() - perf_start}") 162 | if SAVE_FILENAME not in os.listdir('./'): 163 | save_gz_s3(current_df, timestamp) 164 | tsupload.upload_timestream(current_df) 165 | else: 166 | perf_start = time.time() 167 | previous_df = pd.read_csv(SAVE_FILENAME, compression='gzip', header=0, sep=',', quotechar='"') 168 | save_gz_s3(current_df, timestamp) 169 | print(f"elapsed time - read and save: {time.time() - perf_start}") 170 | perf_start = time.time() 171 | changed_df = compare_nparray(previous_df, current_df, workload_cols, feature_cols) 172 | print(f"elapsed time - compare: {time.time() - perf_start}") 173 | perf_start = time.time() 174 | # changed_df and removed_df have different shape, because of 'Ceased' column 175 | tsupload.upload_timestream(changed_df) 176 | print(f"elapsed time - upload: {time.time() - perf_start}") 177 | print(f"elapsed time - total: {time.time() - perf_start_total}") 178 | -------------------------------------------------------------------------------- /migration/gcp/gcp_write_timestream.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore.config import Config 3 | import time 4 | import os 5 | import pandas as pd 6 | import gzip 7 | from datetime import datetime 8 | 9 | ### write stored rawdata on tsdb 10 | ### before this, have to aws sync spotlake bucket 11 | ### plz check file paths, DATABASE_NAME and TABLE_NAME before running this 12 | 13 | session = boto3.session.Session(region_name='us-west-2') 14 | write_client = session.client('timestream-write', config=Config(read_timeout=20, max_pool_connections=5000, retries={'max_attempts':10})) 15 | client = session.client('timestream-query') 16 | 17 | DATABASE_NAME = 'spotlake' 18 | TABLE_NAME = 'gcp' 19 | 20 | FILE_PATH = '/home/ubuntu/gcp_rawdata' # 2022/MM/dd 21 | TEMP_FILE_PATH = './' 22 | NEW_FILE_PATH = '/home/ubuntu/gcp_newrawdata' 23 | workload_cols = ['InstanceType', 'Region'] 24 | feature_cols = ['Calculator OnDemand Price', 'Calculator Preemptible Price', 'VM Instance OnDemand Price', 'VM Instance Preemptible Price'] 25 | 26 | 27 | # Submit Batch To Timestream 28 | def submit_batch(records, counter, recursive): 29 | if recursive == 10: 30 | return 31 | try: 32 | result = write_client.write_records(DatabaseName=DATABASE_NAME, TableName = TABLE_NAME, Records=records, CommonAttributes={}) 33 | except write_client.exceptions.RejectedRecordsException as err: 34 | re_records = [] 35 | for rr in err.response["RejectedRecords"]: 36 | print(rr['Reason']) 37 | 
re_records.append(records[rr["RecordIndex"]]) 38 | submit_batch(re_records, counter, recursive + 1) 39 | except Exception as err: 40 | print(err) 41 | exit() 42 | 43 | 44 | # Check Database And Table Are Exist and Upload Data to Timestream 45 | def upload_timestream(data, timestamp): 46 | print(len(data)) 47 | 48 | time_value = time.strptime(timestamp.strftime("%Y-%m-%d %H:%M"), '%Y-%m-%d %H:%M') 49 | time_value = time.mktime(time_value) 50 | time_value = str(int(round(time_value * 1000))) 51 | 52 | 53 | records = [] 54 | counter = 0 55 | for idx, row in data.iterrows(): 56 | 57 | dimensions = [] 58 | for column in data.columns: 59 | if column in ['InstanceType', 'Region', 'Ceased']: 60 | dimensions.append({'Name':column, 'Value': str(row[column])}) 61 | 62 | submit_data = { 63 | 'Dimensions': dimensions, 64 | 'MeasureName': 'gcp_values', 65 | 'MeasureValues': [], 66 | 'MeasureValueType': 'MULTI', 67 | 'Time': time_value 68 | } 69 | for column, types in [('Calculator OnDemand Price', 'DOUBLE'), ('Calculator Preemptible Price', 'DOUBLE'), ('VM Instance OnDemand Price', 'DOUBLE'), ('VM Instance Preemptible Price', 'DOUBLE')]: 70 | submit_data['MeasureValues'].append({'Name': column, 'Value': str(row[column]), 'Type' : types}) 71 | records.append(submit_data) 72 | counter += 1 73 | if len(records) == 100: 74 | submit_batch(records, counter, 0) 75 | records = [] 76 | 77 | if len(records) != 0: 78 | submit_batch(records, counter, 0) 79 | 80 | print(f"end : {counter}") 81 | 82 | 83 | def compare(previous_df, current_df, workload_cols, feature_cols): 84 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 85 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 86 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 87 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 88 | 89 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 90 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 91 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 92 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 93 | 94 | changed_indices = [] 95 | removed_indices = [] 96 | 97 | prev_idx = 0 98 | curr_idx = 0 99 | while True: 100 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 101 | break 102 | elif curr_idx == len(current_indices): 103 | prev_workload = previous_values[prev_idx][0] 104 | if prev_workload not in current_values[:,0]: 105 | removed_indices.append(previous_indices[prev_idx]) 106 | prev_idx += 1 107 | continue 108 | else: 109 | raise Exception('workload error') 110 | break 111 | elif prev_idx == len(previous_indices): 112 | curr_workload = current_values[curr_idx][0] 113 | curr_feature = current_values[curr_idx][1] 114 | if curr_workload not in previous_values[:,0]: 115 | changed_indices.append(current_indices[curr_idx]) 116 | curr_idx += 1 117 | continue 118 | else: 119 | raise Exception('workload error') 120 | break 121 | 122 | prev_workload = previous_values[prev_idx][0] 123 | prev_feature = previous_values[prev_idx][1] 124 | curr_workload = current_values[curr_idx][0] 125 | curr_feature = current_values[curr_idx][1] 126 | 127 | if prev_workload != curr_workload: 
128 | if curr_workload not in previous_values[:,0]: 129 | changed_indices.append(current_indices[curr_idx]) 130 | curr_idx += 1 131 | elif prev_workload not in current_values[:,0]: 132 | removed_indices.append(previous_indices[prev_idx]) 133 | prev_idx += 1 134 | continue 135 | else: 136 | raise Exception('workload error') 137 | else: 138 | if prev_feature != curr_feature: 139 | changed_indices.append(current_indices[curr_idx]) 140 | curr_idx += 1 141 | prev_idx += 1 142 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 143 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 144 | 145 | for col in feature_cols: 146 | removed_df[col] = 0 147 | 148 | # removed_df have one more column, 'Ceased' 149 | removed_df['Ceased'] = True 150 | 151 | return changed_df, removed_df 152 | 153 | 154 | # write first rawdata to timestream 155 | 156 | paths = [] 157 | for (path, dir, files) in os.walk(NEW_FILE_PATH): 158 | for filename in files: 159 | final_path = path + '/' + filename 160 | paths.append(final_path) 161 | paths.sort() 162 | 163 | 164 | path = paths[0] 165 | 166 | changed_time = path.split('gcp_newrawdata/')[1].split('.csv.gz')[0] 167 | timestamp = datetime.strptime(changed_time, '%Y/%m/%d/%H:%M:%S') 168 | print(timestamp) 169 | df = pd.DataFrame() 170 | with gzip.open(path, 'rb') as f: 171 | df = pd.read_csv(f) 172 | 173 | upload_timestream(df, timestamp) 174 | 175 | for i in range (1, len(paths)): 176 | prev_path = paths[i-1] 177 | curr_path = paths[i] 178 | changed_time = curr_path.split('gcp_newrawdata/')[1].split('.csv.gz')[0] 179 | timestamp = datetime.strptime(changed_time, '%Y/%m/%d/%H:%M:%S') 180 | 181 | df_prev = pd.DataFrame() 182 | df_curr = pd.DataFrame() 183 | 184 | with gzip.open(prev_path, 'rb') as f: 185 | df_prev = pd.read_csv(f) 186 | with gzip.open(curr_path, 'rb') as f: 187 | df_curr = pd.read_csv(f) 188 | 189 | changed_df, removed_df = compare(df_prev, df_curr, workload_cols, feature_cols) 190 | upload_timestream(changed_df, timestamp) 191 | upload_timestream(removed_df, timestamp) -------------------------------------------------------------------------------- /migration/aws/migration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import tsquery 4 | import tsupload 5 | import pandas as pd 6 | from multiprocessing import Pool 7 | 8 | import time 9 | import pytz 10 | from datetime import datetime, timedelta 11 | 12 | 13 | SAVE_FILENAME = 'latest.csv.gz' 14 | PROFILE_NAME = 'default' 15 | BUCKET_NAME = 'spotlake' 16 | REGION_NAME = "us-west-2" 17 | DATABASE_NAME = 'spotlake' 18 | TABLE_NAME = 'aws' 19 | NUM_CPUS = 8 20 | if 24 % NUM_CPUS != 0: 21 | raise Exception('use only 1, 2, 3, 4, 6, 8, 12, 24') 22 | CHUNK_HOUR = 24 / NUM_CPUS 23 | 24 | start_date = datetime(2022, 1, 1, 0, 0, 0, 0, pytz.UTC) 25 | end_date = datetime(2022, 4, 13, 0, 0, 0, 0, pytz.UTC) 26 | 27 | workload_cols = ['InstanceType', 'Region', 'AZ'] 28 | feature_cols = ['SPS', 'IF', 'SpotPrice'] 29 | 30 | # tsquery.PROFILE_NAME = PROFILE_NAME # tsquery.PROFILE_NAME must be credential of source database 31 | tsquery.REGION_NAME = REGION_NAME 32 | tsupload.PROFILE_NAME = PROFILE_NAME 33 | tsupload.REGION_NAME = REGION_NAME 34 | tsupload.DATABASE_NAME = DATABASE_NAME 35 | tsupload.TABLE_NAME = TABLE_NAME 36 | 37 | 38 | # compress data as gzip file, save to local file system, upload file to s3 39 | def save_gz_s3(df, timestamp): 40 | # compress and save to LFS 41 | 
df.to_csv(SAVE_FILENAME, index=False, compression="gzip") 42 | 43 | # upload compressed file to S3 44 | session = boto3.Session(profile_name=PROFILE_NAME) 45 | s3 = session.client('s3') 46 | s3_dir_name = '/'.join(timestamp.split()[0].split('-')) 47 | s3_obj_name = timestamp.split()[1] 48 | 49 | with open(SAVE_FILENAME, 'rb') as f: 50 | s3.upload_fileobj(f, BUCKET_NAME, f"rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz") 51 | 52 | 53 | def compare_nparray(previous_df, current_df, workload_cols, feature_cols): 54 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 55 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 56 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 57 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 58 | 59 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 60 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 61 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 62 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 63 | 64 | changed_indices = [] 65 | removed_indices = [] 66 | 67 | prev_idx = 0 68 | curr_idx = 0 69 | while True: 70 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 71 | break 72 | elif curr_idx == len(current_indices): 73 | prev_workload = previous_values[prev_idx][0] 74 | if prev_workload not in current_values[:,0]: 75 | removed_indices.append(previous_indices[prev_idx]) 76 | prev_idx += 1 77 | continue 78 | else: 79 | raise Exception('workload error') 80 | break 81 | elif prev_idx == len(previous_indices): 82 | curr_workload = current_values[curr_idx][0] 83 | curr_feature = current_values[curr_idx][1] 84 | if curr_workload not in previous_values[:,0]: 85 | changed_indices.append(current_indices[curr_idx]) 86 | curr_idx += 1 87 | continue 88 | else: 89 | raise Exception('workload error') 90 | break 91 | 92 | prev_workload = previous_values[prev_idx][0] 93 | prev_feature = previous_values[prev_idx][1] 94 | curr_workload = current_values[curr_idx][0] 95 | curr_feature = current_values[curr_idx][1] 96 | 97 | if prev_workload != curr_workload: 98 | if curr_workload not in previous_values[:,0]: 99 | changed_indices.append(current_indices[curr_idx]) 100 | curr_idx += 1 101 | elif prev_workload not in current_values[:,0]: 102 | removed_indices.append(previous_indices[prev_idx]) 103 | prev_idx += 1 104 | else: 105 | raise Exception('workload error') 106 | else: 107 | if prev_feature != curr_feature: 108 | changed_indices.append(current_indices[curr_idx]) 109 | curr_idx += 1 110 | prev_idx += 1 111 | 112 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 113 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 114 | for col in feature_cols: 115 | removed_df[col] = 0 116 | 117 | # removed_df have one more column, 'Ceased' 118 | removed_df['Ceased'] = True 119 | return changed_df, removed_df 120 | 121 | 122 | def date_range(start, end): 123 | delta = end - start 124 | days = [start + timedelta(days=i) for i in range(delta.days + 1)] 125 | return days 126 | 127 | 128 | def time_format(timestamp): 129 | return 'T'.join(str(timestamp).split()) 130 | 131 
| 132 | days = date_range(start_date, end_date) 133 | 134 | perf_start_total = time.time() 135 | for idx in range(len(days)-1): 136 | perf_start = time.time() 137 | start_timestamp = days[idx] 138 | end_timestamp = days[idx+1] 139 | 140 | start_end_time_process_list = [] 141 | for i in range(NUM_CPUS): 142 | start_time_process = start_timestamp + timedelta(hours = CHUNK_HOUR*i) 143 | end_time_process = start_timestamp + timedelta(hours = CHUNK_HOUR*(i+1)) 144 | start_end_time_process_list.append((time_format(start_time_process), time_format(end_time_process))) 145 | 146 | with Pool(NUM_CPUS) as p: 147 | process_df_list = p.starmap(tsquery.get_timestream, start_end_time_process_list) 148 | 149 | day_df = pd.concat(process_df_list, axis=0, ignore_index=True) 150 | frequency_map = {'<5%': 3.0, '5-10%': 2.5, '10-15%': 2.0, '15-20%': 1.5, '>20%': 1.0} 151 | day_df = day_df.replace({'IF': frequency_map}) 152 | day_df['SPS'] = day_df['SPS'].astype(int) 153 | day_df['SpotPrice'] = day_df['SpotPrice'].astype(float) 154 | day_df['SpotPrice'] = day_df['SpotPrice'].round(5) 155 | 156 | print(f"elapsed time - single day query: {time.time() - perf_start}") 157 | # day_df['OndemandPrice'] = (100 * day_df['SpotPrice']) / (100 - day_df['Savings']) 158 | 159 | day_timestamps = sorted(list(day_df['time'].unique())) 160 | for timestamp in day_timestamps: 161 | perf_start = time.time() 162 | current_df = day_df[day_df['time'] == timestamp].copy() 163 | print(f"elapsed time - select by time: {time.time() - perf_start}") 164 | if SAVE_FILENAME not in os.listdir('./'): 165 | save_gz_s3(current_df, timestamp) 166 | tsupload.upload_timestream(current_df) 167 | else: 168 | perf_start = time.time() 169 | previous_df = pd.read_csv(SAVE_FILENAME, compression='gzip', header=0, sep=',', quotechar='"') 170 | save_gz_s3(current_df, timestamp) 171 | print(f"elapsed time - read and save: {time.time() - perf_start}") 172 | perf_start = time.time() 173 | changed_df, removed_df = compare_nparray(previous_df, current_df, workload_cols, feature_cols) 174 | print(f"elapsed time - compare: {time.time() - perf_start}") 175 | perf_start = time.time() 176 | # changed_df and removed_df have different shape, because of 'Ceased' column 177 | tsupload.upload_timestream(changed_df) 178 | tsupload.upload_timestream(removed_df) 179 | print(f"elapsed time - upload: {time.time() - perf_start}") 180 | print(f"elapsed time - total single day: {time.time() - perf_start_total}") 181 | --------------------------------------------------------------------------------
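A note on the change-detection logic that recurs above (compare_nparray in the AWS scripts and compare in gcp_write_timestream.py): a row counts as changed when its workload key is new or its feature values differ from the previous snapshot, and as removed when its workload disappears, in which case the features are zeroed and a Ceased flag is added. The snippet below is a simplified, self-contained pandas-merge sketch of those semantics with made-up sample rows; it illustrates the idea and is not the implementation used by the scripts.

```python
# Simplified sketch of the changed/removed semantics used by the migration scripts.
import pandas as pd

WORKLOAD_COLS = ['InstanceType', 'Region', 'AZ']
FEATURE_COLS = ['SPS', 'IF', 'SpotPrice']

previous_df = pd.DataFrame({
    'InstanceType': ['t3.micro', 'm5.large'],
    'Region': ['us-west-2', 'us-west-2'],
    'AZ': ['usw2-az1', 'usw2-az2'],
    'SPS': [3, 2], 'IF': [3.0, 2.5], 'SpotPrice': [0.0031, 0.0350],
})
current_df = pd.DataFrame({
    'InstanceType': ['t3.micro', 'c5.xlarge'],   # m5.large vanished, c5.xlarge appeared
    'Region': ['us-west-2', 'us-west-2'],
    'AZ': ['usw2-az1', 'usw2-az1'],
    'SPS': [3, 1], 'IF': [3.0, 1.0], 'SpotPrice': [0.0035, 0.0680],  # t3.micro price changed
})

merged = current_df.merge(previous_df, on=WORKLOAD_COLS, how='outer',
                          suffixes=('', '_prev'), indicator=True)

# changed: workloads that are new, or whose feature tuple differs from the previous snapshot
feature_changed = (merged['_merge'] == 'both') & ~(
    (merged['SPS'] == merged['SPS_prev']) &
    (merged['IF'] == merged['IF_prev']) &
    (merged['SpotPrice'] == merged['SpotPrice_prev'])
)
changed_df = merged[(merged['_merge'] == 'left_only') | feature_changed][WORKLOAD_COLS + FEATURE_COLS]

# removed: workloads present before but missing now; zero the features and flag them as Ceased,
# mirroring what the scripts upload for vanished workloads
removed_df = merged[merged['_merge'] == 'right_only'][WORKLOAD_COLS].copy()
for col in FEATURE_COLS:
    removed_df[col] = 0
removed_df['Ceased'] = True

print(changed_df)   # expected: the changed t3.micro row and the new c5.xlarge row
print(removed_df)   # expected: the vanished m5.large workload, zeroed and marked Ceased
```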