├── README.md
├── migration
│   ├── aws
│   │   ├── requirements.txt
│   │   ├── settings.sh
│   │   ├── README.md
│   │   ├── migration_copy.py
│   │   ├── tsupload.py
│   │   ├── tsquery.py
│   │   ├── migration_gap.py
│   │   ├── migration_loss.py
│   │   ├── migration_kmubigdata_to_spotrank.py
│   │   └── migration.py
│   ├── gcp
│   │   ├── gcp_spotlake_migration.sh
│   │   ├── gcp_preprocess_rawdata.py
│   │   └── gcp_write_timestream.py
│   └── azure
│       ├── modify_s3_azure_data.py
│       └── azure_data_changed.py
└── LICENSE

/README.md:
--------------------------------------------------------------------------------
1 | # spotlake-migration
--------------------------------------------------------------------------------
/migration/aws/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3==1.24.45
2 | pandas==1.4.3
--------------------------------------------------------------------------------
/migration/gcp/gcp_spotlake_migration.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | aws s3 sync s3://tmp-gcp/rawdata /home/ubuntu/gcp_rawdata
4 | python3 gcp_preprocess_rawdata.py
5 | aws s3 sync s3://spotlake/rawdata/gcp /home/ubuntu/gcp_newrawdata
6 | python3 gcp_write_timestream.py
--------------------------------------------------------------------------------
/migration/aws/settings.sh:
--------------------------------------------------------------------------------
1 | sudo DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y
2 | 
3 | sudo apt-get update
4 | sudo apt-get install awscli -y
5 | sudo apt-get install python3-pip -y
6 | aws --version
7 | pip --version
8 | 
9 | pip install -r requirements.txt
10 | pip install --upgrade awscli
11 | 
12 | aws configure
13 | 
--------------------------------------------------------------------------------
/migration/aws/README.md:
--------------------------------------------------------------------------------
1 | # Spotlake-Migration: AWS
2 | 
3 | This repository contains code that moves spot data from one Timestream database to another.
4 | 
5 | ### How To Use
6 | 1. Clone this repository
7 | 2. Run 'settings.sh'
8 | 3. Run the migration scripts you need
9 | (optional) Put the latest snapshot in this directory as 'latest.csv.gz' before running a migration script
10 | 
11 | ### Migration codes
12 | * migration.py : Migration code with multi-processing
13 | * migration_gap.py : Migration code for the data gap between the migration and the collector
14 | * migration_copy.py : Migration code to copy data from one Timestream DB to another in the same account
15 | * migration_loss.py : Migration code to upload data from an S3 bucket to a Timestream DB
16 | * migration_kmubigdata_to_spotrank.py : Migration code for data from the Timestream DB in the KMUBIGDATA account to the Timestream DB in the SPOTRANK account
17 | 
18 | 
--------------------------------------------------------------------------------
/migration/azure/modify_s3_azure_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | 
5 | dir_path = "./"
6 | 
7 | for (root, directories, files) in os.walk(dir_path):
8 |     for file in files:
9 |         # for every csv.gz file in the subdirectories under the current location
10 |         if '.csv.gz' in file:
11 |             file_path = os.path.join(root, file)
12 |             # decompress the file and load it into a DataFrame
13 |             df = pd.read_csv(f'{file_path}', compression='gzip')
14 |             # drop rows that have no spotPrice
15 |             df = df.dropna(subset=['spotPrice'])
16 |             # drop the unused vendor column
17 |             df = df.drop(columns=['vendor'], axis=1)
18 | 
19 |             # if instanceTier actually holds an instanceType value, move it to instanceType and set instanceTier to NaN
20 |             for index, row in df.iterrows():
21 |                 if not(row['instanceTier'] == 'Standard' or row['instanceTier'] == 'Basic'):
22 |                     df.loc[index, 'instanceType'] = df.loc[index, 'instanceTier']
23 |                     df.loc[index, 'instanceTier'] = np.nan
24 |             # compress the result again and save it in place
25 |             df.to_csv(f'{file_path}', index=False, compression='gzip')
26 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Distributed Data Processing Systems Lab.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/migration/azure/azure_data_changed.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 | import boto3
4 | from compare_data import compare
5 | 
6 | pd.set_option('display.max_columns', None)
7 | WORKLOAD_COLS = ['instanceTier', 'instanceType', 'region']
8 | FEATURE_COLS = ['ondemandPrice', 'spotPrice']
9 | 
10 | # download the rawdata from S3 to the local machine
11 | s3 = boto3.resource('s3', aws_access_key_id='', aws_secret_access_key='')
12 | bucket = s3.Bucket('tmp-azure')
13 | prefix = 'rawdata'
14 | for object in bucket.objects.filter(Prefix = 'rawdata'):
15 |     os.makedirs(os.path.dirname(f'./{object.key}'), exist_ok=True)
16 |     bucket.download_file(object.key, object.key)
17 | 
18 | # store the file paths of the local rawdata in a list
19 | file_list = []
20 | for (path, dir, files) in os.walk("./rawdata"):
21 |     for filename in files:
22 |         ext = os.path.splitext(filename)[-1]
23 |         if ext == '.gz':
24 |             file_list.append("%s/%s" % (path, filename))
25 | 
26 | # sort so the files can be compared in chronological order
27 | file_list.sort()
28 | 
29 | # use the first rawdata as previous_df, take the data from 10 minutes later as current_df, and keep comparing each file with the next one
30 | for i in range(1, len(file_list)):
31 |     previous_df = pd.read_csv(file_list[i-1], compression='gzip')
32 |     current_df = pd.read_csv(file_list[i], compression='gzip')
33 |     try:
34 |         changed_df = compare(previous_df, current_df, WORKLOAD_COLS, FEATURE_COLS)
35 |         if not changed_df.empty:
36 |             print(i)
37 |             print(changed_df)
38 |     except Exception as e:
39 |         print(f"exception{e} : {i}")
40 | 
--------------------------------------------------------------------------------
/migration/aws/migration_copy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import boto3
3 | import tsquery
4 | import tsupload
5 | import pandas as pd
6 | from multiprocessing import Pool
7 | 
8 | import time
9 | import pytz
10 | from datetime import datetime, timedelta
11 | 
12 | 
13 | SAVE_FILENAME = 'latest.csv.gz'
14 | PROFILE_NAME = 'default'
15 | BUCKET_NAME = 'spotlake'
16 | REGION_NAME = "us-west-2"
17 | QUERY_DATABASE_NAME = "spotlake"
18 | QUERY_TABLE_NAME = "temp"
19 | UPLOAD_DATABASE_NAME = 'spotlake'
20 | UPLOAD_TABLE_NAME = 'aws'
21 | NUM_CPUS = 8
22 | if 24 % NUM_CPUS != 0:
23 |     raise Exception('use only 1, 2, 3, 4, 6, 8, 12, 24')
24 | CHUNK_HOUR = 24 / NUM_CPUS
25 | 
26 | start_date = datetime(2022, 1, 1, 0, 0, 0, 0, pytz.UTC)
27 | end_date = datetime(2022, 8, 23, 0, 0, 0, 0, pytz.UTC)
28 | 
29 | tsquery.PROFILE_NAME = PROFILE_NAME # tsquery.PROFILE_NAME must be credential of source database
30 | tsquery.REGION_NAME = REGION_NAME
31 | tsquery.DATABASE_NAME = QUERY_DATABASE_NAME
32 | tsquery.TABLE_NAME = QUERY_TABLE_NAME
33 | tsupload.PROFILE_NAME = PROFILE_NAME
34 | tsupload.REGION_NAME = REGION_NAME
35 | tsupload.DATABASE_NAME = UPLOAD_DATABASE_NAME
36 | tsupload.TABLE_NAME = UPLOAD_TABLE_NAME
37 | 
38 | 
39 | def date_range(start, end):
40 |     delta = end - start
41 |     days = [start + timedelta(days=i) for i in range(delta.days + 1)]
42 |     return days
43 | 
44 | 
45 | def time_format(timestamp):
46 |     return 'T'.join(str(timestamp).split())
47 | 
48 | 
49 | days = date_range(start_date, end_date)
50 | 
51 | perf_start_total = time.time()
52 | for idx in range(len(days)-1):
53 |     perf_start = time.time()
54 |     start_timestamp = days[idx]
55 |     end_timestamp = days[idx+1]
56 | 
57 |     start_end_time_process_list = []
58 |     for i in range(NUM_CPUS):
59 |         start_time_process = start_timestamp + timedelta(hours = CHUNK_HOUR*i)
60 |         end_time_process = start_timestamp + timedelta(hours = CHUNK_HOUR*(i+1))
61 |         start_end_time_process_list.append((time_format(start_time_process), time_format(end_time_process)))
62 | 
63 |     with Pool(NUM_CPUS) as p:
64 |         process_df_list = p.starmap(tsquery.get_timestream, start_end_time_process_list)
65 | 
66 |     day_df = pd.concat(process_df_list, axis=0, ignore_index=True)
67 |     day_df['SPS'] = day_df['SPS'].astype(int)
68 |     day_df['SpotPrice'] = day_df['SpotPrice'].astype(float)
69 |     day_df['SpotPrice'] = day_df['SpotPrice'].round(5)
70 | 
71 |     tsupload.upload_timestream(day_df)
72 |     print(f"elapsed time - single day query: {time.time() - perf_start}")
73 | print(f"elapsed time - total: {time.time() - perf_start_total}")
74 | 
--------------------------------------------------------------------------------
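For reference, the per-day parallel split used by migration_copy.py (above) can be reproduced in isolation. The sketch below only re-implements the NUM_CPUS/CHUNK_HOUR chunking arithmetic and prints the (start, end) pairs that would be handed to tsquery.get_timestream via Pool.starmap; it does not touch Timestream, S3, or any AWS credentials, and the sample day is arbitrary.

```python
# Minimal sketch of migration_copy.py's day chunking, no AWS access involved.
from datetime import datetime, timedelta
import pytz

NUM_CPUS = 8                  # must divide 24 evenly, as the script enforces
CHUNK_HOUR = 24 / NUM_CPUS    # hours of data handled by each worker process

def time_format(timestamp):
    # same helper as in migration_copy.py: '2022-01-01 00:00:00+00:00' -> '2022-01-01T00:00:00+00:00'
    return 'T'.join(str(timestamp).split())

day_start = datetime(2022, 1, 1, 0, 0, 0, 0, pytz.UTC)   # sample day only

windows = []
for i in range(NUM_CPUS):
    chunk_start = day_start + timedelta(hours=CHUNK_HOUR * i)
    chunk_end = day_start + timedelta(hours=CHUNK_HOUR * (i + 1))
    windows.append((time_format(chunk_start), time_format(chunk_end)))

for w in windows:
    # each tuple becomes one tsquery.get_timestream(start, end) call via Pool.starmap
    print(w)
```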
/migration/aws/tsupload.py:
--------------------------------------------------------------------------------
1 | import time
2 | import boto3
3 | import pandas as pd
4 | from botocore.config import Config
5 | from botocore.exceptions import ClientError
6 | 
7 | 
8 | PROFILE_NAME = 'default'
9 | REGION_NAME = 'us-east-2'
10 | DATABASE_NAME = 'dbname'
11 | TABLE_NAME = 'tablename'
12 | 
13 | 
14 | # Submit Batch To Timestream
15 | def submit_batch(records, counter, recursive, write_client):
16 |     if recursive == 10:
17 |         return
18 |     try:
19 |         result = write_client.write_records(DatabaseName=DATABASE_NAME, TableName = TABLE_NAME, Records=records, CommonAttributes={})
20 |     except write_client.exceptions.RejectedRecordsException as err:
21 |         print(err)
22 |         re_records = []
23 |         for rr in err.response["RejectedRecords"]:
24 |             re_records.append(records[rr["RecordIndex"]])
25 |         submit_batch(re_records, counter, recursive + 1, write_client)
26 |     except Exception as err:
27 |         print(err)
28 |         exit()
29 | 
30 | 
31 | # Build Timestream records from the DataFrame and upload them in batches of 100
32 | def upload_timestream(data):
33 |     session = boto3.Session(profile_name=PROFILE_NAME, region_name=REGION_NAME)
34 |     write_client = session.client('timestream-write', config=Config(read_timeout=20, max_pool_connections=5000, retries={'max_attempts':10}))
35 | 
36 |     records = []
37 |     counter = 0
38 |     for idx, row in data.iterrows():
39 |         time_value = str(row['time']).split('+')[0]
40 |         time_value = time.strptime(time_value, '%Y-%m-%d %H:%M:%S')
41 |         time_value = time.mktime(time_value)
42 |         time_value = str(int(round(time_value * 1000)))
43 | 
44 |         dimensions = []
45 |         for column in data.columns:
46 |             if column in ['InstanceType', 'Region', 'AZ', 'Ceased']:
47 |                 dimensions.append({'Name':column, 'Value': str(row[column])})
48 | 
49 |         measures = []
50 |         for column, types in [('SPS', 'BIGINT'), ('IF', 'DOUBLE'), ('SpotPrice', 'DOUBLE')]:
51 |             measures.append({'Name': column, 'Value': str(row[column]), 'Type': types})
52 | 
53 |         submit_data = {
54 |             'Dimensions': dimensions,
55 |             'MeasureName': 'aws_values',
56 |             'MeasureValues': measures,
57 |             'MeasureValueType': 'MULTI',
58 |             'Time': time_value
59 |         }
60 | 
61 |         records.append(submit_data)
62 |         counter += 1
63 |         if len(records) == 100:
64 |             submit_batch(records, counter, 0, write_client)
65 |             records = []
66 | 
67 |     if len(records) != 0:
68 |         submit_batch(records, counter, 0, write_client)
69 | 
--------------------------------------------------------------------------------
/migration/gcp/gcp_preprocess_rawdata.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | from botocore.config import Config
3 | import os
4 | import pandas as pd
5 | import gzip
6 | from datetime import datetime, timezone
7 | 
8 | ### preprocess rawdata from the S3 temp bucket and save it to the 'spotlake' bucket
9 | ### before running this, sync the rawdata down from the S3 temp bucket (aws s3 sync)
10 | ### check NEW_BUCKET_NAME and the file paths before running this
11 | 
12 | session = boto3.session.Session(region_name='us-west-2')
13 | write_client = session.client('timestream-write', config=Config(read_timeout=20, max_pool_connections=5000, retries={'max_attempts':10}))
14 | client = session.client('timestream-query')
15 | 
16 | NEW_BUCKET_NAME = 'spotlake'
17 | 
18 | FILE_PATH = '/home/ubuntu/gcp_rawdata' # 2022/MM/dd
19 | TEMP_FILE_PATH = './'
20 | NEW_FILE_PATH = '/home/ubuntu/gcp_newrawdata'
21 | workload_cols = ['InstanceType', 'Region']
22 | feature_cols = ['Calculator OnDemand Price', 'Calculator Preemptible 
Price', 'VM Instance OnDemand Price', 'VM Instance Preemptible Price'] 23 | 24 | def save_raw(data, timestamp): 25 | SAVE_FILENAME = f"{TEMP_FILE_PATH}/spotlake_"+f"{timestamp}.csv.gz" 26 | data.to_csv(SAVE_FILENAME, index=False, compression='gzip') 27 | session = boto3.Session() 28 | s3 = session.client('s3') 29 | s3_dir_name = timestamp.strftime("%Y/%m/%d") 30 | s3_obj_name = timestamp.strftime("%H:%M:%S") 31 | with open(SAVE_FILENAME, 'rb') as f: 32 | s3.upload_fileobj( 33 | f, NEW_BUCKET_NAME, f"rawdata/gcp/{s3_dir_name}/{s3_obj_name}.csv.gz") 34 | 35 | for filename in os.listdir(f"{TEMP_FILE_PATH}/"): 36 | if "spotlake_" in filename: 37 | os.remove(f"{TEMP_FILE_PATH}/{filename}") 38 | 39 | 40 | # sort gcp_rawdata folder paths 41 | 42 | paths = [] 43 | for (path, dir, files) in os.walk(FILE_PATH): 44 | for filename in files: 45 | final_path = path + '/' + filename 46 | paths.append(final_path) 47 | paths.sort() 48 | 49 | # remove unnecessary vendor, change nan into -1 and save into tmp-change-bucket 50 | for path in paths: 51 | changed_time = path.split('gcp_rawdata/')[1].split('.csv.gz')[0] 52 | timestamp = datetime.strptime(changed_time, '%Y/%m/%d/%H:%M:%S') 53 | 54 | df_old = pd.DataFrame() 55 | 56 | with gzip.open(path, 'rb') as f: 57 | df_old = pd.read_csv(f) 58 | 59 | # remove Vendor, Calculator Savings, VM Instance Savings 60 | df_new = pd.DataFrame() 61 | try : 62 | df_new = df_old.drop(['Vendor', 'Calculator Savings', 'VM Instance Savings'], axis=1) 63 | except: 64 | df_new = df_old 65 | 66 | # # have to change nan into -1 67 | df_new = df_new.replace(float('nan'), -1) 68 | 69 | # # write to tmp-changed-gcp 70 | save_raw(df_new, timestamp) 71 | print(timestamp) 72 | 73 | -------------------------------------------------------------------------------- /migration/aws/tsquery.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import pandas as pd 3 | 4 | 5 | PROFILE_NAME = "source-profile" 6 | REGION_NAME = "us-west-2" 7 | 8 | 9 | def run_query(query_string): 10 | try: 11 | session = boto3.Session(profile_name=PROFILE_NAME, region_name=REGION_NAME) 12 | query_client = session.client('timestream-query') 13 | paginator = query_client.get_paginator('query') 14 | page_iterator = paginator.paginate(QueryString=query_string) 15 | for page in page_iterator: 16 | _parse_query_result(page) 17 | except Exception as err: 18 | print("Exception while running query:", err) 19 | 20 | 21 | def _parse_query_result(query_result): 22 | query_status = query_result["QueryStatus"] 23 | column_info = query_result['ColumnInfo'] 24 | for row in query_result['Rows']: 25 | _parse_row(column_info, row) 26 | 27 | 28 | def _parse_row(column_info, row): 29 | data = row['Data'] 30 | row_output = [] 31 | for j in range(len(data)): 32 | info = column_info[j] 33 | datum = data[j] 34 | row_output.append(_parse_datum(info, datum)) 35 | return "{%s}" % str(row_output) 36 | 37 | 38 | def _parse_datum(info, datum): 39 | if datum.get('NullValue', False): 40 | return "%s=NULL" % info['Name'], 41 | 42 | column_type = info['Type'] 43 | 44 | # If the column is of TimeSeries Type 45 | if 'TimeSeriesMeasureValueColumnInfo' in column_type: 46 | return _parse_time_series(info, datum) 47 | 48 | # If the column is of Array Type 49 | elif 'ArrayColumnInfo' in column_type: 50 | array_values = datum['ArrayValue'] 51 | return "%s=%s" % (info['Name'], _parse_array(info['Type']['ArrayColumnInfo'], array_values)) 52 | 53 | # If the column is of Row Type 54 | elif 'RowColumnInfo' in 
column_type: 55 | row_column_info = info['Type']['RowColumnInfo'] 56 | row_values = datum['RowValue'] 57 | return _parse_row(row_column_info, row_values) 58 | 59 | # If the column is of Scalar Type 60 | else: 61 | global timestream_data 62 | if info['Name'] == "time": 63 | timestream_data[info['Name']].append(datum['ScalarValue'].split('.')[0]+"+00:00") 64 | elif info['Name'] != "measure_name" and info['Name'] != "measure_value::double": 65 | timestream_data[info['Name']].append(datum['ScalarValue']) 66 | return _parse_column_name(info) + datum['ScalarValue'] 67 | 68 | 69 | def _parse_time_series(info, datum): 70 | time_series_output = [] 71 | for data_point in datum['TimeSeriesValue']: 72 | time_series_output.append("{time=%s, value=%s}" 73 | % (data_point['Time'], 74 | _parse_datum(info['Type']['TimeSeriesMeasureValueColumnInfo'], 75 | data_point['Value']))) 76 | return "[%s]" % str(time_series_output) 77 | 78 | 79 | def _parse_array(array_column_info, array_values): 80 | array_output = [] 81 | for datum in array_values: 82 | array_output.append(_parse_datum(array_column_info, datum)) 83 | 84 | return "[%s]" % str(array_output) 85 | 86 | 87 | def _parse_column_name(info): 88 | if 'Name' in info: 89 | return info['Name'] + "=" 90 | else: 91 | return "" 92 | 93 | 94 | def get_timestream(start_date, end_date): 95 | global timestream_data 96 | timestream_data = {"SpotPrice" : [], "Savings" : [], "SPS" : [], "AZ" : [], "Region" : [], "InstanceType" : [], "IF" : [], "time" : []} 97 | 98 | print(f"Start query ({start_date}~{end_date})") 99 | query_string = f"""SELECT * FROM "spotrank-timestream"."spot-table" WHERE time between from_iso8601_timestamp('{start_date}') and from_iso8601_timestamp('{end_date}') ORDER BY time""" 100 | run_query(query_string) 101 | print(start_date + "~" + end_date + " is end") 102 | timestream_df = pd.DataFrame(timestream_data) 103 | timestream_df.drop_duplicates(inplace=True) 104 | return timestream_df 105 | 106 | 107 | def get_timestamps(start_date, end_date): 108 | global timestream_data 109 | timestream_data = {"time" : []} 110 | 111 | print(f"Start query ({start_date}~{end_date})") 112 | query_string = f"""SELECT DISTINCT time FROM "spotrank-timestream"."spot-table" WHERE time between from_iso8601_date('{start_date}') and from_iso8601_date('{end_date}') ORDER BY time""" 113 | run_query(query_string) 114 | print(start_date + "~" + end_date + " is end") 115 | return timestream_data['time'] 116 | -------------------------------------------------------------------------------- /migration/aws/migration_gap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import tsquery 4 | import tsupload 5 | import pandas as pd 6 | from multiprocessing import Pool 7 | 8 | import time 9 | import pytz 10 | from datetime import datetime, timedelta 11 | 12 | 13 | SAVE_FILENAME = 'latest.csv.gz' 14 | PROFILE_NAME = 'default' 15 | BUCKET_NAME = 'spotlake' 16 | REGION_NAME = "us-west-2" 17 | DATABASE_NAME = 'spotlake' 18 | TABLE_NAME = 'aws' 19 | 20 | start_date = datetime(2022, 8, 23, 0, 0, 0, 0, pytz.UTC) 21 | end_date = datetime(2022, 8, 23, 7, 50, 0, 0, pytz.UTC) 22 | 23 | workload_cols = ['InstanceType', 'Region', 'AZ'] 24 | feature_cols = ['SPS', 'IF', 'SpotPrice'] 25 | 26 | # tsquery.PROFILE_NAME = PROFILE_NAME # tsquery.PROFILE_NAME must be credential of source database 27 | tsquery.REGION_NAME = REGION_NAME 28 | tsupload.PROFILE_NAME = PROFILE_NAME 29 | tsupload.REGION_NAME = REGION_NAME 30 | 
tsupload.DATABASE_NAME = DATABASE_NAME 31 | tsupload.TABLE_NAME = TABLE_NAME 32 | 33 | 34 | # compress data as gzip file, save to local file system, upload file to s3 35 | def save_gz_s3(df, timestamp): 36 | # compress and save to LFS 37 | df.to_csv(SAVE_FILENAME, index=False, compression="gzip") 38 | 39 | # upload compressed file to S3 40 | session = boto3.Session(profile_name=PROFILE_NAME) 41 | s3 = session.client('s3') 42 | s3_dir_name = '/'.join(timestamp.split()[0].split('-')) 43 | s3_obj_name = timestamp.split()[1] 44 | 45 | with open(SAVE_FILENAME, 'rb') as f: 46 | s3.upload_fileobj(f, BUCKET_NAME, f"rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz") 47 | 48 | 49 | def compare_nparray(previous_df, current_df, workload_cols, feature_cols): 50 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 51 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 52 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 53 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 54 | 55 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 56 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 57 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 58 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 59 | 60 | changed_indices = [] 61 | removed_indices = [] 62 | 63 | prev_idx = 0 64 | curr_idx = 0 65 | while True: 66 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 67 | break 68 | elif curr_idx == len(current_indices): 69 | prev_workload = previous_values[prev_idx][0] 70 | if prev_workload not in current_values[:,0]: 71 | removed_indices.append(previous_indices[prev_idx]) 72 | prev_idx += 1 73 | continue 74 | else: 75 | raise Exception('workload error') 76 | break 77 | elif prev_idx == len(previous_indices): 78 | curr_workload = current_values[curr_idx][0] 79 | curr_feature = current_values[curr_idx][1] 80 | if curr_workload not in previous_values[:,0]: 81 | changed_indices.append(current_indices[curr_idx]) 82 | curr_idx += 1 83 | continue 84 | else: 85 | raise Exception('workload error') 86 | break 87 | 88 | prev_workload = previous_values[prev_idx][0] 89 | prev_feature = previous_values[prev_idx][1] 90 | curr_workload = current_values[curr_idx][0] 91 | curr_feature = current_values[curr_idx][1] 92 | 93 | if prev_workload != curr_workload: 94 | if curr_workload not in previous_values[:,0]: 95 | changed_indices.append(current_indices[curr_idx]) 96 | curr_idx += 1 97 | elif prev_workload not in current_values[:,0]: 98 | removed_indices.append(previous_indices[prev_idx]) 99 | prev_idx += 1 100 | else: 101 | raise Exception('workload error') 102 | else: 103 | if prev_feature != curr_feature: 104 | changed_indices.append(current_indices[curr_idx]) 105 | curr_idx += 1 106 | prev_idx += 1 107 | 108 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 109 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 110 | for col in feature_cols: 111 | removed_df[col] = 0 112 | 113 | # removed_df have one more column, 'Ceased' 114 | removed_df['Ceased'] = True 115 | return changed_df, removed_df 116 | 117 | 118 | def 
date_range(start, end): 119 | delta = end - start 120 | days = [start + timedelta(days=i) for i in range(delta.days + 1)] 121 | return days 122 | 123 | 124 | def time_format(timestamp): 125 | return 'T'.join(str(timestamp).split()) 126 | 127 | 128 | perf_start_total = time.time() 129 | perf_start = time.time() 130 | 131 | day_df = tsquery.get_timestream(time_format(start_date), time_format(end_date)) 132 | frequency_map = {'<5%': 3.0, '5-10%': 2.5, '10-15%': 2.0, '15-20%': 1.5, '>20%': 1.0} 133 | day_df = day_df.replace({'IF': frequency_map}) 134 | day_df['SPS'] = day_df['SPS'].astype(int) 135 | day_df['SpotPrice'] = day_df['SpotPrice'].astype(float) 136 | day_df['SpotPrice'] = day_df['SpotPrice'].round(5) 137 | 138 | print(f"elapsed time - single day query: {time.time() - perf_start}") 139 | # day_df['OndemandPrice'] = (100 * day_df['SpotPrice']) / (100 - day_df['Savings']) 140 | 141 | day_timestamps = sorted(list(day_df['time'].unique())) 142 | for timestamp in day_timestamps: 143 | perf_start = time.time() 144 | current_df = day_df[day_df['time'] == timestamp].copy() 145 | print(f"elapsed time - select by time: {time.time() - perf_start}") 146 | if SAVE_FILENAME not in os.listdir('./'): 147 | save_gz_s3(current_df, timestamp) 148 | tsupload.upload_timestream(current_df) 149 | else: 150 | perf_start = time.time() 151 | previous_df = pd.read_csv(SAVE_FILENAME, compression='gzip', header=0, sep=',', quotechar='"') 152 | save_gz_s3(current_df, timestamp) 153 | print(f"elapsed time - read and save: {time.time() - perf_start}") 154 | perf_start = time.time() 155 | changed_df, removed_df = compare_nparray(previous_df, current_df, workload_cols, feature_cols) 156 | print(f"elapsed time - compare: {time.time() - perf_start}") 157 | perf_start = time.time() 158 | # changed_df and removed_df have different shape, because of 'Ceased' column 159 | tsupload.upload_timestream(changed_df) 160 | tsupload.upload_timestream(removed_df) 161 | print(f"elapsed time - upload: {time.time() - perf_start}") 162 | print(f"elapsed time - total single day: {time.time() - perf_start_total}") 163 | -------------------------------------------------------------------------------- /migration/aws/migration_loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import tsupload 4 | import pandas as pd 5 | 6 | import time 7 | import pytz 8 | from datetime import datetime, timedelta 9 | 10 | SAVE_FILENAME = 'latest.csv.gz' 11 | PROFILE_NAME = 'default' 12 | BUCKET_NAME = 'spotlake' 13 | REGION_NAME = "us-west-2" 14 | DATABASE_NAME = 'spotlake' 15 | TABLE_NAME = 'aws' 16 | 17 | start_date = datetime(2022, 4, 13, 0, 0, 0, 0, pytz.UTC) 18 | end_date = datetime(2022, 5, 1, 0, 0, 0, 0, pytz.UTC) 19 | 20 | workload_cols = ['InstanceType', 'Region', 'AZ'] 21 | feature_cols = ['SPS', 'IF', 'SpotPrice'] 22 | 23 | tsupload.PROFILE_NAME = PROFILE_NAME 24 | tsupload.REGION_NAME = REGION_NAME 25 | tsupload.DATABASE_NAME = DATABASE_NAME 26 | tsupload.TABLE_NAME = TABLE_NAME 27 | 28 | # compress data as gzip file, save to local file system, upload file to s3 29 | def save_gz_s3(df, timestamp): 30 | # compress and save to LFS 31 | df.to_csv(SAVE_FILENAME, index=False, compression="gzip") 32 | 33 | # upload compressed file to S3 34 | session = boto3.Session(profile_name=PROFILE_NAME) 35 | s3 = session.client('s3') 36 | s3_dir_name = '/'.join(timestamp.split()[0].split('-')) 37 | s3_obj_name = timestamp.split()[1] 38 | 39 | with open(SAVE_FILENAME, 'rb') as f: 40 | 
s3.upload_fileobj(f, BUCKET_NAME, f"rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz") 41 | 42 | def compare_nparray(previous_df, current_df, workload_cols, feature_cols): 43 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 44 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 45 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 46 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 47 | 48 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 49 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 50 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 51 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 52 | 53 | changed_indices = [] 54 | removed_indices = [] 55 | 56 | prev_idx = 0 57 | curr_idx = 0 58 | while True: 59 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 60 | break 61 | elif curr_idx == len(current_indices): 62 | prev_workload = previous_values[prev_idx][0] 63 | if prev_workload not in current_values[:,0]: 64 | removed_indices.append(previous_indices[prev_idx]) 65 | prev_idx += 1 66 | continue 67 | else: 68 | raise Exception('workload error') 69 | break 70 | elif prev_idx == len(previous_indices): 71 | curr_workload = current_values[curr_idx][0] 72 | curr_feature = current_values[curr_idx][1] 73 | if curr_workload not in previous_values[:,0]: 74 | changed_indices.append(current_indices[curr_idx]) 75 | curr_idx += 1 76 | continue 77 | else: 78 | raise Exception('workload error') 79 | break 80 | 81 | prev_workload = previous_values[prev_idx][0] 82 | prev_feature = previous_values[prev_idx][1] 83 | curr_workload = current_values[curr_idx][0] 84 | curr_feature = current_values[curr_idx][1] 85 | 86 | if prev_workload != curr_workload: 87 | if curr_workload not in previous_values[:,0]: 88 | changed_indices.append(current_indices[curr_idx]) 89 | curr_idx += 1 90 | elif prev_workload not in current_values[:,0]: 91 | removed_indices.append(previous_indices[prev_idx]) 92 | prev_idx += 1 93 | else: 94 | raise Exception('workload error') 95 | else: 96 | if prev_feature != curr_feature: 97 | changed_indices.append(current_indices[curr_idx]) 98 | curr_idx += 1 99 | prev_idx += 1 100 | 101 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 102 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 103 | for col in feature_cols: 104 | removed_df[col] = 0 105 | 106 | # removed_df have one more column, 'Ceased' 107 | removed_df['Ceased'] = True 108 | return changed_df, removed_df 109 | 110 | def date_range(start, end): 111 | delta = end - start 112 | days = [start + timedelta(days=i) for i in range(delta.days + 1)] 113 | return days 114 | 115 | def time_format(timestamp): 116 | return 'T'.join(str(timestamp).split()) 117 | 118 | days = date_range(start_date, end_date) 119 | all_df = pd.read_pickle('./df_0413_0501.pkl') 120 | perf_start_total = time.time() 121 | for day in days: 122 | perf_start = time.time() 123 | day_cond = (str(day) <= all_df['time'] ) & (all_df['time'] < str(day + timedelta(days=1))) 124 | day_df = all_df[day_cond].copy() 125 | frequency_map = {'<5%': 3.0, '5-10%': 
2.5, '10-15%': 2.0, '15-20%': 1.5, '>20%': 1.0} 126 | day_df = day_df.replace({'IF': frequency_map}) 127 | day_df['SPS'] = day_df['SPS'].astype(int) 128 | day_df['SpotPrice'] = day_df['SpotPrice'].astype(float) 129 | day_df['SpotPrice'] = day_df['SpotPrice'].round(5) 130 | 131 | print(f"elapsed time - single day query: {time.time() - perf_start}") 132 | # day_df['OndemandPrice'] = (100 * day_df['SpotPrice']) / (100 - day_df['Savings']) 133 | 134 | day_timestamps = sorted(list(day_df['time'].unique())) 135 | for timestamp in day_timestamps: 136 | perf_start = time.time() 137 | current_df = day_df[day_df['time'] == timestamp].copy() 138 | print(f"elapsed time - select by time: {time.time() - perf_start}") 139 | if SAVE_FILENAME not in os.listdir('./'): 140 | save_gz_s3(current_df, timestamp) 141 | tsupload.upload_timestream(current_df) 142 | else: 143 | perf_start = time.time() 144 | previous_df = pd.read_csv(SAVE_FILENAME, compression='gzip', header=0, sep=',', quotechar='"') 145 | save_gz_s3(current_df, timestamp) 146 | print(f"elapsed time - read and save: {time.time() - perf_start}") 147 | perf_start = time.time() 148 | changed_df, removed_df = compare_nparray(previous_df, current_df, workload_cols, feature_cols) 149 | print(f"elapsed time - compare: {time.time() - perf_start}") 150 | perf_start = time.time() 151 | # changed_df and removed_df have different shape, because of 'Ceased' column 152 | tsupload.upload_timestream(changed_df) 153 | tsupload.upload_timestream(removed_df) 154 | print(f"elapsed time - upload: {time.time() - perf_start}") 155 | print(f"elapsed time - total single day: {time.time() - perf_start_total}") 156 | -------------------------------------------------------------------------------- /migration/aws/migration_kmubigdata_to_spotrank.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import tsquery 4 | import tsupload 5 | import pandas as pd 6 | from multiprocessing import Pool 7 | 8 | import time 9 | import pytz 10 | from datetime import datetime, timedelta 11 | 12 | 13 | SAVE_FILENAME = 'latest.csv.gz' 14 | QUERY_PROFILE_NAME = 'kmubigdata' 15 | UPLOAD_PROFILE_NAME = 'spotrank' 16 | BUCKET_NAME = 'spotlake-test' 17 | REGION_NAME = "us-west-2" 18 | QUERY_DATABASE_NAME = "spotrank-timestream" 19 | QUERY_TABLE_NAME = "spot-table" 20 | UPLOAD_DATABASE_NAME = 'spotlake' 21 | UPLOAD_TABLE_NAME = 'aws' 22 | NUM_CPUS = 8 23 | if 24 % NUM_CPUS != 0: 24 | raise Exception('use only 1, 2, 3, 4, 6, 8, 12, 24') 25 | CHUNK_HOUR = 24 / NUM_CPUS 26 | 27 | start_date = datetime(2022, 9, 28, 16, 20, 0, 0, pytz.UTC) 28 | end_date = datetime(2022, 9, 29, 3, 10, 0, 0, pytz.UTC) 29 | 30 | tsquery.PROFILE_NAME = QUERY_PROFILE_NAME # tsquery.PROFILE_NAME must be credential of source database 31 | tsquery.REGION_NAME = REGION_NAME 32 | tsquery.DATABASE_NAME = QUERY_DATABASE_NAME 33 | tsquery.TABLE_NAME = QUERY_TABLE_NAME 34 | tsupload.PROFILE_NAME = UPLOAD_PROFILE_NAME 35 | tsupload.REGION_NAME = REGION_NAME 36 | tsupload.DATABASE_NAME = UPLOAD_DATABASE_NAME 37 | tsupload.TABLE_NAME = UPLOAD_TABLE_NAME 38 | 39 | workload_cols = ['InstanceType', 'Region', 'AZ'] 40 | feature_cols = ['SPS', 'IF', 'SpotPrice'] 41 | 42 | 43 | # compress data as gzip file, save to local file system, upload file to s3 44 | def save_gz_s3(df, timestamp): 45 | # compress and save to LFS 46 | df.to_csv(SAVE_FILENAME, index=False, compression="gzip") 47 | 48 | # upload compressed file to S3 49 | session = 
boto3.Session(profile_name=UPLOAD_PROFILE_NAME) 50 | s3 = session.client('s3') 51 | s3_dir_name = '/'.join(timestamp.split()[0].split('-')) 52 | s3_obj_name = timestamp.split()[1] 53 | 54 | with open(SAVE_FILENAME, 'rb') as f: 55 | s3.upload_fileobj(f, BUCKET_NAME, f"rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz") 56 | 57 | 58 | def compare_nparray(previous_df, current_df, workload_cols, feature_cols): 59 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 60 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 61 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 62 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 63 | 64 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 65 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 66 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 67 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 68 | 69 | changed_indices = [] 70 | removed_indices = [] 71 | 72 | prev_idx = 0 73 | curr_idx = 0 74 | while True: 75 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 76 | break 77 | elif curr_idx == len(current_indices): 78 | prev_workload = previous_values[prev_idx][0] 79 | if prev_workload not in current_values[:,0]: 80 | removed_indices.append(previous_indices[prev_idx]) 81 | prev_idx += 1 82 | continue 83 | else: 84 | raise Exception('workload error') 85 | break 86 | elif prev_idx == len(previous_indices): 87 | curr_workload = current_values[curr_idx][0] 88 | curr_feature = current_values[curr_idx][1] 89 | if curr_workload not in previous_values[:,0]: 90 | changed_indices.append(current_indices[curr_idx]) 91 | curr_idx += 1 92 | continue 93 | else: 94 | raise Exception('workload error') 95 | break 96 | 97 | prev_workload = previous_values[prev_idx][0] 98 | prev_feature = previous_values[prev_idx][1] 99 | curr_workload = current_values[curr_idx][0] 100 | curr_feature = current_values[curr_idx][1] 101 | 102 | if prev_workload != curr_workload: 103 | if curr_workload not in previous_values[:,0]: 104 | changed_indices.append(current_indices[curr_idx]) 105 | curr_idx += 1 106 | elif prev_workload not in current_values[:,0]: 107 | removed_indices.append(previous_indices[prev_idx]) 108 | prev_idx += 1 109 | else: 110 | raise Exception('workload error') 111 | else: 112 | if prev_feature != curr_feature: 113 | changed_indices.append(current_indices[curr_idx]) 114 | curr_idx += 1 115 | prev_idx += 1 116 | 117 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 118 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 119 | for col in feature_cols: 120 | removed_df[col] = 0 121 | 122 | # removed_df have one more column, 'Ceased' 123 | removed_df['Ceased'] = True 124 | return changed_df 125 | 126 | 127 | def date_range(start, end): 128 | delta = end - start 129 | days = [start + timedelta(minutes=i*10) for i in range(delta.seconds//60//10 + 1)] 130 | return days 131 | 132 | 133 | def time_format(timestamp): 134 | return 'T'.join(str(timestamp).split()) 135 | 136 | 137 | days = date_range(start_date, end_date) 138 | 139 | perf_start_total = time.time() 140 | 
perf_start = time.time() 141 | 142 | start_end_time_process_list = [] 143 | 144 | day_df = tsquery.get_timestream(start_date, end_date) 145 | 146 | frequency_map = {'<5%': 3.0, '5-10%': 2.5, '10-15%': 2.0, '15-20%': 1.5, '>20%': 1.0} 147 | day_df = day_df.replace({'IF': frequency_map}) 148 | day_df['SPS'] = day_df['SPS'].astype(int) 149 | day_df['SpotPrice'] = day_df['SpotPrice'].astype(float) 150 | day_df['SpotPrice'] = day_df['SpotPrice'].round(5) 151 | 152 | print(f"elapsed time - single day query: {time.time() - perf_start}") 153 | day_df['OndemandPrice'] = (100 * day_df['SpotPrice']) / (100 - day_df['Savings']) 154 | day_df['OndemandPrice'] = day_df['OndemandPrice'].astype(float) 155 | day_df['OndemandPrice'] = day_df['OndemandPrice'].round(5) 156 | 157 | day_timestamps = sorted(list(day_df['time'].unique())) 158 | for timestamp in day_timestamps: 159 | perf_start = time.time() 160 | current_df = day_df[day_df['time'] == timestamp].copy() 161 | print(f"elapsed time - select by time: {time.time() - perf_start}") 162 | if SAVE_FILENAME not in os.listdir('./'): 163 | save_gz_s3(current_df, timestamp) 164 | tsupload.upload_timestream(current_df) 165 | else: 166 | perf_start = time.time() 167 | previous_df = pd.read_csv(SAVE_FILENAME, compression='gzip', header=0, sep=',', quotechar='"') 168 | save_gz_s3(current_df, timestamp) 169 | print(f"elapsed time - read and save: {time.time() - perf_start}") 170 | perf_start = time.time() 171 | changed_df = compare_nparray(previous_df, current_df, workload_cols, feature_cols) 172 | print(f"elapsed time - compare: {time.time() - perf_start}") 173 | perf_start = time.time() 174 | # changed_df and removed_df have different shape, because of 'Ceased' column 175 | tsupload.upload_timestream(changed_df) 176 | print(f"elapsed time - upload: {time.time() - perf_start}") 177 | print(f"elapsed time - total: {time.time() - perf_start_total}") 178 | -------------------------------------------------------------------------------- /migration/gcp/gcp_write_timestream.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore.config import Config 3 | import time 4 | import os 5 | import pandas as pd 6 | import gzip 7 | from datetime import datetime 8 | 9 | ### write stored rawdata on tsdb 10 | ### before this, have to aws sync spotlake bucket 11 | ### plz check file paths, DATABASE_NAME and TABLE_NAME before running this 12 | 13 | session = boto3.session.Session(region_name='us-west-2') 14 | write_client = session.client('timestream-write', config=Config(read_timeout=20, max_pool_connections=5000, retries={'max_attempts':10})) 15 | client = session.client('timestream-query') 16 | 17 | DATABASE_NAME = 'spotlake' 18 | TABLE_NAME = 'gcp' 19 | 20 | FILE_PATH = '/home/ubuntu/gcp_rawdata' # 2022/MM/dd 21 | TEMP_FILE_PATH = './' 22 | NEW_FILE_PATH = '/home/ubuntu/gcp_newrawdata' 23 | workload_cols = ['InstanceType', 'Region'] 24 | feature_cols = ['Calculator OnDemand Price', 'Calculator Preemptible Price', 'VM Instance OnDemand Price', 'VM Instance Preemptible Price'] 25 | 26 | 27 | # Submit Batch To Timestream 28 | def submit_batch(records, counter, recursive): 29 | if recursive == 10: 30 | return 31 | try: 32 | result = write_client.write_records(DatabaseName=DATABASE_NAME, TableName = TABLE_NAME, Records=records, CommonAttributes={}) 33 | except write_client.exceptions.RejectedRecordsException as err: 34 | re_records = [] 35 | for rr in err.response["RejectedRecords"]: 36 | print(rr['Reason']) 37 | 
re_records.append(records[rr["RecordIndex"]]) 38 | submit_batch(re_records, counter, recursive + 1) 39 | except Exception as err: 40 | print(err) 41 | exit() 42 | 43 | 44 | # Check Database And Table Are Exist and Upload Data to Timestream 45 | def upload_timestream(data, timestamp): 46 | print(len(data)) 47 | 48 | time_value = time.strptime(timestamp.strftime("%Y-%m-%d %H:%M"), '%Y-%m-%d %H:%M') 49 | time_value = time.mktime(time_value) 50 | time_value = str(int(round(time_value * 1000))) 51 | 52 | 53 | records = [] 54 | counter = 0 55 | for idx, row in data.iterrows(): 56 | 57 | dimensions = [] 58 | for column in data.columns: 59 | if column in ['InstanceType', 'Region', 'Ceased']: 60 | dimensions.append({'Name':column, 'Value': str(row[column])}) 61 | 62 | submit_data = { 63 | 'Dimensions': dimensions, 64 | 'MeasureName': 'gcp_values', 65 | 'MeasureValues': [], 66 | 'MeasureValueType': 'MULTI', 67 | 'Time': time_value 68 | } 69 | for column, types in [('Calculator OnDemand Price', 'DOUBLE'), ('Calculator Preemptible Price', 'DOUBLE'), ('VM Instance OnDemand Price', 'DOUBLE'), ('VM Instance Preemptible Price', 'DOUBLE')]: 70 | submit_data['MeasureValues'].append({'Name': column, 'Value': str(row[column]), 'Type' : types}) 71 | records.append(submit_data) 72 | counter += 1 73 | if len(records) == 100: 74 | submit_batch(records, counter, 0) 75 | records = [] 76 | 77 | if len(records) != 0: 78 | submit_batch(records, counter, 0) 79 | 80 | print(f"end : {counter}") 81 | 82 | 83 | def compare(previous_df, current_df, workload_cols, feature_cols): 84 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 85 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 86 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 87 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 88 | 89 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 90 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 91 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 92 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 93 | 94 | changed_indices = [] 95 | removed_indices = [] 96 | 97 | prev_idx = 0 98 | curr_idx = 0 99 | while True: 100 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 101 | break 102 | elif curr_idx == len(current_indices): 103 | prev_workload = previous_values[prev_idx][0] 104 | if prev_workload not in current_values[:,0]: 105 | removed_indices.append(previous_indices[prev_idx]) 106 | prev_idx += 1 107 | continue 108 | else: 109 | raise Exception('workload error') 110 | break 111 | elif prev_idx == len(previous_indices): 112 | curr_workload = current_values[curr_idx][0] 113 | curr_feature = current_values[curr_idx][1] 114 | if curr_workload not in previous_values[:,0]: 115 | changed_indices.append(current_indices[curr_idx]) 116 | curr_idx += 1 117 | continue 118 | else: 119 | raise Exception('workload error') 120 | break 121 | 122 | prev_workload = previous_values[prev_idx][0] 123 | prev_feature = previous_values[prev_idx][1] 124 | curr_workload = current_values[curr_idx][0] 125 | curr_feature = current_values[curr_idx][1] 126 | 127 | if prev_workload != curr_workload: 
128 | if curr_workload not in previous_values[:,0]: 129 | changed_indices.append(current_indices[curr_idx]) 130 | curr_idx += 1 131 | elif prev_workload not in current_values[:,0]: 132 | removed_indices.append(previous_indices[prev_idx]) 133 | prev_idx += 1 134 | continue 135 | else: 136 | raise Exception('workload error') 137 | else: 138 | if prev_feature != curr_feature: 139 | changed_indices.append(current_indices[curr_idx]) 140 | curr_idx += 1 141 | prev_idx += 1 142 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 143 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 144 | 145 | for col in feature_cols: 146 | removed_df[col] = 0 147 | 148 | # removed_df have one more column, 'Ceased' 149 | removed_df['Ceased'] = True 150 | 151 | return changed_df, removed_df 152 | 153 | 154 | # write first rawdata to timestream 155 | 156 | paths = [] 157 | for (path, dir, files) in os.walk(NEW_FILE_PATH): 158 | for filename in files: 159 | final_path = path + '/' + filename 160 | paths.append(final_path) 161 | paths.sort() 162 | 163 | 164 | path = paths[0] 165 | 166 | changed_time = path.split('gcp_newrawdata/')[1].split('.csv.gz')[0] 167 | timestamp = datetime.strptime(changed_time, '%Y/%m/%d/%H:%M:%S') 168 | print(timestamp) 169 | df = pd.DataFrame() 170 | with gzip.open(path, 'rb') as f: 171 | df = pd.read_csv(f) 172 | 173 | upload_timestream(df, timestamp) 174 | 175 | for i in range (1, len(paths)): 176 | prev_path = paths[i-1] 177 | curr_path = paths[i] 178 | changed_time = curr_path.split('gcp_newrawdata/')[1].split('.csv.gz')[0] 179 | timestamp = datetime.strptime(changed_time, '%Y/%m/%d/%H:%M:%S') 180 | 181 | df_prev = pd.DataFrame() 182 | df_curr = pd.DataFrame() 183 | 184 | with gzip.open(prev_path, 'rb') as f: 185 | df_prev = pd.read_csv(f) 186 | with gzip.open(curr_path, 'rb') as f: 187 | df_curr = pd.read_csv(f) 188 | 189 | changed_df, removed_df = compare(df_prev, df_curr, workload_cols, feature_cols) 190 | upload_timestream(changed_df, timestamp) 191 | upload_timestream(removed_df, timestamp) -------------------------------------------------------------------------------- /migration/aws/migration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import tsquery 4 | import tsupload 5 | import pandas as pd 6 | from multiprocessing import Pool 7 | 8 | import time 9 | import pytz 10 | from datetime import datetime, timedelta 11 | 12 | 13 | SAVE_FILENAME = 'latest.csv.gz' 14 | PROFILE_NAME = 'default' 15 | BUCKET_NAME = 'spotlake' 16 | REGION_NAME = "us-west-2" 17 | DATABASE_NAME = 'spotlake' 18 | TABLE_NAME = 'aws' 19 | NUM_CPUS = 8 20 | if 24 % NUM_CPUS != 0: 21 | raise Exception('use only 1, 2, 3, 4, 6, 8, 12, 24') 22 | CHUNK_HOUR = 24 / NUM_CPUS 23 | 24 | start_date = datetime(2022, 1, 1, 0, 0, 0, 0, pytz.UTC) 25 | end_date = datetime(2022, 4, 13, 0, 0, 0, 0, pytz.UTC) 26 | 27 | workload_cols = ['InstanceType', 'Region', 'AZ'] 28 | feature_cols = ['SPS', 'IF', 'SpotPrice'] 29 | 30 | # tsquery.PROFILE_NAME = PROFILE_NAME # tsquery.PROFILE_NAME must be credential of source database 31 | tsquery.REGION_NAME = REGION_NAME 32 | tsupload.PROFILE_NAME = PROFILE_NAME 33 | tsupload.REGION_NAME = REGION_NAME 34 | tsupload.DATABASE_NAME = DATABASE_NAME 35 | tsupload.TABLE_NAME = TABLE_NAME 36 | 37 | 38 | # compress data as gzip file, save to local file system, upload file to s3 39 | def save_gz_s3(df, timestamp): 40 | # compress and save to LFS 41 | 
df.to_csv(SAVE_FILENAME, index=False, compression="gzip") 42 | 43 | # upload compressed file to S3 44 | session = boto3.Session(profile_name=PROFILE_NAME) 45 | s3 = session.client('s3') 46 | s3_dir_name = '/'.join(timestamp.split()[0].split('-')) 47 | s3_obj_name = timestamp.split()[1] 48 | 49 | with open(SAVE_FILENAME, 'rb') as f: 50 | s3.upload_fileobj(f, BUCKET_NAME, f"rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz") 51 | 52 | 53 | def compare_nparray(previous_df, current_df, workload_cols, feature_cols): 54 | previous_df.loc[:,'Workload'] = previous_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 55 | previous_df.loc[:,'Feature'] = previous_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 56 | current_df.loc[:,'Workload'] = current_df[workload_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 57 | current_df.loc[:,'Feature'] = current_df[feature_cols].apply(lambda row: ':'.join(row.values.astype(str)), axis=1) 58 | 59 | current_indices = current_df[['Workload', 'Feature']].sort_values(by='Workload').index 60 | current_values = current_df[['Workload', 'Feature']].sort_values(by='Workload').values 61 | previous_indices = previous_df[['Workload', 'Feature']].sort_values(by='Workload').index 62 | previous_values = previous_df[['Workload', 'Feature']].sort_values(by='Workload').values 63 | 64 | changed_indices = [] 65 | removed_indices = [] 66 | 67 | prev_idx = 0 68 | curr_idx = 0 69 | while True: 70 | if (curr_idx == len(current_indices)) and (prev_idx == len(previous_indices)): 71 | break 72 | elif curr_idx == len(current_indices): 73 | prev_workload = previous_values[prev_idx][0] 74 | if prev_workload not in current_values[:,0]: 75 | removed_indices.append(previous_indices[prev_idx]) 76 | prev_idx += 1 77 | continue 78 | else: 79 | raise Exception('workload error') 80 | break 81 | elif prev_idx == len(previous_indices): 82 | curr_workload = current_values[curr_idx][0] 83 | curr_feature = current_values[curr_idx][1] 84 | if curr_workload not in previous_values[:,0]: 85 | changed_indices.append(current_indices[curr_idx]) 86 | curr_idx += 1 87 | continue 88 | else: 89 | raise Exception('workload error') 90 | break 91 | 92 | prev_workload = previous_values[prev_idx][0] 93 | prev_feature = previous_values[prev_idx][1] 94 | curr_workload = current_values[curr_idx][0] 95 | curr_feature = current_values[curr_idx][1] 96 | 97 | if prev_workload != curr_workload: 98 | if curr_workload not in previous_values[:,0]: 99 | changed_indices.append(current_indices[curr_idx]) 100 | curr_idx += 1 101 | elif prev_workload not in current_values[:,0]: 102 | removed_indices.append(previous_indices[prev_idx]) 103 | prev_idx += 1 104 | else: 105 | raise Exception('workload error') 106 | else: 107 | if prev_feature != curr_feature: 108 | changed_indices.append(current_indices[curr_idx]) 109 | curr_idx += 1 110 | prev_idx += 1 111 | 112 | changed_df = current_df.loc[changed_indices].drop(['Workload', 'Feature'], axis=1) 113 | removed_df = previous_df.loc[removed_indices].drop(['Workload', 'Feature'], axis=1) 114 | for col in feature_cols: 115 | removed_df[col] = 0 116 | 117 | # removed_df have one more column, 'Ceased' 118 | removed_df['Ceased'] = True 119 | return changed_df, removed_df 120 | 121 | 122 | def date_range(start, end): 123 | delta = end - start 124 | days = [start + timedelta(days=i) for i in range(delta.days + 1)] 125 | return days 126 | 127 | 128 | def time_format(timestamp): 129 | return 'T'.join(str(timestamp).split()) 130 | 131 
| 132 | days = date_range(start_date, end_date) 133 | 134 | perf_start_total = time.time() 135 | for idx in range(len(days)-1): 136 | perf_start = time.time() 137 | start_timestamp = days[idx] 138 | end_timestamp = days[idx+1] 139 | 140 | start_end_time_process_list = [] 141 | for i in range(NUM_CPUS): 142 | start_time_process = start_timestamp + timedelta(hours = CHUNK_HOUR*i) 143 | end_time_process = start_timestamp + timedelta(hours = CHUNK_HOUR*(i+1)) 144 | start_end_time_process_list.append((time_format(start_time_process), time_format(end_time_process))) 145 | 146 | with Pool(NUM_CPUS) as p: 147 | process_df_list = p.starmap(tsquery.get_timestream, start_end_time_process_list) 148 | 149 | day_df = pd.concat(process_df_list, axis=0, ignore_index=True) 150 | frequency_map = {'<5%': 3.0, '5-10%': 2.5, '10-15%': 2.0, '15-20%': 1.5, '>20%': 1.0} 151 | day_df = day_df.replace({'IF': frequency_map}) 152 | day_df['SPS'] = day_df['SPS'].astype(int) 153 | day_df['SpotPrice'] = day_df['SpotPrice'].astype(float) 154 | day_df['SpotPrice'] = day_df['SpotPrice'].round(5) 155 | 156 | print(f"elapsed time - single day query: {time.time() - perf_start}") 157 | # day_df['OndemandPrice'] = (100 * day_df['SpotPrice']) / (100 - day_df['Savings']) 158 | 159 | day_timestamps = sorted(list(day_df['time'].unique())) 160 | for timestamp in day_timestamps: 161 | perf_start = time.time() 162 | current_df = day_df[day_df['time'] == timestamp].copy() 163 | print(f"elapsed time - select by time: {time.time() - perf_start}") 164 | if SAVE_FILENAME not in os.listdir('./'): 165 | save_gz_s3(current_df, timestamp) 166 | tsupload.upload_timestream(current_df) 167 | else: 168 | perf_start = time.time() 169 | previous_df = pd.read_csv(SAVE_FILENAME, compression='gzip', header=0, sep=',', quotechar='"') 170 | save_gz_s3(current_df, timestamp) 171 | print(f"elapsed time - read and save: {time.time() - perf_start}") 172 | perf_start = time.time() 173 | changed_df, removed_df = compare_nparray(previous_df, current_df, workload_cols, feature_cols) 174 | print(f"elapsed time - compare: {time.time() - perf_start}") 175 | perf_start = time.time() 176 | # changed_df and removed_df have different shape, because of 'Ceased' column 177 | tsupload.upload_timestream(changed_df) 178 | tsupload.upload_timestream(removed_df) 179 | print(f"elapsed time - upload: {time.time() - perf_start}") 180 | print(f"elapsed time - total single day: {time.time() - perf_start_total}") 181 | --------------------------------------------------------------------------------
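A note on the change-detection logic that recurs above (compare_nparray in the AWS scripts and compare in gcp_write_timestream.py): a row counts as changed when its workload key is new or its feature values differ from the previous snapshot, and as removed when its workload disappears, in which case the features are zeroed and a Ceased flag is added. The snippet below is a simplified, self-contained pandas-merge sketch of those semantics with made-up sample rows; it illustrates the idea and is not the implementation used by the scripts.

```python
# Simplified sketch of the changed/removed semantics used by the migration scripts.
import pandas as pd

WORKLOAD_COLS = ['InstanceType', 'Region', 'AZ']
FEATURE_COLS = ['SPS', 'IF', 'SpotPrice']

previous_df = pd.DataFrame({
    'InstanceType': ['t3.micro', 'm5.large'],
    'Region': ['us-west-2', 'us-west-2'],
    'AZ': ['usw2-az1', 'usw2-az2'],
    'SPS': [3, 2], 'IF': [3.0, 2.5], 'SpotPrice': [0.0031, 0.0350],
})
current_df = pd.DataFrame({
    'InstanceType': ['t3.micro', 'c5.xlarge'],   # m5.large vanished, c5.xlarge appeared
    'Region': ['us-west-2', 'us-west-2'],
    'AZ': ['usw2-az1', 'usw2-az1'],
    'SPS': [3, 1], 'IF': [3.0, 1.0], 'SpotPrice': [0.0035, 0.0680],  # t3.micro price changed
})

merged = current_df.merge(previous_df, on=WORKLOAD_COLS, how='outer',
                          suffixes=('', '_prev'), indicator=True)

# changed: workloads that are new, or whose feature tuple differs from the previous snapshot
feature_changed = (merged['_merge'] == 'both') & ~(
    (merged['SPS'] == merged['SPS_prev']) &
    (merged['IF'] == merged['IF_prev']) &
    (merged['SpotPrice'] == merged['SpotPrice_prev'])
)
changed_df = merged[(merged['_merge'] == 'left_only') | feature_changed][WORKLOAD_COLS + FEATURE_COLS]

# removed: workloads present before but missing now; zero the features and flag them as Ceased,
# mirroring what the scripts upload for vanished workloads
removed_df = merged[merged['_merge'] == 'right_only'][WORKLOAD_COLS].copy()
for col in FEATURE_COLS:
    removed_df[col] = 0
removed_df['Ceased'] = True

print(changed_df)   # expected: the changed t3.micro row and the new c5.xlarge row
print(removed_df)   # expected: the vanished m5.large workload, zeroed and marked Ceased
```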