├── .gitignore ├── CODE_OF_CONDUCT.md ├── Code ├── Plotting │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── plots.cpython-37.pyc │ └── plots.py ├── Profiling │ ├── Intermittent │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── intermittent.cpython-37.pyc │ │ └── intermittent.py │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── Regressors │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── regressors.cpython-37.pyc │ ├── regressors.py │ ├── similar_day.py │ └── temperatures.py ├── Scoring │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── forecast.cpython-37.pyc │ │ ├── kpi.cpython-37.pyc │ │ ├── scoring.cpython-37.pyc │ │ ├── train.cpython-37.pyc │ │ └── train_test.cpython-37.pyc │ ├── forecast.py │ ├── kpi.py │ ├── scoring.py │ ├── train.py │ └── train_test.py ├── Utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── utils.cpython-37.pyc │ └── utils.py ├── __init__.py └── __pycache__ │ └── __init__.cpython-37.pyc ├── Configuration ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── config.cpython-37.pyc ├── config.py └── config.yaml ├── Dashboards └── EnergyDashboard.pbix ├── Docs ├── Images │ ├── banner.jpg │ ├── calendar.png │ ├── elbow.png │ ├── intermittent_TS.png │ ├── panel_data.png │ ├── sliding_plot.png │ └── thermal.png └── Slides │ └── ds_toolkit_forecasting_2.0_memo.pdf ├── Environment └── forecasting_energy.yml ├── LICENSE ├── Notebooks ├── EnergyClusteringRegular.ipynb ├── EnergyDataExploration.ipynb ├── EnergyPredictionDataPreparation.ipynb ├── EnergyPredictionScoring.ipynb └── EnergyProfilingIntermittent.ipynb ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── Tests ├── InsuranceClaimsDataPreparation.ipynb └── InsuranceClaimsProfilingIntermittent.ipynb └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | Data/ 2 | __pycache__ 3 | **/__pycache__ 4 | *.pyc 5 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /Code/Plotting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Plotting/__init__.py -------------------------------------------------------------------------------- /Code/Plotting/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Plotting/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Plotting/__pycache__/plots.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Plotting/__pycache__/plots.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Plotting/plots.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import pandas as pd 3 | import numpy as np 4 | import re 5 | 6 | # file management functions 7 | import os 8 | import glob 9 | 10 | # time management functions 11 | import datetime as dt 12 | 13 | # plot functions 14 | from matplotlib import pyplot as plt 15 | import matplotlib.dates as mdates 16 | import plotly.graph_objects as go 17 | 18 | # custom functions 19 | from Configuration.config import cfg_path 20 | from Code.Utils.utils import Utils 21 | from Code.Scoring.kpi import Kpi 22 | 23 | class Plots: 24 | 25 | def sliding_line_plot(df, serie_to_plot, id, i, chart_title=""): 26 | """ 27 | Creates a time series plot with sliding dates 28 | :params: df as pandas dataframe 29 | :return: html file with plot 30 | """ 31 | 32 | ### Setup 33 | date = Utils.find_date(df) 34 | 35 | ## Sort 36 | df.sort_values(date, inplace=True) 37 | 38 | ## Create figure 39 | fig = go.Figure() 40 | fig.add_trace(go.Scatter(x=list(df.loc[df[id] == i, date]), y=list(df.loc[df[id] == i, serie_to_plot]), name=str(i))) 41 | 42 | # Set title 43 | if chart_title!="": 44 | fig.update_layout( 45 | title_text=chart_title 46 | ) 47 | 48 | else: 49 | chart_title = serie_to_plot.capitalize() + ' ' + str(id) + ' ' + str(i) 50 | fig.update_layout( 51 | title_text=chart_title 52 | ) 53 | 54 | print('sliding_line_plot: plotting', chart_title) 55 | 56 | # Add range slider 57 | fig.update_layout( 58 | xaxis=dict( 59 | rangeselector=dict( 60 | buttons=list([ 61 | dict(count=1, 62 | label="1m", 63 | step="month", 64 | stepmode="backward"), 65 | dict(count=3, 66 | label="3m", 67 | step="month", 68 | stepmode="backward"), 69 | dict(count=6, 70 | label="6m", 71 | step="month", 72 | stepmode="backward"), 73 | dict(count=1, 74 | label="YTD", 75 | step="year", 76 | stepmode="todate"), 77 | dict(count=1, 78 | label="1y", 79 | step="year", 80 | stepmode="backward"), 81 | dict(step="all") 82 | ]) 83 | ), 84 | rangeslider=dict( 85 | visible=True 86 | ), 87 | type="date" 88 | ) 
89 | ) 90 | return fig 91 | 92 | 93 | def sliding_fcst_plot(df, predict_col, expected_values, chart_title="", kpi=True): 94 | """ 95 | Creates a time series plot with sliding dates 96 | :params: df as pandas dataframe, chart_title as string, kpi as boolean 97 | :return: html file with plot 98 | """ 99 | 100 | ### Setup 101 | date = Utils.find_date(df) 102 | 103 | if isinstance(date, list): 104 | date = list(set(Utils.find_date(df)) - set(['train_start_date', 'train_end_date', 'test_start_date', 'test_end_date']))[0] 105 | 106 | y = predict_col.copy() 107 | fcst = expected_values.copy() 108 | 109 | ## Sort 110 | df = df.sort_values(date).copy() 111 | 112 | ## Adding model info to chart title 113 | if 'best_model' in list(df.columns): 114 | model = df['best_model'].unique()[0] 115 | chart_title = str(chart_title) + ' - ' + model 116 | else: 117 | chart_title = str(chart_title) 118 | 119 | ## Checking KPI 120 | if kpi == True: 121 | try: 122 | mape = str(round(Kpi.compute_mape(df, 'fcst', y), 2)*100) 123 | min_mape_date = min(df.loc[~df.absolute_percentage_error.isnull(), date]).strftime("%d-%m-%Y") 124 | max_mape_date = max(df.loc[~df.absolute_percentage_error.isnull(), date]).strftime("%d-%m-%Y") 125 | chart_title = chart_title + ' - MAPE: ' + mape + "% from " + min_mape_date + ' to ' + max_mape_date 126 | except: 127 | chart_title = str(chart_title) 128 | else: 129 | chart_title = str(chart_title) 130 | 131 | ## Create figure 132 | fig = go.Figure() 133 | 134 | fig.add_trace(go.Scatter(x=list(df[date]), y=list(df[y]), name=y)) 135 | fig.add_trace(go.Scatter(x=list(df[date]), y=list(df[fcst]), name=fcst)) 136 | 137 | # Set title 138 | if chart_title!="": 139 | fig.update_layout( 140 | title_text=chart_title 141 | ) 142 | else: 143 | fig.update_layout( 144 | title_text="Forecasting " + y.capitalize() 145 | ) 146 | 147 | # Add annotations 148 | for col in ['train_start_date', 'train_end_date', 'test_start_date', 'test_end_date']: 149 | if col in list(df.columns) and col in ['train_end_date', 'test_end_date']: 150 | col_date = pd.to_datetime(str(df[col].unique()[0])).strftime('%Y-%m-%d') 151 | closest_date = df[col].unique()[0] 152 | x_value = pd.to_datetime(df.loc[df[date]==closest_date, date].reset_index(drop=True)[0], format='%Y-%m-%d') 153 | y_value = pd.to_numeric(df.loc[df[date]==closest_date, y].reset_index(drop=True)[0]) 154 | fig.add_annotation( 155 | x=x_value, 156 | y=y_value, 157 | text= col + ': ' + str(col_date), 158 | showarrow=True, 159 | arrowhead=1, 160 | arrowsize=1, 161 | arrowwidth=2, 162 | font = dict( 163 | color="black", 164 | size=16 165 | )) 166 | elif col in list(df.columns) and col in ['train_start_date']: 167 | col_date = pd.to_datetime(str(df[col].unique()[0])).strftime('%Y-%m-%d') 168 | closest_date = df[col].unique()[0] 169 | x_value = pd.to_datetime(df.loc[df[date]==closest_date, date].reset_index(drop=True)[0], format='%Y-%m-%d') 170 | y_value = pd.to_numeric(df.loc[df[date]==closest_date, y].reset_index(drop=True)[0]) 171 | fig.add_annotation( 172 | x=x_value, 173 | y=y_value*2, 174 | text= col + ': ' + str(col_date), 175 | showarrow=True, 176 | arrowhead=1, 177 | arrowsize=1, 178 | arrowwidth=2, 179 | font = dict( 180 | color="black", 181 | size=16 182 | )) 183 | elif col in list(df.columns) and col in ['test_start_date']: 184 | col_date = pd.to_datetime(str(df[col].unique()[0])).strftime('%Y-%m-%d') 185 | closest_date = df[col].unique()[0] 186 | x_value = pd.to_datetime(df.loc[df[date]==closest_date, date].reset_index(drop=True)[0], format='%Y-%m-%d') 187 | 
y_value = pd.to_numeric(df.loc[df[date]==closest_date, y].reset_index(drop=True)[0]) 188 | fig.add_annotation( 189 | x=x_value, 190 | y=y_value*1.5, 191 | text= col + ': ' + str(col_date), 192 | showarrow=True, 193 | arrowhead=1, 194 | arrowsize=1, 195 | arrowwidth=2, 196 | font = dict( 197 | color="black", 198 | size=16 199 | )) 200 | else: 201 | print('No annotation available for', col) 202 | 203 | # Add range slider 204 | fig.update_layout( 205 | xaxis=dict( 206 | rangeselector=dict( 207 | buttons=list([ 208 | dict(count=1, 209 | label="1m", 210 | step="month", 211 | stepmode="backward"), 212 | dict(count=3, 213 | label="3m", 214 | step="month", 215 | stepmode="backward"), 216 | dict(count=6, 217 | label="6m", 218 | step="month", 219 | stepmode="backward"), 220 | dict(count=1, 221 | label="YTD", 222 | step="year", 223 | stepmode="todate"), 224 | dict(count=1, 225 | label="1y", 226 | step="year", 227 | stepmode="backward"), 228 | dict(step="all") 229 | ]) 230 | ), 231 | rangeslider=dict( 232 | visible=True 233 | ), 234 | type="date" 235 | ) 236 | ) 237 | 238 | return fig 239 | 240 | -------------------------------------------------------------------------------- /Code/Profiling/Intermittent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/Intermittent/__init__.py -------------------------------------------------------------------------------- /Code/Profiling/Intermittent/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/Intermittent/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Profiling/Intermittent/__pycache__/intermittent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/Intermittent/__pycache__/intermittent.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Profiling/Intermittent/intermittent.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import pandas as pd 3 | import numpy as np 4 | 5 | # statistical functions 6 | from scipy.stats.mstats import winsorize 7 | 8 | class Intermittent: 9 | def cv2_by_group(df, y, grouping_var, highest=0.05, lowest=0.05): 10 | ''' Computes cv2 by group 11 | :params: df as pandas dataframe, y as string, grouping_var as list, highest and lowest as scalars 0<=x<=1 as winsorization percentages 12 | :return: a dataframe''' 13 | cv2_by_freq = df.loc[:, [grouping_var, y]].groupby(grouping_var).apply(lambda x: Intermittent.cv2(x, highest, lowest)).reset_index(level=grouping_var) 14 | cv2_by_freq.columns = [grouping_var, 'cv2_by_group'] 15 | return cv2_by_freq 16 | 17 | def cv2(array, highest=0.05, lowest=0.05): 18 | ''' Winsorization is the process of replacing the extreme values of statistical data in order to limit 19 | the effect of the outliers on the calculations or the results obtained by using that data. 20 | The mean value calculated after such replacement of the extreme values is called winsorized mean. 
21 | :params: array as numpy array, highest and lowest as scalars 0<=x<=1 as winsorization percentages 22 | :return: a scalar''' 23 | winsorized_array = winsorize(array,(highest,lowest)) 24 | cv2 = (np.std(winsorized_array)/np.mean(winsorized_array))**2 25 | return cv2 26 | 27 | def adi(array, highest=0.05, lowest=0.05): 28 | ''' Winsorization is the process of replacing the extreme values of statistical data in order to limit 29 | the effect of the outliers on the calculations or the results obtained by using that data. 30 | The mean value calculated after such replacement of the extreme values is called winsorized mean. 31 | :params: array as numpy array, highest and lowest as scalars 0<=x<=1 as winsorization percentages 32 | :return: a scalar''' 33 | winsorized_array = winsorize(array,(highest,lowest)) 34 | adi = np.mean(winsorized_array) 35 | return adi 36 | 37 | def sddi(array, highest=0.05, lowest=0.05): 38 | ''' Winsorization is the process of replacing the extreme values of statistical data in order to limit 39 | the effect of the outliers on the calculations or the results obtained by using that data. 40 | The mean value calculated after such replacement of the extreme values is called winsorized mean. 41 | :params: array as numpy array, highest and lowest as scalars 0<=x<=1 as winsorization percentages 42 | :return: a scalar''' 43 | winsorized_array = winsorize(array,(highest,lowest)) 44 | sddi = np.std(winsorized_array) 45 | return sddi 46 | 47 | def compute_indicator_values(vect, threshold, perc, quant, highest, lowest): 48 | ''' Computes indicator values 49 | :params: vect as numpy array, threshold as numeric, perc as numeric, quant as numeric, highest and lowest as scalars 0<=x<=1 as winsorization percentages 50 | :return: a dictionary 51 | ''' 52 | 53 | if isinstance(vect,(np.ndarray))==False: 54 | try: 55 | vect = np.array(vect) 56 | except: 57 | raise Exception("identify_intermittent: input vect is not numeric and could not be converted") 58 | if threshold=='': 59 | print("No threshold provided. 
Using vect[0] to compute scores with OFF threshold as percentage of threshold and excluding vect[0] from score computation for all OFF thresholds.") 60 | threshold = vect[0] 61 | vect = vect[1:len(vect)] 62 | print('Threshold:', threshold) 63 | 64 | ### Removing nan 65 | vect = vect.astype(float) 66 | vect = vect[~np.isnan(vect)] 67 | 68 | ### Create low demand list names 69 | list_low_demand = ["zero", "perc_threshold"] 70 | for ind in ["floor_perc_quant_", "perc_quant_"]: 71 | list_low_demand.append(ind + str(quant).replace('0.', '')) 72 | 73 | for LD in list_low_demand: 74 | if LD=="zero": 75 | low_demand = 0 76 | elif LD=="perc_threshold": 77 | low_demand = perc*threshold 78 | elif LD=="floor_perc_quant_"+ str(quant).replace('0.', ''): 79 | low_demand = max([0.250, 0.001*np.quantile(vect, quant)]) 80 | elif LD=="perc_quant_"+ str(quant).replace('0.', ''): 81 | low_demand = perc*np.quantile(vect, quant) 82 | 83 | nzd = vect[vect>low_demand] 84 | k = len(nzd) 85 | 86 | if (sum(vect[vect>low_demand])>=2) & (k>1): 87 | x = np.append([nzd[0]], [nzd[1:k] - nzd[0:(k-1)]]) 88 | 89 | cv2 = Intermittent.cv2(nzd, highest, lowest) 90 | adi = Intermittent.adi(x, highest, lowest) 91 | sddi = Intermittent.sddi(x, highest, lowest) 92 | else: 93 | cv2 = np.nan 94 | adi = np.nan 95 | sddi = np.nan 96 | 97 | res = pd.DataFrame.from_dict({'type': [LD], 'k': [k], 'low_demand': [low_demand], 'cv2': [cv2], 'adi': [adi], 'sddi': [sddi]}) 98 | 99 | return res 100 | 101 | def enhanced_compute_indicator_values(vect, threshold, perc, quant, highest, lowest): 102 | ''' Computes indicator values (enhanced) 103 | :params: vect as numpy array, threshold as numeric, perc as numeric, quant as numeric, highest and lowest as scalars 0<=x<=1 as winsorization percentages 104 | :return: a dictionary 105 | ''' 106 | if isinstance(vect,(np.ndarray))==False: 107 | try: 108 | vect = np.array(vect) 109 | except: 110 | raise Exception("identify_intermittent: input vect is not numeric and could not be converted") 111 | if threshold=='': 112 | print("No threshold provided.
Using vect[0] to compute scores with OFF threshold as percentage of threshold and excluding vect[0] from score computation for all OFF thresholds.") 113 | threshold = vect[0] 114 | vect = vect[1:len(vect)] 115 | print('Threshold:', threshold) 116 | 117 | ### Removing nan and selecting float 118 | vect = vect.astype(float) 119 | vect = vect[~np.isnan(vect)] 120 | 121 | ### Z function 122 | def Z(quant): 123 | cond1 = max([perc * np.quantile(vect, quant), 0.1*perc*np.quantile(vect, quant), 0.25]) 124 | cond2 = min([perc * np.quantile(vect, quant), 0.1*perc*np.quantile(vect, quant)]) 125 | if 0.25 >= cond1: 126 | return 0.25 127 | elif 0.25low_demand] 139 | k = len(nzd) 140 | 141 | if (sum(vect[vect>low_demand])>=2) & (k>1): 142 | x = np.append(np.append([nzd[0]], nzd[1:k] - nzd[0:(k-1)]), [len(vect)+1-nzd[k-1]]) 143 | 144 | cv2 = Intermittent.cv2(nzd, highest, lowest) 145 | adi = Intermittent.adi(x, highest, lowest) 146 | sddi = Intermittent.sddi(x, highest, lowest) 147 | else: 148 | cv2 = np.nan 149 | adi = np.nan 150 | sddi = np.nan 151 | 152 | res = pd.DataFrame.from_dict({'type': [LD], 'k': [k], 'low_demand': [low_demand], 'cv2': [cv2], 'adi': [adi], 'sddi': [sddi]}) 153 | 154 | return res 155 | 156 | def classify_intermittent(df, type, thres_cv2_constant, thres_cv2, thres_adi, thres_sddi, min_time_cons): 157 | ''' Classifies intermittent time series based on indicator values 158 | :params: df as pandas dataframe, type as string, thres_cv2_constant as numeric, thres_cv2 as numeric, thres_adi as numeric, thres_sddi as numeric, min_time_cons as numeric 159 | :return: a pandas dataframe 160 | ''' 161 | # Excluding the ids for which the indicators are np.nan 162 | score_no_nan = df.dropna() 163 | 164 | # Regular 165 | mask_regular = (score_no_nan.type == type) &\ 166 | (score_no_nan.k > min_time_cons) &\ 167 | (score_no_nan.cv2 >= thres_cv2_constant) &\ 168 | (score_no_nan.cv2 < thres_cv2) &\ 169 | (score_no_nan.cv2 < thres_adi) &\ 170 | (score_no_nan.cv2 < thres_sddi) 171 | df_regular = score_no_nan.loc[mask_regular, ] 172 | try: 173 | df_regular.loc[:, 'profile'] = 'regular' 174 | print('classify_intermittent: regular ids', len(df_regular)) 175 | except: 176 | print('classify_intermittent: no regular ids') 177 | 178 | # Constant at zero 179 | mask_constant_zero = (score_no_nan.type == type) &\ 180 | (score_no_nan.k <= min_time_cons) 181 | df_constant_zero = score_no_nan.loc[mask_constant_zero, ] 182 | try: 183 | df_constant_zero.loc[:, 'profile'] = 'constant_zero' 184 | print('classify_intermittent: constant_zero ids', len(df_constant_zero)) 185 | except: 186 | print('classify_intermittent: no constant_zero ids') 187 | 188 | # Constant 189 | mask_constant = (score_no_nan.type == type) &\ 190 | (score_no_nan.k > min_time_cons) &\ 191 | (score_no_nan.cv2 < thres_cv2_constant) &\ 192 | (score_no_nan.cv2 < thres_adi) &\ 193 | (score_no_nan.cv2 < thres_sddi) 194 | df_constant = score_no_nan.loc[mask_constant, ] 195 | try: 196 | df_constant.loc[:, 'profile'] = 'constant' 197 | print('classify_intermittent: constant ids', len(df_constant)) 198 | except: 199 | print('classify_intermittent: no constant ids') 200 | 201 | # Spikes 202 | mask_spikes = (score_no_nan.type == type) &\ 203 | (score_no_nan.k > min_time_cons) &\ 204 | (score_no_nan.cv2 < thres_cv2) &\ 205 | (score_no_nan.cv2 >= thres_adi) &\ 206 | (score_no_nan.cv2 < thres_sddi) 207 | df_spikes = score_no_nan.loc[mask_spikes, ] 208 | try: 209 | df_spikes.loc[:, 'profile'] = 'spikes' 210 | print('classify_intermittent: spikes ids',
len(df_spikes)) 211 | except: 212 | print('classify_intermittent: no spikes ids') 213 | 214 | # Lumpy 215 | mask_lumpy = (score_no_nan.type == type) &\ 216 | (score_no_nan.k > min_time_cons) &\ 217 | (score_no_nan.cv2 >= thres_cv2) &\ 218 | (score_no_nan.cv2 >= thres_adi) &\ 219 | (score_no_nan.cv2 < thres_sddi) 220 | df_lumpy = score_no_nan.loc[mask_lumpy, ] 221 | try: 222 | df_lumpy.loc[:, 'profile'] = 'lumpy' 223 | print('classify_intermittent: lumpy', len(df_lumpy)) 224 | except: 225 | print('classify_intermittent: no lumpy ids') 226 | 227 | # Erratic 228 | mask_erratic = (score_no_nan.type == type) &\ 229 | (score_no_nan.k > min_time_cons) &\ 230 | (score_no_nan.cv2 >= thres_cv2) &\ 231 | (score_no_nan.cv2 < thres_adi) &\ 232 | (score_no_nan.cv2 < thres_sddi) 233 | df_erratic = score_no_nan.loc[mask_erratic, ] 234 | try: 235 | df_erratic.loc[:, 'profile'] = 'erratic' 236 | print('classify_intermittent: erratic ids', len(df_erratic)) 237 | except: 238 | print('classify_intermittent: no erratic ids') 239 | 240 | # Unforecastable time 241 | mask_unforecastable_time = (score_no_nan.type == type) &\ 242 | (score_no_nan.k > min_time_cons) &\ 243 | (score_no_nan.cv2 < thres_cv2) &\ 244 | (score_no_nan.cv2 >= thres_sddi) 245 | df_unforecastable_time = score_no_nan.loc[mask_unforecastable_time, ] 246 | try: 247 | df_unforecastable_time.loc[:, 'profile'] = 'unforecastable_time' 248 | print('classify_intermittent: unforecastable_time ids', len(df_unforecastable_time)) 249 | except: 250 | print('classify_intermittent: no unforecastable_time ids') 251 | 252 | # Unforecastable quantity 253 | mask_unforecastable_quantity = (score_no_nan.type == type) &\ 254 | (score_no_nan.k > min_time_cons) &\ 255 | (score_no_nan.cv2 >= thres_cv2) &\ 256 | (score_no_nan.cv2 >= thres_sddi) 257 | df_unforecastable_quantity = score_no_nan.loc[mask_unforecastable_quantity, ] 258 | try: 259 | df_unforecastable_quantity.loc[:, 'profile'] = 'unforecastable_quantity' 260 | print('classify_intermittent: unforecastable_quantity ids', len(df_unforecastable_quantity)) 261 | except: 262 | print('classify_intermittent: no unforecastable_quantity ids') 263 | 264 | # df_profiling 265 | df_profiling = pd.concat([df_regular, df_constant_zero, df_constant, df_spikes, df_lumpy, df_erratic, df_unforecastable_time, df_unforecastable_quantity], axis=0) 266 | 267 | return df_profiling 268 | 269 | def call_intermittent_function(func, *args): 270 | from Code.Profiling.Intermittent.intermittent import Intermittent 271 | func_dict = {'enhanced_compute_indicator_values': Intermittent.enhanced_compute_indicator_values, 'compute_indicator_values': Intermittent.compute_indicator_values} 272 | result = func_dict.get(func)(*args) 273 | return result 274 | 275 | -------------------------------------------------------------------------------- /Code/Profiling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/__init__.py -------------------------------------------------------------------------------- /Code/Profiling/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- 
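The profiling API above is easiest to grasp with a concrete call sequence. The sketch below is illustrative only and is not part of the repository: the demo panel, the column names ('id', 'value') and every threshold value are assumptions to be replaced with real data and tuned cut-offs; only the Intermittent methods themselves come from Code/Profiling/Intermittent/intermittent.py.

# Illustrative only: demo data, column names and cut-offs are assumptions.
import numpy as np
import pandas as pd
from Code.Profiling.Intermittent.intermittent import Intermittent

# Hypothetical demand panel: two series, mostly zeros with occasional spikes.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'id': np.repeat(['site_a', 'site_b'], 200),
    'value': rng.poisson(0.3, 400) * rng.gamma(2.0, 5.0, 400),
})

# Score each series; with quant=0.5 the last low-demand label is 'perc_quant_5'.
scores = []
for i, group in df.groupby('id'):
    res = Intermittent.compute_indicator_values(
        group['value'].to_numpy(), threshold='', perc=0.1, quant=0.5,
        highest=0.05, lowest=0.05)
    res['id'] = i
    scores.append(res)
score_df = pd.concat(scores, ignore_index=True)

# Classify each series into regular / constant / spikes / lumpy / erratic /
# unforecastable profiles; the cut-offs below are placeholders to be tuned.
profiles = Intermittent.classify_intermittent(
    score_df, type='perc_quant_5', thres_cv2_constant=0.1, thres_cv2=0.49,
    thres_adi=1.32, thres_sddi=6.0, min_time_cons=10)
print(profiles)

The same scoring call can also be dispatched by name through Intermittent.call_intermittent_function('compute_indicator_values', ...), via the dispatcher defined at the end of the module.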
/Code/Regressors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Regressors/__init__.py -------------------------------------------------------------------------------- /Code/Regressors/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Regressors/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Regressors/__pycache__/regressors.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Regressors/__pycache__/regressors.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Regressors/regressors.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import pandas as pd 3 | import numpy as np 4 | import re 5 | 6 | # file management functions 7 | import os 8 | import glob 9 | import holidays as h 10 | 11 | # time management functions 12 | import datetime as dt 13 | 14 | # custom functions 15 | from Configuration.config import cfg_path 16 | from Code.Utils.utils import Utils 17 | 18 | class Regressors: 19 | def create_interactions(df, var1, var2): 20 | """ 21 | Adds interaction terms between two variables as var1*var2 to dataframe 22 | :params: dataframe, var1 and var 2 as string 23 | :return: a Pandas dataframe 24 | """ 25 | variables = df[[var1, var2]] 26 | for i in range(0, variables.columns.size): 27 | for j in range(0, variables.columns.size): 28 | col1 = variables.columns[i] 29 | col2 = variables.columns[j] 30 | if i <= j: 31 | name = col1 + "*" + col2 32 | df.loc[:, name] = variables[col1] * variables[col2] 33 | 34 | df.drop(columns = [var1 + "*" + var1], inplace=True) 35 | df.drop(columns = [var2 + "*" + var2], inplace=True) 36 | return df 37 | 38 | def create_non_linear_terms(df, var, n): 39 | """ 40 | Adds non linear terms as var^2 to dataframe 41 | :params: dataframe, var as string and n as int 42 | :return: a Pandas dataframe 43 | """ 44 | name = var + "^" + str(n) 45 | df.loc[:, name] = df.loc[:, var]**n 46 | return df 47 | 48 | def add_holidays_by_country(df, date_var, country): 49 | """ 50 | Adds holidays a dummy variable (0/1) to dataframe 51 | :params: dataframe, date_var as string, country as string 52 | :return: a Pandas dataframe 53 | """ 54 | if 'holidays' in list(df.columns): 55 | print('add_holidays_by_country: holidays column already present') 56 | else: 57 | holidays = eval("h." 
+ country.capitalize() + "()") 58 | date_holidays = df.loc[:, date_var].apply(lambda x: int(1) if x in holidays else int(0)) 59 | date_holidays = pd.DataFrame(date_holidays) 60 | date_holidays.columns = pd.Index(['holidays']) 61 | df = pd.concat([df, date_holidays], axis=1) 62 | return df 63 | 64 | def add_weekdays(df, date_var): 65 | """ 66 | Adds weekdays a dummy variables (0/1) for each weekday to dataframe 67 | :params: dataframe, date_var as string 68 | :return: a Pandas dataframe 69 | """ 70 | df.loc[:,'wd_mon'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 0 else int(0)) 71 | df.loc[:,'wd_tue'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 1 else int(0)) 72 | df.loc[:,'wd_wed'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 2 else int(0)) 73 | df.loc[:,'wd_thu'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 3 else int(0)) 74 | df.loc[:,'wd_fri'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 4 else int(0)) 75 | df.loc[:,'wd_sat'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 5 else int(0)) 76 | df.loc[:,'wd_sun'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 6 else int(0)) 77 | return df 78 | 79 | def add_months(df, date_var): 80 | """ 81 | Adds months a dummy variables (0/1) for each month to dataframe 82 | :params: dataframe, date_var as string 83 | :return: a Pandas dataframe 84 | """ 85 | for i in range(1, 13): 86 | if i < 10: 87 | varname = 'month_0' + str(i) 88 | else: 89 | varname = 'month_' + str(i) 90 | 91 | df.loc[:, varname] = df.loc[:, date_var].apply(lambda x: int(1) if x.month == i else int(0)) 92 | return df 93 | 94 | def calculate_degree_days(df, base_temperature, temperature): 95 | """ 96 | Calculate the Degree Days Heating and Cooling values 97 | :params: dataframe, base temperature to start and actual temperature as string 98 | :return: a pandas dataframe 99 | """ 100 | df['DDC_temperature'] = (df[temperature] - df[base_temperature]).clip(lower=0) 101 | df['DDH_temperature'] = (df[base_temperature] - df[temperature]).clip(lower=0) 102 | 103 | return df 104 | 105 | def merge_holidays_by_date(df, df_holidays, id): 106 | """ 107 | Merge Holiday df with the train df 108 | :params: df as dataframe, df_holidays as df containing info on holidays, id as string 109 | :return: a pandas dataframe 110 | """ 111 | date_var = Utils.find_date(df) 112 | date_var_holidays = Utils.find_date(df_holidays) 113 | 114 | cols_to_keep = list(df.columns) 115 | 116 | df['date_key'] = df[date_var].dt.year.astype(str) + df[date_var].dt.month.astype(str) + df[date_var].dt.day.astype(str) 117 | df_holidays['date_key'] = df_holidays[date_var_holidays].dt.year.astype(str) + df_holidays[date_var_holidays].dt.month.astype(str) + df_holidays[date_var_holidays].dt.day.astype(str) 118 | 119 | df.loc[:, 'holidays'] = int(0) 120 | df_merge = pd.merge(df, df_holidays, how="left", on=["date_key", id], indicator=True) 121 | df_merge.loc[df_merge._merge=='both', 'holidays'] = int(1) 122 | 123 | cols_to_keep = cols_to_keep + ['holidays'] 124 | df = df_merge[cols_to_keep].copy() 125 | 126 | return df 127 | 128 | def merge_additional_days_off(df, df_metadata, id, dict_days_off): 129 | """ 130 | Merge Site Weekend data with train df 131 | :params: df as dataframe, df_metadata as df containing additional info, id as string, dict_days_off as dictionary 132 | :return: a pandas dataframe 133 | """ 134 | date_var = Utils.find_date(df) 135 | 136 | # Sites only had weekly leaves on Friday, Saturday and 
Sunday 137 | list_days_off = list(dict_days_off.keys()) 138 | df.loc[:, 'day_off'] = int(0) 139 | for d in list_days_off: 140 | leave = (df[date_var].dt.dayofweek == dict_days_off[d]) & (df[id].isin(df_metadata[df_metadata[d]][id])) 141 | df.loc[leave==True, 'day_off'] = int(1) 142 | 143 | df['day_off'] = df['day_off'].astype("int8") 144 | 145 | return df 146 | 147 | def merge_weather(df, weather, date_var, id): 148 | """ 149 | Merge weather data into the train df 150 | :params: df as dataframe, weather as dataframe with weather info, date_var as string, id as string 151 | :return: a pandas dataframe 152 | 153 | """ 154 | 155 | date_var = Utils.find_date(df) 156 | date_var_weather = Utils.find_date(weather) 157 | 158 | # drop duplicate values in weather and pick the closest weather station 159 | weather_cleaned = weather.sort_values([date_var, id, "distance"]).groupby([date_var, id]).first().reset_index() 160 | assert weather_cleaned.groupby([date_var, id]).count().max().max() == 1 161 | 162 | df = pd.merge(df.sort_values([date_var, id]), weather_cleaned.sort_values([date_var_weather]), left_on=[date_var, id], right_on= [date_var_weather, id], how='left', validate="m:1") 163 | 164 | return df -------------------------------------------------------------------------------- /Code/Regressors/similar_day.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import numpy as np 3 | import pandas as pd 4 | import holidays as h 5 | from functools import reduce 6 | 7 | # datetime functions 8 | import dateutil 9 | import datetime 10 | from dateutil.relativedelta import relativedelta 11 | 12 | # custom functions 13 | from Code.Regressors.regressors import Regressors 14 | from Code.Utils.utils import AlphabeticalCombinations, Utils 15 | 16 | class SimilarDay: 17 | def get_similar_days_in_previous_year(dates, country): 18 | """ 19 | Retrieves the similar day for a given date. 20 | :param dates: a list-like object of dates, country as string 21 | :return: a Pandas series of similar days 22 | """ 23 | d = pd.to_datetime(pd.Series(dates)) 24 | holidays = eval("h." + country.capitalize() + "()") 25 | return d.apply(lambda x: SimilarDay.get_similar_day_in_previous_year(x, holidays)) 26 | 27 | def get_similar_days_in_previous_week(dates, country): 28 | """ 29 | Retrieves the similar day for a given date. 30 | :param dates: a list-like object of dates, country as string 31 | :return: a Pandas series of similar days 32 | """ 33 | d = pd.to_datetime(pd.Series(dates)) 34 | holidays = eval("h." + country.capitalize() + "()") 35 | return d.apply(lambda x: SimilarDay.get_similar_day_in_previous_week(x, holidays)) 36 | 37 | 38 | def get_similar_day_in_previous_year(d, holiday_calendar): 39 | """ 40 | Retrieves the similar day for a given date. If the given date is not an holiday, the similar day is the 41 | closest day of the previous year in terms of calendar position which shares the weekday. If such a date is an holiday, 42 | the same weekday of the week before is considered. 43 | If the given date is an holiday, its similar day is the closest holiday to the given date in the previous year. 
44 | :param d: a date 45 | :param holiday_calendar: a calendar from holidays package 46 | :return: the similar day 47 | """ 48 | if not d or pd.isna(d): 49 | return None 50 | 51 | new_date = d - relativedelta(years=1) 52 | holiday = holiday_calendar.get(d) 53 | diff = d.weekday() - new_date.weekday() if d.weekday() >= new_date.weekday() \ 54 | else d.weekday() - new_date.weekday() + 7 55 | 56 | if not holiday: 57 | new_date = new_date + datetime.timedelta(days=diff) 58 | while holiday_calendar.get(new_date): 59 | new_date = new_date - datetime.timedelta(days=7) 60 | # elif holiday == 'Pasqua di Resurrezione': 61 | # new_date = dateutil.easter.easter(new_date.year) 62 | # elif holiday == "Lunedì dell'Angelo": 63 | # new_date = dateutil.easter.easter(new_date.year) + datetime.timedelta(days=1) 64 | 65 | return new_date 66 | 67 | def get_similar_day_in_previous_week(d, holiday_calendar): 68 | """ 69 | Retrieves the similar day for a given date. If the given date is not an holiday, the similar day is the 70 | closest day of the previous year in terms of calendar position which shares the weekday. If such a date is an holiday, 71 | the same weekday of the week before is considered. 72 | If the given date is an holiday, its similar day is the closest holiday to the given date in the previous year. 73 | :param d: a date 74 | :param holiday_calendar: a calendar from holidays package 75 | :return: the similar day 76 | """ 77 | if not d or pd.isna(d): 78 | return None 79 | 80 | new_date = d - relativedelta(weeks=1) 81 | holiday = holiday_calendar.get(d) 82 | diff = d.weekday() - new_date.weekday() if d.weekday() >= new_date.weekday() \ 83 | else d.weekday() - new_date.weekday() + 7 84 | 85 | if not holiday: 86 | new_date = new_date + datetime.timedelta(days=diff) 87 | while holiday_calendar.get(new_date): 88 | new_date = new_date - datetime.timedelta(days=7) 89 | # elif holiday == 'Pasqua di Resurrezione': 90 | # new_date = dateutil.easter.easter(new_date.year) 91 | # elif holiday == "Lunedì dell'Angelo": 92 | # new_date = dateutil.easter.easter(new_date.year) + datetime.timedelta(days=1) 93 | 94 | return new_date 95 | 96 | class StandardConsumption: 97 | def get_standard_consumption_as_mean(df, id, date_var, var, country): 98 | """ 99 | Retrieves the standard consumption for a given date as hourly monthly mean differentiated by holiday, weekend, weekdays. 
100 | :params: dataframe and date_var as string, var as string, country as string 101 | :return: the similar day 102 | """ 103 | 104 | df = Regressors.add_holidays_by_country(df, date_var, country) 105 | df = Regressors.add_weekdays(df, date_var) 106 | df.loc[:, 'day'] = df.loc[:, date_var].dt.day 107 | df.loc[:, 'hour'] = df.loc[:, date_var].dt.hour 108 | df.loc[:, 'month'] = df.loc[:, date_var].dt.month 109 | 110 | timedelta = Utils.delta_format(abs(np.diff(df[date_var])).mean()) 111 | freq = Utils.find_freq(timedelta) 112 | 113 | if freq == 'D': 114 | freq_var='day' 115 | else: 116 | freq_var='hour' 117 | 118 | # Compute standard consumption as means 119 | mask = (~df[var].isnull()) & ((df.wd_mon==1) | (df.wd_tue==1) | (df.wd_wed==1) | (df.wd_thu==1) | (df.wd_fri==1)) & (df.holidays==0) 120 | df_mean_weekdays = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.mean).reset_index() 121 | new_var = var + '_std_weekdays' 122 | df_mean_weekdays.rename(columns={var: new_var}, inplace=True) 123 | df_mean_weekdays.loc[df_mean_weekdays[new_var]<0, new_var] = 0 124 | 125 | mask = (~df[var].isnull()) & ((df.wd_sat==1) | (df.wd_sun==1)) & (df.holidays==0) 126 | df_mean_weekend = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.mean).reset_index() 127 | new_var = var + '_std_weekend' 128 | df_mean_weekend.rename(columns={var: new_var}, inplace=True) 129 | df_mean_weekend.loc[df_mean_weekend[new_var]<0, new_var] = 0 130 | 131 | mask = (~df[var].isnull()) & (df.holidays==1) 132 | df_mean_holidays = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.mean).reset_index() 133 | new_var = var + '_std_holidays' 134 | df_mean_holidays.rename(columns={var: new_var}, inplace=True) 135 | df_mean_holidays.loc[df_mean_holidays[new_var]<0, new_var] = 0 136 | 137 | # Merging 138 | dfs = [df_mean_holidays, df_mean_weekdays, df_mean_weekend] 139 | df_mean = reduce(lambda left,right: pd.merge(left,right,how='outer', on=[id, 'month', freq_var], validate='1:1'), dfs) 140 | df = pd.merge(df, df_mean, how='left', on=[id, 'month', freq_var], validate='m:1') 141 | 142 | return df 143 | 144 | 145 | def get_minimum_consumption(df, date_var, var, country): 146 | """ 147 | Retrieves the minimum consumption for a given date as hourly monthly minimum value differentiated by holiday, weekend, night. 
148 | :params: dataframe and date_var as string, var as string, country as string 149 | :return: the similar day 150 | """ 151 | 152 | df = Regressors.add_holidays_by_country(df, date_var, country) 153 | df = Regressors.add_weekdays(df, date_var) 154 | df.loc[:, 'day'] = df.loc[:, date_var].dt.day 155 | df.loc[:, 'hour'] = df.loc[:, date_var].dt.hour 156 | df.loc[:, 'month'] = df.loc[:, date_var].dt.month 157 | 158 | timedelta = Utils.delta_format(abs(np.diff(df[date_var])).mean()) 159 | freq = Utils.find_freq(timedelta) 160 | 161 | if freq == 'D': 162 | freq_var='day' 163 | else: 164 | freq_var='hour' 165 | 166 | # Compute min consumption 167 | mask = (~df[var].isnull()) & (df.holidays==0) & ((df.wd_mon==1) | (df.wd_tue==1) | (df.wd_wed==1) | (df.wd_thu==1) | (df.wd_fri==1)) 168 | df_min_weekdays = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.min).reset_index() 169 | new_var = var + '_min_weekdays' 170 | df_min_weekdays.rename(columns={var: new_var}, inplace=True) 171 | df_min_weekdays.loc[df_min_weekdays[new_var]<0, new_var] = 0 172 | 173 | mask = (~df[var].isnull()) & ((df.wd_sat==1) | (df.wd_sun==1)) & (df.holidays==0) 174 | df_min_weekend = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.min).reset_index() 175 | new_var = var + '_min_weekend' 176 | df_min_weekend.rename(columns={var: new_var}, inplace=True) 177 | df_min_weekend.loc[df_min_weekend[new_var]<0, new_var] = 0 178 | 179 | mask = (~df[var].isnull()) & (df.holidays==1) 180 | df_min_holidays = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.min).reset_index() 181 | new_var = var + '_min_holidays' 182 | df_min_holidays.rename(columns={var: new_var}, inplace=True) 183 | df_min_holidays.loc[df_min_holidays[new_var]<0, new_var] = 0 184 | 185 | # Merging 186 | dfs = [df_min_holidays, df_min_weekdays, df_min_weekend] 187 | df_min = reduce(lambda left,right: pd.merge(left,right,how='outer', on=[id, 'month', freq_var], validate='1:1'), dfs) 188 | df = pd.merge(df, df_min, how='left', on=[id, 'month', freq_var], validate='m:1') 189 | 190 | return df 191 | 192 | 193 | -------------------------------------------------------------------------------- /Code/Regressors/temperatures.py: -------------------------------------------------------------------------------- 1 | # selenium for web driving 2 | from logging import raiseExceptions 3 | from selenium import webdriver 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.webdriver import ActionChains 8 | from selenium.webdriver.common.keys import Keys 9 | from selenium.webdriver.chrome.options import Options 10 | 11 | # time for pausing between navigation 12 | import time 13 | import glob 14 | import shutil 15 | 16 | # datetime functions 17 | import datetime as dt 18 | 19 | # file management functions 20 | import os 21 | import configparser 22 | import ctypes 23 | 24 | # data elaboration functions 25 | import pandas as pd 26 | import numpy as np 27 | from openpyxl import load_workbook 28 | from functools import reduce 29 | 30 | # custom functions 31 | from Code.Utils.utils import Utils, AlphabeticalCombinations 32 | 33 | class Temperatures: 34 | 35 | def ten_year(df, id, date_var, freq, temperature_list, start_date, end_date): 36 | """ 37 | Computes ten year temperatures and asis temperatures 38 | :params: dataframe 39 | 
:return: a Pandas dataframe, a .pkl file and a .xlsx file 40 | """ 41 | ten_year_list = [] 42 | ten_year_overall_list = [] 43 | for t in temperature_list: 44 | ten_year_list = ten_year_list + [t + '_ten_year'] 45 | ten_year_overall_list = ten_year_overall_list + [t + '_ten_year_overall'] 46 | 47 | df_seq = Utils.add_seq(df, date_var = date_var, serie=id, freq = freq, start_date=start_date, end_date=end_date) 48 | df_seq.loc[:, 'months_days'] = df_seq.loc[:, date_var].dt.strftime('%m/%d') 49 | 50 | # Defining averages by id 51 | df_to_merge = pd.pivot_table(df_seq, values=temperature_list, index=[id, 'months_days'], aggfunc=np.mean).reset_index() 52 | col_list = [id, 'months_days'] + ten_year_list 53 | df_to_merge.columns = col_list 54 | 55 | # Defining overall averages 56 | df_to_merge_overall = pd.pivot_table(df_seq, values=temperature_list, index=['months_days'], aggfunc=np.mean).reset_index() 57 | col_list_overall = ['months_days'] + ten_year_overall_list 58 | df_to_merge_overall.columns = col_list_overall 59 | 60 | # Merging 61 | df_merge = pd.merge(df_seq, df_to_merge, on=[id, 'months_days'], how='left', validate='m:1') 62 | df_merge_overall = pd.merge(df_merge, df_to_merge_overall, on=['months_days'], how='left', validate='m:1') 63 | 64 | ### Creating As-Is temperatures: where available use actual temp, if not use ten year 65 | for t in temperature_list: 66 | asis_name = t + '_asis' 67 | ten_year_name = t + '_ten_year' 68 | ten_year_overall_name = t + '_ten_year_overall' 69 | df_merge_overall.loc[:, asis_name] = df_merge_overall.loc[:, t] 70 | df_merge_overall.loc[df_merge_overall[asis_name].isnull(), asis_name] = df_merge_overall.loc[:, ten_year_name] 71 | df_merge_overall.loc[df_merge_overall[asis_name].isnull(), asis_name] = df_merge_overall.loc[:, ten_year_overall_name] 72 | 73 | if (any(df_merge_overall[asis_name].isnull())): 74 | print('ten_year: asis temperatures still CONTAIN nan value: removing') 75 | df_merge_overall = df_merge_overall.loc[df_merge_overall[asis_name].isnull()==False, ] 76 | else: 77 | print('ten_year: asis temperatures do NOT contain any nan value') 78 | 79 | df_ten_year = df_merge_overall.loc[:, ['site_id', 'timestamp', 'temperature', 'distance', 'months_days', 80 | 'temperature_ten_year', 'temperature_asis']] 81 | 82 | return df_ten_year 83 | 84 | 85 | -------------------------------------------------------------------------------- /Code/Scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__init__.py -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/forecast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/forecast.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/kpi.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/kpi.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/scoring.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/scoring.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/train.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/train.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/train_test.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/train_test.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/forecast.py: -------------------------------------------------------------------------------- 1 | # file management functions 2 | import os 3 | import glob 4 | from pyexpat.errors import XML_ERROR_UNEXPECTED_STATE 5 | 6 | # data elaboration functions 7 | import numpy as np 8 | import pandas as pd 9 | from openpyxl import load_workbook 10 | import re 11 | import pickle 12 | 13 | # datetime functions 14 | import datetime as dt 15 | 16 | # AI functions 17 | import xgboost as xgb 18 | from sklearn.linear_model import LinearRegression 19 | from sklearn.metrics import mean_squared_error 20 | from sklearn.model_selection import train_test_split 21 | 22 | # custom functions 23 | from Code.Utils.utils import Utils 24 | 25 | class Forecasting: 26 | def forecast(dict_test, trained_model): 27 | """ 28 | Generate forecast 29 | :params: dict_test as dictionary, trained_model as dictionary from training 30 | :return: a dictionary 31 | """ 32 | X_test = dict_test['X_test'] 33 | date_array_test = dict_test["date_array"] 34 | list_id = dict_test['list_id'] 35 | date = Utils.find_date(dict_test['y_tilda']) 36 | 37 | # Regressors list 38 | regressors_list = sorted(list(set(list(X_test.columns)) - set(list_id))) 39 | 40 | # Forecasting 41 | print('Forecasting') 42 | 43 | y_test = X_test.loc[:, regressors_list].copy() 44 | y_hat = trained_model.predict(y_test) 45 | 46 | ### Adjusting negative values 47 | y_hat_series_pos = y_hat.copy() 48 | y_hat_series_pos[y_hat_series_pos < 0] = 0 49 | 50 | forecasted_model = {'df_fcst': pd.DataFrame({date: date_array_test, 'fcst': y_hat_series_pos})} 51 | 52 | print('Forecasting completed') 53 | return forecasted_model 54 | 55 | -------------------------------------------------------------------------------- /Code/Scoring/kpi.py: -------------------------------------------------------------------------------- 1 | 2 | # data elaboration functions 3 | from attr import validate 4 | import pandas as pd 5 | from six.moves import collections_abc 6 | import string 7 | import numpy as np 8 | import math 9 | 10 | # datetime functions 11 | import datetime as dt 12 | 13 | # file 
management functions 14 | import os 15 | import sys 16 | import opendatasets as od 17 | import pickle 18 | from pathlib import Path 19 | 20 | # data science functions 21 | from sklearn.metrics import mean_absolute_error 22 | 23 | # custom functions 24 | from Code.Utils.utils import Utils 25 | from Code.Scoring.train import Training 26 | from Code.Scoring.forecast import Forecasting 27 | 28 | class Kpi: 29 | def find_mae(y, dict_train, dict_test, dict_models): 30 | """ 31 | Compute mean absolute error 32 | :params: y as string, dict_train as dictionary, dict_test as dictionary, dict_models as dictionary 33 | :return: a dictionary 34 | """ 35 | 36 | dict_test_no_nan = dict_test.copy() 37 | dict_test_no_nan['X_test'] = dict_test['X_test'].dropna() 38 | dict_test_no_nan['y_tilda'] = dict_test['y_tilda'].dropna() 39 | 40 | date_var_y_tilda = Utils.find_date(dict_test_no_nan['y_tilda']) 41 | dict_test_no_nan['date_array'] = dict_test_no_nan['y_tilda'].loc[:, date_var_y_tilda] 42 | 43 | # Training and forecasting 44 | dict_kpi = {} 45 | for m in list(dict_models.keys()): 46 | print('kpi for model', m) 47 | try: 48 | model = dict_models[m] 49 | trained_model = Training.train(dict_train, model) 50 | forecasted_model = Forecasting.forecast(dict_test, trained_model = trained_model) 51 | y_tilda = dict_test['y_tilda'].copy() 52 | y_tilda_date = Utils.find_date(y_tilda) 53 | y_hat = forecasted_model['df_fcst'].copy() 54 | y_hat_date = Utils.find_date(y_hat) 55 | 56 | df_merge = pd.merge(y_tilda, y_hat, left_on=y_tilda_date, right_on=y_hat_date, how='inner', validate='1:1').dropna() 57 | mae = mean_absolute_error(df_merge[y], df_merge['fcst']) 58 | dict_kpi[m] = mae 59 | except: 60 | print('kpi for model', m, 'could not be computed') 61 | 62 | return dict_kpi 63 | 64 | def compute_error(df, fcst, y): 65 | """ 66 | Compute error as forecast-actual 67 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 68 | :return: a dataframe 69 | """ 70 | if 'error' in df.columns: 71 | df = df.drop(columns='error') 72 | 73 | df.loc[:, 'error'] = (df[fcst] - df[y]) 74 | return df 75 | 76 | def compute_absolute_error(df, fcst, y): 77 | """ 78 | Compute absolute error as abs(forecast-actual) 79 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 80 | :return: a dataframe 81 | """ 82 | if 'absolute_error' in df.columns: 83 | df = df.drop(columns='absolute_error') 84 | 85 | df.loc[:, 'absolute_error'] = abs(df[fcst] - df[y]) 86 | return df 87 | 88 | def compute_absolute_percentage_error(df, fcst, y): 89 | """ 90 | Compute absolute % error 91 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 92 | :return: a dataframe 93 | """ 94 | if 'absolute_error' in df.columns: 95 | df = df.drop(columns='absolute_error') 96 | 97 | if 'absolute_percentage_error' in df.columns: 98 | df = df.drop(columns='absolute_percentage_error') 99 | 100 | df = Kpi.compute_absolute_error(df, fcst, y) 101 | df.loc[:, 'absolute_percentage_error'] = df.loc[:, 'absolute_error']/df.loc[:, y] 102 | return df 103 | 104 | def compute_mean_error(df, fcst, y): 105 | """ 106 | Compute mean error 107 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 108 | :return: a scalar 109 | """ 110 | df = Kpi.compute_error(df, fcst, y) 111 | 
mean_error = df.loc[:, 'error'].mean() 112 | return mean_error 113 | 114 | def compute_mae(df, fcst, y): 115 | """ 116 | Compute mean absolute error 117 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 118 | :return: a scalar 119 | """ 120 | df = Kpi.compute_absolute_error(df, fcst, y) 121 | var = 'absolute_error' 122 | mask = (df[var].isnull()==False) & (np.isneginf(df[var])==False) & (np.isposinf(df[var])==False) 123 | mae = df.loc[mask==True, var].mean() 124 | return mae 125 | 126 | def compute_mape(df, fcst, y): 127 | """ 128 | Compute mean absolute % error 129 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 130 | :return: a scalar 131 | """ 132 | df = Kpi.compute_absolute_percentage_error(df, fcst, y) 133 | var = 'absolute_percentage_error' 134 | mask = (df[var].isnull()==False) & (np.isneginf(df[var])==False) & (np.isposinf(df[var])==False) 135 | mape = df.loc[mask==True, var].mean() 136 | return mape 137 | -------------------------------------------------------------------------------- /Code/Scoring/scoring.py: -------------------------------------------------------------------------------- 1 | 2 | # data elaboration functions 3 | import pandas as pd 4 | from six.moves import collections_abc 5 | import string 6 | import numpy as np 7 | 8 | # datetime functions 9 | import datetime as dt 10 | 11 | # file management functions 12 | import os 13 | import sys 14 | import opendatasets as od 15 | import pickle 16 | from pathlib import Path 17 | 18 | # data science functions 19 | # custom functions 20 | from Code.Utils.utils import Utils 21 | from Code.Scoring.kpi import Kpi 22 | 23 | class Scoring: 24 | def find_best_algorithm(y, dict_train, dict_test, dict_algorithms, out_of_sample): 25 | """ 26 | Finds the best performing algorithm in terms of min mean absolute error 27 | :params: y as string, dict_train as dictionary, dict_test as dictionary, dict_algorithm as dictionary, out_of_sample as string 28 | :return: a string 29 | """ 30 | try: 31 | dict_kpi = Kpi.find_mae(y, dict_train, dict_test, dict_algorithms) 32 | # Best model 33 | df_best_model = pd.DataFrame.from_dict(dict_kpi, orient='index').reset_index() 34 | df_best_model.rename(columns={'index': 'model', 0: 'mae'}, inplace=True) 35 | best_model = df_best_model.loc[df_best_model.mae==df_best_model.mae.min(), 'model'].reset_index(drop=True)[0] 36 | except: 37 | print('best model could not be computed, no KPI available, using out of sample algorithm. 
Check to have an overlap between training and test sets dates!') 38 | best_model = out_of_sample 39 | return best_model 40 | 41 | def stats_per_site(df, id, date_var): 42 | """ 43 | Helper function to identify amount of data per site 44 | :params: df as pandas dataframe, id as string, date_var as string 45 | :return: a pandas dataframe 46 | """ 47 | return pd.DataFrame( 48 | [{ 49 | id: site, 50 | "Years": df.loc[(df[id] == site), date_var].dt.year.unique(), 51 | "Max Timestamp": df.loc[(df[id] == site), date_var].max(), 52 | "Min Timestamp": df.loc[(df[id] == site), date_var].min(), 53 | "Samples": df[(df[id] == site)].count().sum() 54 | } for site in df[id].unique()] 55 | ).sort_values("Samples", ascending=False) 56 | 57 | def resample_train_data(df, date_var, id, predict_col, sampling="D"): 58 | """ 59 | Resample the data to a particular frequency 60 | :params: df as pandas dataframe, date_var as string, id as string, sampling as string of frequency 61 | """ 62 | try: 63 | df_resampled = df.groupby(id) \ 64 | .apply(lambda group: group.set_index(date_var).resample(sampling).interpolate(method="time")) \ 65 | .reset_index(level=1) \ 66 | .reset_index(drop=True) \ 67 | .dropna(subset=[predict_col]) 68 | except: 69 | print('resample_train_data: data are already at', sampling, 'frequency') 70 | df_resampled = df.copy() 71 | 72 | return df_resampled 73 | 74 | 75 | -------------------------------------------------------------------------------- /Code/Scoring/train.py: -------------------------------------------------------------------------------- 1 | # file management functions 2 | import os 3 | import glob 4 | 5 | # data elaboration functions 6 | import numpy as np 7 | import pandas as pd 8 | from openpyxl import load_workbook 9 | import re 10 | import pickle 11 | 12 | # datetime functions 13 | import datetime as dt 14 | 15 | # AI functions 16 | import xgboost as xgb 17 | from sklearn.linear_model import LinearRegression 18 | from sklearn.metrics import mean_absolute_error 19 | 20 | # custom functions 21 | from Code.Utils.utils import Utils 22 | 23 | class Training: 24 | def train(dict_model_to_train, model): 25 | """ 26 | Generate train 27 | :params: dict_model_to_train as dictionary, model as string 28 | :return: a pandas dictionary 29 | """ 30 | y = dict_model_to_train['y'] 31 | X_train = dict_model_to_train['X_train'] 32 | Y_train = dict_model_to_train['Y_train'] 33 | list_id = dict_model_to_train['list_id'] 34 | regressors_list = sorted(list(set(list(X_train.columns)) - set(list_id))) 35 | 36 | # Training 37 | print('Training') 38 | 39 | X = X_train.loc[:, sorted(regressors_list)].copy().reset_index(drop=True) 40 | Y = Y_train.loc[:, y].copy().reset_index(drop=True) 41 | 42 | trained_model = model.fit(X,Y) 43 | 44 | print('Training completed') 45 | return trained_model 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /Code/Scoring/train_test.py: -------------------------------------------------------------------------------- 1 | # file management functions 2 | import os 3 | import glob 4 | 5 | # data elaboration functions 6 | import numpy as np 7 | import pandas as pd 8 | from openpyxl import load_workbook 9 | 10 | # datetime functions 11 | import datetime as dt 12 | 13 | # custom functions 14 | from Code.Utils.utils import Utils 15 | 16 | 17 | class TrainTest: 18 | def define_train_test_set_dates(df, y, train_start_date, train_end_date, test_start_date, test_end_date, test_size=0.33): 19 | """ 20 | Defines train and test dates 
if left blank 21 | :params: df as pandas dataframe, y as string, train_start_date as string in format '%Y-%m-%d', train_end_date as string in format '%Y-%m-%d', test_start_date as string in format '%Y-%m-%d', test_end_date as string in format '%Y-%m-%d', test_size as percentage 22 | :return: a dictionary 23 | """ 24 | date_var = Utils.find_date(df) 25 | min_train_start_date = df.loc[(df[y].isnull()==False), date_var].min() 26 | max_train_end_date = df.loc[(df[y].isnull()==False), date_var].max() 27 | min_test_start_date = df.loc[(df[y].isnull()==True), date_var].min() 28 | max_test_end_date = df.loc[(df[y].isnull()==True), date_var].max() 29 | range = pd.date_range(start=min_train_start_date,end=max_train_end_date) 30 | 31 | # Test set: identify latest date and set test set as latest date - test size offset 32 | if test_end_date=='': 33 | test_end_date = max_test_end_date 34 | else: 35 | test_end_date = pd.to_datetime(test_end_date, format='%Y-%m-%d') 36 | 37 | if test_start_date=='': 38 | offset_date = pd.to_datetime(max_train_end_date, format='%Y-%m-%d') - pd.DateOffset(n = round(len(range)*test_size, 0) ) 39 | test_start_date = offset_date 40 | else: 41 | test_start_date = pd.to_datetime(test_start_date, format='%Y-%m-%d') 42 | 43 | # Train set: set train set from test start date -1 to test to minimum date available 44 | if train_start_date=='': 45 | train_start_date = min_train_start_date 46 | else: 47 | train_start_date = pd.to_datetime(train_start_date, format='%Y-%m-%d') 48 | 49 | if train_end_date=='': 50 | train_end_date = test_start_date - pd.DateOffset(n = 1) 51 | else: 52 | train_end_date = pd.to_datetime(train_end_date, format='%Y-%m-%d') 53 | 54 | dict_train_test_set = {'train_start_date': train_start_date, 'train_end_date': train_end_date, 'test_start_date':test_start_date, 'test_end_date': test_end_date} 55 | return dict_train_test_set 56 | 57 | def def_train(df, y, list_id, train_start_date='', train_end_date=''): 58 | """ 59 | Define train dataset 60 | :params: dataset as dataframe, y as string, list_id as list, train_start_date as string, train_end_date as string 61 | :return: a Pandas dataframe 62 | """ 63 | date_var = Utils.find_date(df) 64 | df.loc[:, date_var] = df.loc[:, date_var].apply(lambda x: pd.to_datetime(dt.datetime.strftime(x, '%Y-%m-%d'), dayfirst=True)) 65 | 66 | if train_start_date == '': 67 | train_start_date = min(df.loc[df[y].notnull(), date_var]) 68 | elif (train_start_date != '') & (isinstance(train_start_date, str)): 69 | train_start_date = pd.to_datetime(train_start_date, dayfirst=True) 70 | else: 71 | print('Train start date is already a date') 72 | 73 | print('Train start date is', train_start_date) 74 | 75 | if train_end_date == '': 76 | train_end_date = max(df.loc[df[y].notnull(), date_var]) 77 | elif (train_end_date != '') & (isinstance(train_end_date, str)): 78 | train_end_date = pd.to_datetime(train_end_date, dayfirst=True) 79 | else: 80 | print('Train end date is already a date') 81 | 82 | print('Train end date is', train_end_date) 83 | 84 | ### Slicing by observation 85 | df_sliced = df.loc[(~df.loc[:, y].isnull()) & (df.loc[:, date_var]>=train_start_date) & (df.loc[:, date_var]<=train_end_date), ].reset_index(drop=True) 86 | print('Train shape before removing nan is', df_sliced.shape[0]) 87 | 88 | # Removing additional nan 89 | train = df_sliced[df_sliced.isnull()==False].sort_values(by=date_var).reset_index(drop=True) 90 | train_start_date = min(df_sliced.loc[:, date_var]) 91 | print('Min date AFTER removing nan is', train_start_date) 92 
| train_end_date = max(df_sliced.loc[:, date_var]) 93 | print('Max date AFTER removing nan is', train_end_date) 94 | print('Shape AFTER removing nan is', df_sliced.shape[0]) 95 | 96 | ### Slicing by feature 97 | # Features set 98 | train_features = sorted(list(set(list(train.columns)) - set(list_id + [y]))) 99 | y_plus_train_features = [y] + train_features 100 | 101 | # X_train and Y_train 102 | X_train = train.loc[:, train_features].reset_index(drop=True) 103 | Y_train = train.loc[:, y_plus_train_features].reset_index(drop=True) 104 | 105 | # Date array 106 | date_array = train.loc[:, date_var].reset_index(drop=True) 107 | 108 | # Historical data 109 | historical_data = df.loc[df[date_var]>=min(df.loc[df[y].notnull(), date_var]), [date_var, y]].reset_index(drop=True) 110 | 111 | ### Create final dict 112 | dict_train = {'X_train': X_train, 'Y_train': Y_train, 'date_array': date_array, 'y': y, 'list_id': list_id, 'train_start_date': train_start_date, 'train_end_date': train_end_date, 'historical_data': historical_data} 113 | 114 | return dict_train 115 | 116 | def def_test(df, y, list_id, test_start_date='', test_end_date=''): 117 | """ 118 | Define test dataset 119 | :params: dataset as dataframe, y as string, list_id as list, test_start_date as string, test_end_date as string 120 | :return: a Pandas dictionary 121 | """ 122 | date_var = Utils.find_date(df) 123 | df.loc[:, date_var] = df.loc[:, date_var].apply(lambda x: pd.to_datetime(dt.datetime.strftime(x, '%Y-%m-%d'), dayfirst=True)) 124 | if test_start_date == '': 125 | test_start_date = min(df.loc[df[y].isnull()==False, date_var]) + dt.timedelta(1) 126 | else: 127 | test_start_date = pd.to_datetime(test_start_date, dayfirst=True) 128 | print('Test start date is', test_start_date) 129 | 130 | if test_end_date == '': 131 | test_end_date = df.loc[(df[y].isnull()==True), date_var].max() 132 | else: 133 | test_end_date = pd.to_datetime(test_end_date, dayfirst=True) 134 | print('Test end date is', test_end_date) 135 | 136 | ### Slicing by observation 137 | df_sliced = df.loc[(df[date_var]>= test_start_date) & (df[date_var] <= test_end_date), ].reset_index(drop=True) 138 | test = df_sliced.sort_values(by=date_var) 139 | test_start_date = min(df_sliced.loc[:, date_var]) 140 | test_end_date = max(df_sliced.loc[:, date_var]) 141 | 142 | ### Slicing by feature 143 | # Features set 144 | test_features = sorted(list(set(list(test.columns)) - set(list_id + [y]))) 145 | y_plus_date = [date_var] + [y] 146 | 147 | # X_train, y_tilda 148 | X_test = test.loc[:, test_features].copy().reset_index(drop=True) 149 | y_tilda = test.loc[:, y_plus_date].copy().reset_index(drop=True) 150 | 151 | # Date array 152 | date_array = test.loc[:, date_var].copy().reset_index(drop=True) 153 | 154 | # Historical data 155 | historical_data = df.loc[:, [date_var, y]].reset_index(drop=True) 156 | 157 | dict_test = {'X_test': X_test, 'y_tilda' : y_tilda, 'date_array': date_array, 'y': y, 'list_id': list_id, 'test_start_date': test_start_date, 'test_end_date': test_end_date, 'historical_data': historical_data} 158 | 159 | return dict_test 160 | 161 | -------------------------------------------------------------------------------- /Code/Utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Utils/__init__.py -------------------------------------------------------------------------------- 
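The scoring modules shown above (train_test.py, train.py, scoring.py) are designed to be chained together. The snippet below is a minimal usage sketch based only on the signatures visible in those files; the dataframe, the column names ('timestamp', 'site_id', 'temp', 'value'), the pickle path and the choice of candidate estimators are illustrative assumptions taken from the example notebooks, not a prescribed workflow, and any sklearn-style estimator exposing fit() should be accepted by Training.train.

# Hypothetical end-to-end sketch: assumes df has a datetime column 'timestamp',
# an id column 'site_id', a numeric regressor 'temp' and a target 'value' that
# is NaN over the horizon to be forecast.
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LinearRegression

from Code.Scoring.train_test import TrainTest
from Code.Scoring.train import Training
from Code.Scoring.scoring import Scoring

df = pd.read_pickle('df_final.pkl')  # illustrative path; any panel dataframe with the columns above
y = 'value'
list_id = ['site_id', 'timestamp']   # mirrors the notebooks; these columns are dropped from the regressors

# Build the train and test dictionaries; empty date strings let the helpers
# infer the boundaries from where the target is (non-)null.
dict_train = TrainTest.def_train(df, y, list_id, train_start_date='', train_end_date='')
dict_test = TrainTest.def_test(df, y, list_id, test_start_date='', test_end_date='')

# Candidate models keyed by name (assumed convention: name -> unfitted estimator).
dict_algorithms = {'xgboost': xgb.XGBRegressor(), 'linear': LinearRegression()}

# Pick the model with the lowest MAE; fall back to 'xgboost' if no KPI can be computed.
best_model = Scoring.find_best_algorithm(y, dict_train, dict_test, dict_algorithms, out_of_sample='xgboost')

# Fit the chosen estimator on the training slice.
trained_model = Training.train(dict_train, dict_algorithms[best_model])

Note that Training.train fits on whatever columns remain in X_train after removing list_id, which is why the date column is included in list_id in this sketch: it keeps non-numeric columns out of model.fit.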
/Code/Utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Utils/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Utils/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Utils/utils.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import pandas as pd 3 | import string 4 | import numpy as np 5 | import re 6 | from functools import reduce 7 | from pandasql import sqldf 8 | 9 | # datetime functions 10 | import datetime as dt 11 | 12 | # file management functions 13 | import os 14 | import sys 15 | import opendatasets as od 16 | import pickle 17 | from pathlib import Path 18 | 19 | from sklearn.utils import column_or_1d 20 | 21 | class Utils: 22 | def camel_to_snake(name): 23 | """ 24 | Changes string from camel case to snake case 25 | :params: a string 26 | :return: a string 27 | """ 28 | list_words = re.findall('([A-Z][a-z]*)', name) 29 | 30 | if len(list_words)>1: 31 | new_name = list_words[0].lower() 32 | for w in range(1, len(list_words)): 33 | new_name = new_name + '_' + list_words[w].lower() 34 | else: 35 | new_name = name.lower() 36 | return new_name 37 | 38 | def columns_camel_to_snake(df): 39 | """ 40 | Changes dataframe columns from camel case to snake case 41 | :params: df as dataframe 42 | :return: a pandas dataframe 43 | """ 44 | list_cols = list(df.columns) 45 | for name in list_cols: 46 | new_name = Utils.camel_to_snake(name) 47 | df.rename(columns = {name: new_name}, inplace=True) 48 | return df 49 | 50 | def find_date(df): 51 | """ 52 | Finds date columns in a dataframe 53 | :params: df as dataframe 54 | :return: a string 55 | """ 56 | dates = list(df.select_dtypes(include=['datetime','datetime64[ns, UTC]']).drop_duplicates().columns) 57 | 58 | if len(dates)==1: 59 | print('find_date, date_col found:', dates) 60 | date_col = dates[0] 61 | elif len(dates)==0: 62 | dates = list(df.select_dtypes(include=['period[M]']).drop_duplicates().columns) 63 | print('find_date, date_col found:', dates) 64 | date_col = dates[0] 65 | else: 66 | date_col = dates.copy() 67 | 68 | if (len(date_col)==0): 69 | raise Exception('find_date, no date_col found') 70 | 71 | return date_col 72 | 73 | def find_match_in_list(list_to_match, match_to_find): 74 | """ 75 | Finds a match in a list given a list of possible words to match 76 | :params: list to match as a list, match_to_find as a list of words to match 77 | :return: a list 78 | """ 79 | 80 | list_to_match = list(dict.fromkeys(list_to_match)) 81 | match_list = list() 82 | for m in match_to_find: 83 | match_list.extend([el for el in list_to_match if isinstance(el, collections_abc.Iterable) and (m in el)]) 84 | 85 | match_list = list(dict.fromkeys(match_list)) 86 | return match_list 87 | 88 | def delta_format(delta: np.timedelta64) -> str: 89 | """ 90 | Identifies frequency in numpy timedelta 91 | :params: numpy timedelta 92 | :return: a string 93 | """ 94 | try: 95 | days = delta.astype("timedelta64[D]") / np.timedelta64(1, 'D') 
96 | hours = int(delta.astype("timedelta64[h]") / np.timedelta64(1, 'h') % 24) 97 | except: 98 | days = delta / np.timedelta64(1, 'D') 99 | hours = int(delta / np.timedelta64(1, 'h') % 24) 100 | 101 | if days > 0 and hours > 0: 102 | return f"{days:.0f} d, {hours:.0f} h" 103 | elif days > 0: 104 | return f"{days:.0f} d" 105 | else: 106 | return f"{hours:.0f} h" 107 | 108 | def find_freq(timedelta): 109 | """ 110 | Finds frequency in numpy timedelta 111 | :params: numpy timedelta 112 | :return: a string 113 | """ 114 | if ('d' in timedelta): 115 | return 'D' 116 | elif ('h' in timedelta) & ('d' not in timedelta): 117 | return 'H' 118 | else: 119 | print('find_freq: could not infer frequency') 120 | 121 | def find_freq_in_dataframe(df, date_var): 122 | """ 123 | Finds frequency in pandas dataframe 124 | :params: df as pandas dataframe, date_var as string 125 | :return: a string 126 | """ 127 | freq = pd.Series(df[date_var].unique()).dt.freq 128 | return freq 129 | 130 | def get_project_root(Path): 131 | """ 132 | Finds the parent folder of the parent folder 133 | :params: Path 134 | :return: Path 135 | """ 136 | return Path(__file__).parent.parent 137 | 138 | def create_folder_tree(folder_name): 139 | try: 140 | os.makedirs(os.path.join(folder_name)) 141 | except OSError: 142 | print("Creation of the directory failed or already present", folder_name) 143 | else: 144 | print("Successfully created the directory", folder_name) 145 | return 146 | 147 | def add_daily_date(df): 148 | """ 149 | Adds a date variable at daily frequency to dataframe 150 | :params: pandas dataframe 151 | :return: pandas dataframe 152 | """ 153 | 154 | date_var = Utils.find_date(df) 155 | delta = abs(np.diff(df[date_var])).mean() 156 | timedelta = Utils.delta_format(delta) 157 | freq = Utils.find_freq(timedelta) 158 | 159 | # Creating date_daily 160 | 161 | if (freq == 'H'): 162 | if isinstance(date_var,list)==False: 163 | new_var_hour_str = date_var + '_hour_str' 164 | new_var = date_var + '_daily' 165 | df.loc[:, new_var_hour_str] = df.loc[:, date_var].dt.strftime('%Y-%m-%d %H:%M:%S') 166 | df.loc[:, new_var] = pd.to_datetime(df.date_hour_str.apply(lambda x: x.split(' ')[0]), format = '%Y-%m-%d') 167 | df.drop(columns=new_var_hour_str, inplace=True) 168 | else: 169 | for d in date_var: 170 | new_var_hour_str = d + '_hour_str' 171 | new_var = d + '_daily' 172 | df.loc[:, new_var_hour_str] = df.loc[:, d].dt.strftime('%Y-%m-%d %H:%M:%S') 173 | df.loc[:, new_var] = pd.to_datetime(df.date_hour_str.apply(lambda x: x.split(' ')[0]), format = '%Y-%m-%d') 174 | df.drop(columns=new_var_hour_str, inplace=True) 175 | elif (freq == 'D'): 176 | if (isinstance(date_var,list)==False): 177 | new_var = date_var + '_daily' 178 | if (new_var not in list(df.columns)): 179 | df.rename(columns = {date_var: date_var + '_daily'}, inplace=True) 180 | else: 181 | print('add_daily_date: data are in daily format') 182 | else: 183 | for d in date_var: 184 | new_var = d + '_daily' 185 | if (new_var not in list(df.columns)): 186 | df.rename(columns = {date_var: date_var + '_daily'}, inplace=True) 187 | else: 188 | print('add_daily_date: data are in daily format') 189 | return df 190 | 191 | def find_categorical_variables(df): 192 | """ 193 | Finds categorical variables in pandas dataframe 194 | :params: pandas dataframe 195 | :return: pandas dataframe 196 | """ 197 | 198 | categorical_dtypes = ['category', 'bool'] 199 | date_dtypes = ["datetime64[ns, UTC]"] 200 | list_categorical = [] 201 | for col in list(df.columns): 202 | try: 203 | df[col] = 
df[col].apply(lambda x: int(x)) 204 | if (df[col].dtype.name in categorical_dtypes) & (df[col].dtype.name not in date_dtypes): 205 | list_categorical = list_categorical + [col] 206 | elif all(df[col].isin([0, 1])) & (df[col].dtype.name not in date_dtypes): 207 | list_categorical = list_categorical + [col] 208 | elif (df[col].dtype.name not in date_dtypes): 209 | list_categorical = list_categorical.copy() 210 | except: 211 | list_categorical = list_categorical.copy() 212 | 213 | return list_categorical 214 | 215 | def resample_data(df, id, date_var, sampling, dict_grouping): 216 | """ 217 | Resample the data to a particular frequency 218 | :params: df as pandas dataframe, id as string, date_var as string, 219 | sampling as string of frequency and dict_grouping as dictionary as {variable_to_resample: 'function_to_apply'} 220 | :return: a Pandas dataframe 221 | """ 222 | 223 | wanted_keys = list(set(dict_grouping.keys()) - set([id, date_var])) 224 | dictfilt = lambda x, y: dict([ (i,x[i]) for i in x if i in wanted_keys]) 225 | list_variables = list(dictfilt(dict_grouping, wanted_keys).keys()) 226 | 227 | # df setup for merge 228 | id_list = list(df[id].unique()) 229 | df_resampled = df.loc[df[id] == id_list[0], [date_var, id, list_variables[0]]].drop_duplicates([date_var]).resample(sampling, on=date_var).agg({list_variables[0]: dict_grouping[list_variables[0]]}).reset_index() 230 | df_resampled.loc[:, id] = id_list[0] 231 | print('resample_data: variable', list_variables[0]) 232 | for i in range(1, len(id_list)): 233 | m = df.loc[df[id] == id_list[i], [date_var, id, list_variables[0]]].drop_duplicates([date_var]).resample(sampling, on=date_var).agg({list_variables[0]: dict_grouping[list_variables[0]]}).reset_index() 234 | m.loc[:, id] = id_list[i] 235 | df_resampled = pd.merge(df_resampled, m, on=[id, date_var, list_variables[0]], how='outer', validate = '1:1') 236 | print('resample_data: variable', list_variables[0], 'completed' ) 237 | 238 | # df loop for merge 239 | for k in range(1, len(list_variables)): 240 | df_m = df.loc[df[id] == id_list[0], [date_var, id, list_variables[k]]].drop_duplicates([date_var]).resample(sampling, on=date_var).agg({list_variables[k]: dict_grouping[list_variables[k]]}).reset_index() 241 | df_m.loc[:, id] = id_list[0] 242 | print('resample_data: variable', list_variables[k]) 243 | for i in range(1, len(id_list)): 244 | m = df.loc[df[id] == id_list[i], [date_var, id, list_variables[k]]].drop_duplicates([date_var]).resample(sampling, on=date_var).agg({list_variables[k]: dict_grouping[list_variables[k]]}).reset_index() 245 | m.loc[:, id] = id_list[i] 246 | df_m = pd.merge(df_m, m, on=[id, date_var, list_variables[k]], how='outer', validate = '1:1') 247 | 248 | df_resampled = pd.merge(df_resampled, df_m, on=[id, date_var], how='outer', validate = '1:1') 249 | print('resample_data: variable', list_variables[k], 'completed' ) 250 | print(df_resampled) 251 | return df_resampled 252 | 253 | def resample_data_pandassql(df_name, id_column, date_column, freq, aggregation_per_col): 254 | """ 255 | Resample the data to a particular frequency 256 | :params: df_name as string name of a pandas dataframe, id as string, date_var as string, 257 | the sampling as string freq (e.g. 
3-m, 5-h, 1-D) and aggregation_per_col as dictionary as {variable_to_resample: 'function_to_apply'} 258 | :return: a Pandas dataframe 259 | """ 260 | # TO-DO: check for interval of original series 261 | pysqldf = lambda q: sqldf(q, globals()) 262 | 263 | num = freq.split('-')[0] 264 | window = freq.split('-')[1] 265 | 266 | 267 | for i in set(aggregation_per_col.values()): 268 | if i.upper() not in ['MAX','MIN','LAST', 'AVG', 'SUM' ]: 269 | print('''Aggregation not supported: Use one of these: 270 | 'MAX','MIN','LAST', 'AVG', 'SUM''') 271 | return 272 | 273 | if window == 'm': 274 | helper = f'''WITH helper AS( 275 | SELECT *, Substr(date({date_column}), 1,Instr(date({date_column}),'-')-1) AS year, 276 | Substr(date({date_column}), -5,Instr(date({date_column}),'-')-3) AS month, 277 | Substr(date({date_column}), -2,Instr(date({date_column}),'-')-1) AS day, 278 | Substr(time({date_column}), 1,Instr(time({date_column}),':')-1) AS hour, 279 | CAST(Substr(time({date_column}), -5,Instr(time({date_column}),':')-1)/{num} AS modu) AS mod 280 | FROM {df_name} 281 | )\n''' 282 | groupby = 'year, month, day, hour, mod, '+str(id_column) 283 | 284 | if window == 'h': 285 | helper = f'''WITH helper AS( 286 | SELECT *, Substr(date({date_column}), 1,Instr(date({date_column}),'-')-1) AS year, 287 | Substr(date({date_column}), -5,Instr(date({date_column}),'-')-3) AS month, 288 | Substr(date({date_column}), -2,Instr(date({date_column}),'-')-1) AS day, 289 | CAST(Substr(time({date_column}), 1,Instr(time({date_column}),':')-1)/{num} AS modu) as mod 290 | FROM {df_name} 291 | )\n''' 292 | groupby = 'year, month, day, mod, '+str(id_column) 293 | 294 | if window == 'D': 295 | helper = f'''WITH helper AS( 296 | SELECT*, Substr(date({date_column}), 1,Instr(date({date_column}),'-')-1) AS year, 297 | Substr(date({date_column}), -5,Instr(date({date_column}),'-')-3) AS month, 298 | CAST(Substr(date({date_column}), -2,Instr(date({date_column}),'-')-1)/{num} AS modu) as mod 299 | FROM {df_name} 300 | )\n''' 301 | groupby = 'year, month, mod, '+str(id_column) 302 | 303 | list_select = [] 304 | for i in aggregation_per_col: 305 | aggElement = aggregation_per_col[i].upper()+'('+i+')' +' AS '+i 306 | list_select.append(aggElement) 307 | string_select = ',\n'.join(list_select) 308 | 309 | agg = 'SELECT '+ date_column+ ','+string_select + '\n FROM helper\n GROUP BY '+ groupby 310 | query = helper + agg 311 | 312 | return pysqldf(query) 313 | 314 | 315 | 316 | def add_seq(df, date_var, serie, freq, end_date='', start_date=''): 317 | """ 318 | Creates a sequence of completes date/hours to a dataframe 319 | :params: dataframe in long format to add date/hour observations, date_var as string, 320 | serie or id as string or list, freq as datetime.timedelta end and start date in format "%dd/%mm/%YYYY" 321 | :return: a Pandas dataframe 322 | """ 323 | 324 | df.loc[:, date_var] = df[date_var].apply(lambda x: x.tz_localize(None)) 325 | 326 | if isinstance(serie, list)==False: 327 | seq = pd.DataFrame() 328 | serie_list = list(df.loc[:, serie].unique()) 329 | for i in serie_list: 330 | if start_date == '': 331 | start_date = min(df.loc[df[serie]==i, date_var]).tz_localize(None) 332 | else: 333 | start_date = pd.to_datetime(start_date, dayfirst=True).tz_localize(None) 334 | 335 | if end_date == '': 336 | end_date = max(df.loc[df[serie]==i, date_var]).tz_localize(None) 337 | else: 338 | end_date = pd.to_datetime(end_date, dayfirst=True).tz_localize(None) 339 | 340 | # Sequence 341 | time_range = pd.Series(pd.date_range( 342 | 
start=start_date, end=end_date, freq=freq)) 343 | 344 | print('Adding sequence to serie', i, 'as', 345 | serie_list.index(i) + 1, 'of', len(serie_list)) 346 | temp = pd.DataFrame.from_dict({serie: [i] * len(time_range), 'date': time_range}) 347 | temp.rename(columns={'date': date_var}, inplace=True) 348 | seq = pd.concat([seq, temp], axis=0, ignore_index=True) 349 | 350 | serie = [serie, date_var] 351 | else: 352 | seq = pd.DataFrame() 353 | serie_list = df.loc[:, serie].drop_duplicates().reset_index(drop=True) 354 | 355 | row_list = serie_list.shape[0] 356 | col_list = serie_list.shape[1] 357 | for i in range(0, row_list, 1): 358 | print('Adding sequence to serie', i + 1, 'of', row_list) 359 | dict = {} 360 | for c in range(0, col_list, 1): 361 | col_name = serie_list.columns[c] 362 | id_col = serie_list.loc[i,col_name] 363 | if start_date == '': 364 | start_date = min(df.loc[(df[col_name]==id_col), date_var]).tz_localize(None) 365 | else: 366 | start_date = pd.to_datetime(start_date, dayfirst=True).tz_localize(None) 367 | 368 | if end_date == '': 369 | end_date = max(df.loc[(df[col_name]==id_col), date_var]).tz_localize(None) 370 | else: 371 | end_date = pd.to_datetime(end_date, dayfirst=True).tz_localize(None) 372 | 373 | # Sequence 374 | time_range = pd.Series(pd.date_range( 375 | start=start_date, end=end_date, freq=freq)) 376 | 377 | temp_col = {col_name: [serie_list.loc[i,col_name]]* len(time_range)} 378 | dict.update(temp_col) 379 | 380 | temp = pd.DataFrame.from_dict(dict) 381 | temp.loc[:, date_var] = time_range 382 | seq = pd.concat([seq, temp], axis=0, ignore_index=True) 383 | serie.extend([date_var]) 384 | 385 | duplicates = seq.loc[:, serie].duplicated().any() 386 | if duplicates==True: 387 | raise Exception(print("add_seq: there are duplicates in sequence")) 388 | else: 389 | print("add_seq: there are NO duplicates in sequence") 390 | df_seq = pd.merge(seq, df, on=serie, how='left', validate='1:1') 391 | 392 | duplicates_in_df_seq = df_seq.loc[:, serie].duplicated().any() 393 | if duplicates_in_df_seq==True: 394 | raise Exception(print("add_seq: there are duplicates when adding sequence")) 395 | else: 396 | print("add_seq: there are NO duplicates when adding sequence") 397 | 398 | print('Total serie to forecast:', len(df_seq.loc[:, serie].drop_duplicates())) 399 | 400 | return df_seq 401 | 402 | def check_length_time_serie(df, date_var, index): 403 | """ 404 | Checks the length that a time sequence of completes date/hours should have, so that it can be compared 405 | with actual observation 406 | :params: df as pandas dataframe, date_var as string, index as list as groupby variable 407 | :return: a Pandas dataframe 408 | """ 409 | freq = pd.Series(df[date_var].unique()).dt.freq 410 | pivot = pd.pivot_table(df, index=index, values=date_var, aggfunc=['count', 'min', 'max']).reset_index() 411 | pivot.columns = pivot.columns.get_level_values(0) 412 | pivot.loc[:, 'td'] = pivot.loc[:, 'max'].max() - pivot.loc[:, 'min'].min() 413 | pivot.loc[:, 'count'] = pivot.loc[:, 'count'].astype(float) 414 | 415 | if freq=='H': 416 | pivot.loc[:, 'freq'] = 'H' 417 | pivot.loc[:, 'expected_obs'] = pivot.loc[:, 'td'].apply(lambda x: x.days*24) + pivot.loc[:, 'td'].apply(lambda x: x.seconds/3600) + 1 418 | pivot.loc[:, 'mismatch'] = 0 419 | pivot.loc[pivot['count']!=pivot['expected_obs'], 'mismatch'] = 1 420 | if sum(pivot.mismatch)>0: 421 | print('Expected length of sequence is NOT OK \n', pivot[[index, 'count', 'expected_obs']].drop_duplicates()) 422 | else: 423 | print('Expected length of 
sequence is OK \n', pivot[[index, 'count', 'expected_obs']].drop_duplicates()) 424 | 425 | elif freq=='D': 426 | pivot.loc[:, 'freq'] = 'D' 427 | pivot.loc[:, 'expected_obs'] = pivot.loc[:, 'td'].apply(lambda x: x.days) + pivot.loc[:, 'td'].apply(lambda x: x.seconds/3600*24) + 1 428 | pivot.loc[:, 'mismatch'] = 0 429 | pivot.loc[pivot['count']!=pivot['expected_obs'], 'mismatch'] = 1 430 | if sum(pivot.mismatch)>0: 431 | print('Expected length of sequence is NOT OK \n', pivot[[index, 'count', 'expected_obs']].drop_duplicates()) 432 | else: 433 | print('Expected length of sequence is OK \n', pivot[[index, 'count', 'expected_obs']].drop_duplicates()) 434 | 435 | else: 436 | pivot.loc[:, 'freq'] = np.nan 437 | pivot.loc[:, 'expected_obs'] = np.nan 438 | print('check_length_time_serie: could not infer frequency') 439 | 440 | 441 | return pivot 442 | 443 | def check_regressors_availability(df, date_var, regressors_list, forecast_end_date): 444 | """ 445 | Checks the availability of regressors based on forecast end date 446 | :params: df as pandas dataframe, date_var as string, regressors_list as list and forecast_end_date as string in format "2022-12-31" 447 | :return: None 448 | """ 449 | forecast_end_date = pd.to_datetime(forecast_end_date, dayfirst = False) 450 | 451 | for r in regressors_list: 452 | if any(df.loc[df[date_var]<=forecast_end_date, r].isnull()): 453 | print('Latest filled available date for regressor', r, 'is', df.loc[df[r].isnull()==False, date_var].max(), '\n expected is', forecast_end_date) 454 | raise Exception('Regressor', r, 'shows null values <= forecast_end_date. \n Please, fill them before going on') 455 | else: 456 | print('Regressor', r, 'has all needed values') 457 | return None 458 | 459 | def remove_regressors_with_nan(df, date_var, regressors_list, forecast_end_date): 460 | """ 461 | Remove regressors with nan based on forecast end date 462 | :params: df as pandas dataframe, date_var as string, regressors_list as list and forecast_end_date as string in format "2022-12-31" 463 | :return: pandas dataframe 464 | """ 465 | forecast_end_date = pd.to_datetime(forecast_end_date, dayfirst = False) 466 | 467 | for r in regressors_list: 468 | if any(df.loc[df[date_var]<=forecast_end_date, r].isnull()): 469 | print('Latest filled available date for regressor', r, 'is', df.loc[df[r].isnull()==False, date_var].max(), '\n expected is', forecast_end_date) 470 | print('Regressor', r, 'shows null values <= forecast_end_date. 
\n Regressor REMOVED') 471 | df.drop(columns = r, inplace=True) 472 | else: 473 | print('Regressor', r, 'has all needed values') 474 | return df 475 | 476 | def match_to_find(serie_to_find): 477 | """ 478 | Finds a match in a list of possible words to match 479 | :params: serie_to_find as a list of words to match 480 | :return: a list 481 | """ 482 | match_to_find = [] 483 | match_to_find = match_to_find + [serie_to_find] 484 | match_to_find = match_to_find + [serie_to_find.lower()] 485 | match_to_find = match_to_find + [serie_to_find.upper()] 486 | match_to_find = match_to_find + [serie_to_find.capitalize()] 487 | match_to_find = match_to_find + [re.sub('[^a-zA-Z0-9 \n\.]', '_', serie_to_find)] 488 | match_to_find = match_to_find + [re.sub('[^a-zA-Z0-9 \n\.]', '_', serie_to_find.lower())] 489 | match_to_find = match_to_find + [re.sub('[^a-zA-Z0-9 \n\.]', '_', serie_to_find.upper())] 490 | match_to_find = match_to_find + [re.sub('[^a-zA-Z0-9 \n\.]', '_', serie_to_find.capitalize())] 491 | return match_to_find 492 | 493 | def find_match(df, serie_name, match_to_find): 494 | """ 495 | Finds a match in a dataframe serie given a list of possible words to match 496 | :params: dataframe, serie_name as string, match_to_find as a list of words to match 497 | :return: a list 498 | """ 499 | 500 | list_to_match = list(df.loc[:, serie_name].unique()) 501 | match_list = list() 502 | for m in match_to_find: 503 | match_list.extend([el for el in list_to_match if isinstance(el, collections_abc.Iterable) and (m in el)]) 504 | 505 | match_list = list(dict.fromkeys(match_list)) 506 | return match_list 507 | 508 | def find_match_in_list(list_to_match, match_to_find): 509 | """ 510 | Finds a match in a list given a list of possible words to match 511 | :params: list to match as a list, match_to_find as a list of words to match 512 | :return: a list 513 | """ 514 | 515 | list_to_match = list(dict.fromkeys(list_to_match)) 516 | match_list = list() 517 | for m in match_to_find: 518 | match_list.extend([el for el in list_to_match if isinstance(el, collections_abc.Iterable) and (m in el)]) 519 | 520 | match_list = list(dict.fromkeys(match_list)) 521 | return match_list 522 | 523 | def id_outliers_IQR(df, q1, q3, date_var, id, var, freq_var): 524 | """ 525 | Identifies outliers creating a dummy variable (0/1) called outlier using IQR method, where quantile value can be set 526 | :params: df as pandas dataframe, q1 and q3 values as numeric between 0 and 1, date_var as string, id as string or list, var as string, freq_var as string 527 | :return: a dictionary 528 | """ 529 | 530 | # Keeping only positive values of var 531 | df = df.loc[df[var]>0, ].copy() 532 | 533 | if isinstance(id, list): 534 | list_id = id + [var, freq_var] 535 | else: 536 | list_id = [id, var, freq_var] 537 | 538 | # Freq var 539 | df.loc[:, freq_var] = df.loc[:, date_var].apply(lambda x: x.month) 540 | 541 | ### ID outliers 542 | grouped = df.loc[:, list_id].groupby(list_id) 543 | df_q1 = grouped.quantile(q1).reset_index() 544 | df_q1.rename(columns={var: 'q1'}, inplace=True) 545 | df_q3 = grouped.quantile(q3).reset_index() 546 | df_q3.rename(columns={var: 'q3'}, inplace=True) 547 | 548 | # Merge 549 | dfs = [df, df_q1, df_q3] 550 | df_outliers = reduce(lambda left,right: pd.merge(left,right,how='left', on=list_id, validate='m:1'), dfs) 551 | 552 | df_outliers.loc[:, 'IQR'] = df_outliers.q3 - df_outliers.q1 553 | df_outliers.loc[:, 'outlier'] = 0 554 | df_outliers.loc[((df_outliers[var]<(df_outliers.q1-1.5*df_outliers.IQR)) | (df_outliers[var]>(df_outliers.q3+1.5*df_outliers.IQR))), 'outlier']= 1 555 | var_cleaned = var + '_cleaned' 556 | df_outliers.loc[:, var_cleaned] = df_outliers.loc[:, var] 557 | df_outliers.loc[df_outliers.outlier==1, var_cleaned] 
= np.nan 558 | 559 | # Summarizing outliers in a pivot table 560 | pivot_sum = pd.pivot_table(df_outliers, values='outlier', index=list_id, aggfunc=sum).reset_index() 561 | pivot_len = pd.pivot_table(df_outliers, values='outlier', index=list_id, aggfunc=len).reset_index() 562 | pivot_len.rename(columns={'outlier': 'obs'}, inplace=True) 563 | pivot = pd.merge(pivot_sum, pivot_len, on=list_id, how='inner', validate='1:1') 564 | pivot.loc[:, 'outliers_perc'] = round(pivot.outlier / pivot.obs,2) 565 | 566 | dict_outliers = {'df_outliers': df_outliers, 'pivot_outliers': pivot} 567 | return dict_outliers 568 | 569 | 570 | class AlphabeticalCombinations: 571 | def write_neat_csv(saving_file, df_fcst): 572 | """ 573 | Writes neat csv 574 | :params: saving_file as string, df_fcst as dataframe to write 575 | :return: None 576 | """ 577 | df_fcst.to_csv(saving_file, sep=';', date_format="%Y-%m-%d %H:%M:%S", header=True, index=False, compression='infer', quoting=None, quotechar='"', doublequote=False, decimal='.') 578 | 579 | return(print('*** write_neat_csv: completed', saving_file)) 580 | 581 | def convert(string): 582 | """ 583 | Convert string to list 584 | :params: string 585 | :return: a list 586 | """ 587 | list1=[] 588 | list1[:0]=string 589 | return list1 590 | 591 | def excel_columns(): 592 | """ 593 | Counts excel columns 594 | :params: none 595 | :return: a list 596 | """ 597 | alphabet_string = string.ascii_uppercase 598 | li = AlphabeticalCombinations.convert(alphabet_string) 599 | excel_columns = [letter for letter in alphabet_string] 600 | for L in li: 601 | aces = [L + li for li in li] 602 | excel_columns.extend(aces) 603 | 604 | return excel_columns 605 | 606 | def write_beautiful_excel(saving_file, dict_df_to_write): 607 | """ 608 | Writes beautiful excel 609 | :params: saving_file as string, dict_df_to_write as dictionary with dict key as sheet name and dict value as data 610 | :return: None 611 | """ 612 | ### Writing to Excel 613 | writer = pd.ExcelWriter(saving_file, engine='xlsxwriter', datetime_format='dd/mm/yyyy hh:mm:ss', date_format='dd/mm/yyyy') 614 | 615 | # FCST 616 | for d in list(dict_df_to_write.keys()): 617 | df = dict_df_to_write[d] 618 | df.to_excel(writer, sheet_name=d, index=False) 619 | 620 | # Make handles for workbook/sheet 621 | workbook = writer.book 622 | worksheet = writer.sheets[d] 623 | 624 | # Create positive/negative cell format 625 | format_simone = workbook.add_format({'num_format': '#,##0;- #,##0'}) 626 | format_percentage = workbook.add_format({'num_format': '0.00%'}) 627 | 628 | # Identify percentage columns 629 | cols_percentage = [] 630 | for c in list(df.columns): 631 | try: 632 | if any(df[c]>=1) and any(df[c]>=0) and any(df[c].between(0, 1, inclusive=False)): 633 | cols_percentage.extend([c]) 634 | except: 635 | pass 636 | 637 | # Define the worksheet range to apply number format 638 | cols = AlphabeticalCombinations.excel_columns() 639 | row = len(df) 640 | format_range = '{}{}:{}{}'.format(cols[0], row, cols[len(df.columns)-1], row) 641 | 642 | # Apply number formats to specified range 643 | worksheet.set_column(format_range, None, format_simone) 644 | 645 | if len(cols_percentage)>0: 646 | for f in cols_percentage: 647 | n = list(df.columns).index(f) 648 | row = len(df) 649 | format_range = '{}{}:{}{}'.format(cols[n], row, cols[n], row) 650 | worksheet.set_column(format_range, None, format_percentage) 651 | 652 | #Iterate through each column and set the width == the max length in that column. A padding length of 2 is also added. 
653 | for i, col in enumerate(df.columns): 654 | # find length of column i 655 | column_len = df[col].astype(str).str.len().max() 656 | # Setting the length if the column header is larger 657 | # than the max column value length 658 | column_len = max(column_len, len(col)) + 4 659 | # set the column length 660 | worksheet.set_column(i, i, column_len) 661 | 662 | ## Close the Pandas Excel writer and output the Excel file 663 | writer.save() 664 | return(print('*** write_beatiful_excel: completed', saving_file)) 665 | 666 | def write_beautiful_excel_table(saving_file, dict_df_to_write): 667 | """ 668 | Writes beautiful excel tables 669 | :params: saving_file as string, dict_df_to_write as dictionary with dict key as sheet name and dict value as data 670 | :return: None 671 | """ 672 | ### Writing to Excel 673 | writer = pd.ExcelWriter(saving_file, engine='xlsxwriter', datetime_format='dd/mm/yyyy hh:mm:ss', date_format='dd/mm/yyyy') 674 | 675 | # FCST 676 | for d in list(dict_df_to_write.keys()): 677 | df = dict_df_to_write[d] 678 | df.to_excel(writer, sheet_name=d, index=False) 679 | 680 | # Make handles for workbook/sheet 681 | workbook = writer.book 682 | worksheet = writer.sheets[d] 683 | 684 | # Create positive/negative cell format 685 | format_simone = workbook.add_format({'num_format': '#,##0;- #,##0'}) 686 | format_percentage = workbook.add_format({'num_format': '0.00%'}) 687 | 688 | # Identify percentage columns 689 | cols_percentage = [] 690 | for c in list(df.columns): 691 | try: 692 | if any(df[c]>=1) and any(df[c]>=0) and any(df[c].between(0, 1, inclusive=False)): 693 | cols_percentage.extend([c]) 694 | except: 695 | pass 696 | 697 | # Define the worksheet range to apply number format 698 | cols = AlphabeticalCombinations.excel_columns() 699 | row = len(df) 700 | format_range = '{}{}:{}{}'.format(cols[0], row, cols[len(df.columns)-1], row) 701 | 702 | # Apply number formats to specified range 703 | worksheet.set_column(format_range, None, format_simone) 704 | 705 | if len(cols_percentage)>0: 706 | for f in cols_percentage: 707 | n = list(df.columns).index(f) 708 | row = len(df) 709 | format_range = '{}{}:{}{}'.format(cols[n], row, cols[n], row) 710 | worksheet.set_column(format_range, None, format_percentage) 711 | 712 | #Iterate through each column and set the width == the max length in that column. A padding length of 2 is also added. 713 | for i, col in enumerate(df.columns): 714 | # find length of column i 715 | column_len = df[col].astype(str).str.len().max() 716 | # Setting the length if the column header is larger 717 | # than the max column value length 718 | column_len = max(column_len, len(col)) + 4 719 | # set the column length 720 | worksheet.set_column(i, i, column_len) 721 | 722 | # Create a list of column headers, to use in add_table(). 723 | column_settings = [] 724 | for header in df.columns: 725 | column_settings.append({'header': header}) 726 | 727 | # Add the table. 
728 | worksheet.add_table(0, 0, df.shape[0], df.shape[1] - 1, {'columns': column_settings}) 729 | 730 | ## Close the Pandas Excel writer and output the Excel file 731 | writer.save() 732 | return(print('*** write_beatiful_excel: completed', saving_file)) 733 | 734 | -------------------------------------------------------------------------------- /Code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/__init__.py -------------------------------------------------------------------------------- /Code/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Configuration/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Configuration/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Configuration/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Configuration/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Configuration/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /Configuration/config.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from box import Box 3 | import sys 4 | import os 5 | 6 | from pathlib import Path 7 | 8 | def get_project_root() -> Path: 9 | return Path(__file__).parent.parent 10 | 11 | root = get_project_root() 12 | with open(os.path.join(root, "Configuration/config.yaml"), "r") as ymlfile: 13 | cfg_path = Box(yaml.safe_load(ymlfile)) 14 | 15 | -------------------------------------------------------------------------------- /Configuration/config.yaml: -------------------------------------------------------------------------------- 1 | data_dir: 2 | input_path: "Data/Input" 3 | output_path: "Data/Output" 4 | plot_path: "Data/Plots" 5 | -------------------------------------------------------------------------------- /Dashboards/EnergyDashboard.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Dashboards/EnergyDashboard.pbix -------------------------------------------------------------------------------- /Docs/Images/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/banner.jpg -------------------------------------------------------------------------------- /Docs/Images/calendar.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/calendar.png -------------------------------------------------------------------------------- /Docs/Images/elbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/elbow.png -------------------------------------------------------------------------------- /Docs/Images/intermittent_TS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/intermittent_TS.png -------------------------------------------------------------------------------- /Docs/Images/panel_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/panel_data.png -------------------------------------------------------------------------------- /Docs/Images/sliding_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/sliding_plot.png -------------------------------------------------------------------------------- /Docs/Images/thermal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/thermal.png -------------------------------------------------------------------------------- /Docs/Slides/ds_toolkit_forecasting_2.0_memo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Slides/ds_toolkit_forecasting_2.0_memo.pdf -------------------------------------------------------------------------------- /Environment/forecasting_energy.yml: -------------------------------------------------------------------------------- 1 | name: forecasting_energy 2 | channels: 3 | - anaconda 4 | - defaults 5 | dependencies: 6 | - ca-certificates=2020.10.14=0 7 | - certifi=2020.6.20=py37_0 8 | - openssl=1.1.1h=he774522_0 9 | - pip=20.2.4=py37_0 10 | - python=3.7.7=h81c818b_4 11 | - setuptools=50.3.0=py37h9490d1a_1 12 | - sqlite=3.33.0=h2a8f88b_0 13 | - vc=14.1=h0510ff6_4 14 | - vs2015_runtime=14.16.27012=hf0eaf9b_3 15 | - wheel=0.35.1=py_0 16 | - wincertstore=0.2=py37_0 17 | - zlib=1.2.11=vc14h1cdd9ab_1 18 | - pip: 19 | - absl-py==0.11.0 20 | - adal==1.2.5 21 | - adjustText==0.7.3 22 | - altair==4.1.0 23 | - antlr4-python3-runtime==4.8 24 | - applicationinsights==0.11.9 25 | - argcomplete==1.12.3 26 | - argon2-cffi==21.1.0 27 | - argparse==1.4.0 28 | - astor==0.8.1 29 | - astunparse==1.6.3 30 | - async-generator==1.10 31 | - attrs==21.2.0 32 | - autopep8==1.5.7 33 | - azure-cognitiveservices-vision-customvision==3.0.0 34 | - azure-common==1.1.26 35 | - azure-core==1.23.0 36 | - azure-graphrbac==0.61.1 37 | - azure-identity==1.4.1 38 | - azure-keyvault-secrets==4.4.0 39 | - azure-mgmt-authorization==0.61.0 40 | - azure-mgmt-containerregistry==2.8.0 41 | - azure-mgmt-keyvault==2.2.0 42 | - azure-mgmt-resource==10.3.0 43 | - 
azure-mgmt-storage==11.2.0 44 | - azure-storage-blob==12.10.0 45 | - azureml-automl-core==1.18.0.post1 46 | - azureml-core==1.17.0 47 | - azureml-dataprep==2.4.4 48 | - azureml-dataprep-native==24.0.0 49 | - azureml-dataprep-rslex==1.2.3 50 | - azureml-dataset-runtime==1.18.0 51 | - azureml-defaults==1.18.0 52 | - azureml-model-management-sdk==1.0.1b6.post1 53 | - azureml-pipeline==1.18.0 54 | - azureml-pipeline-core==1.18.0 55 | - azureml-pipeline-steps==1.18.0 56 | - azureml-sdk==1.18.0 57 | - azureml-telemetry==1.18.0 58 | - azureml-train==1.18.0 59 | - azureml-train-automl-client==1.18.0 60 | - azureml-train-core==1.18.0.post1 61 | - azureml-train-restclients-hyperdrive==1.18.0 62 | - backcall==0.2.0 63 | - backports-tempfile==1.0 64 | - backports-weakref==1.0.post1 65 | - backports-zoneinfo==0.2.1 66 | - base58==2.1.1 67 | - bleach==4.1.0 68 | - blinker==1.4 69 | - cached-property==1.5.2 70 | - cachetools==4.1.1 71 | - cffi==1.14.3 72 | - charset-normalizer==2.0.6 73 | - click==7.1.2 74 | - cloudpickle==1.6.0 75 | - cmdstanpy==0.9.5 76 | - colorama==0.4.4 77 | - configparser==3.7.4 78 | - contextlib2==0.6.0.post1 79 | - convertdate==2.3.2 80 | - cryptography==3.2.1 81 | - cycler==0.10.0 82 | - cython==0.29.26 83 | - databricks-cli==0.16.2 84 | - databricks-connect==7.3.30 85 | - dateinfer==0.2.0 86 | - debugpy==1.5.0 87 | - decorator==5.1.0 88 | - defusedxml==0.7.1 89 | - dill==0.3.3 90 | - distro==1.5.0 91 | - docker==4.3.1 92 | - dotnetcore2==2.1.19 93 | - entrypoints==0.3 94 | - ephem==4.1.3 95 | - et-xmlfile==1.1.0 96 | - flask==1.0.3 97 | - fusepy==3.0.1 98 | - gast==0.3.3 99 | - gitdb==4.0.9 100 | - gitpython==3.1.24 101 | - google-auth==1.23.0 102 | - google-auth-oauthlib==0.4.2 103 | - google-pasta==0.2.0 104 | - grpcio==1.33.2 105 | - gunicorn==19.9.0 106 | - h11==0.12.0 107 | - h5py==3.1.0 108 | - hijri-converter==2.2.2 109 | - holidays==0.11.3.1 110 | - idna==3.2 111 | - importlib-metadata==2.0.0 112 | - imutils==0.5.3 113 | - ipykernel==6.4.1 114 | - ipython==7.28.0 115 | - ipython-genutils==0.2.0 116 | - ipywidgets==7.6.5 117 | - isodate==0.6.0 118 | - itsdangerous==1.1.0 119 | - jedi==0.18.0 120 | - jeepney==0.6.0 121 | - jinja2==2.11.2 122 | - jmespath==0.10.0 123 | - joblib==0.17.0 124 | - json-logging-py==0.2 125 | - json5==0.8.5 126 | - jsonpickle==1.4.1 127 | - jsonschema==4.0.1 128 | - jupyter-client==7.0.6 129 | - jupyter-core==4.8.1 130 | - jupyterlab-pygments==0.1.2 131 | - jupyterlab-widgets==1.0.2 132 | - kaggle==1.5.12 133 | - keras-applications==1.0.8 134 | - keras-preprocessing==1.1.0 135 | - kiwisolver==1.3.2 136 | - kneed==0.7.0 137 | - korean-lunar-calendar==0.2.1 138 | - liac-arff==2.5.0 139 | - lunarcalendar==0.0.9 140 | - markdown==3.3.3 141 | - markupsafe==1.1.1 142 | - matplotlib==3.4.3 143 | - matplotlib-inline==0.1.3 144 | - mistune==0.8.4 145 | - msal==1.6.0 146 | - msal-extensions==0.2.2 147 | - msrest==0.6.21 148 | - msrestazure==0.6.2 149 | - nbclient==0.5.4 150 | - nbconvert==6.2.0 151 | - nbformat==5.1.3 152 | - nbimporter==0.3.4 153 | - ndg-httpsclient==0.5.1 154 | - nest-asyncio==1.5.1 155 | - notebook==6.4.5 156 | - numpy==1.19.0 157 | - oauthlib==3.1.0 158 | - omegaconf==2.1.2 159 | - opencv-python==4.3.0.36 160 | - opencv-python-headless==4.3.0.36 161 | - opendatasets==0.1.20 162 | - openpyxl==3.0.9 163 | - opt-einsum==3.3.0 164 | - outcome==1.1.0 165 | - packaging==21.2 166 | - pandas==1.3.5 167 | - pandocfilters==1.5.0 168 | - parso==0.8.2 169 | - pathspec==0.8.1 170 | - pep8==1.7.1 171 | - pickleshare==0.7.5 172 | - pillow==8.3.2 173 
| - plotly==5.3.1 174 | - portalocker==1.7.1 175 | - prometheus-client==0.12.0 176 | - prompt-toolkit==3.0.20 177 | - protobuf==3.14.0 178 | - py4j==0.10.9 179 | - pyarrow==1.0.1 180 | - pyasn1==0.4.8 181 | - pyasn1-modules==0.2.8 182 | - pycodestyle==2.7.0 183 | - pycparser==2.20 184 | - pydeck==0.7.1 185 | - pygments==2.10.0 186 | - pyjwt==1.7.1 187 | - pymeeus==0.5.11 188 | - pyodbc==4.0.32 189 | - pyopenssl==19.1.0 190 | - pyparsing==2.4.7 191 | - pyrsistent==0.18.0 192 | - pystan==2.19.1.1 193 | - python-box==5.4.1 194 | - python-dateutil==2.8.1 195 | - python-slugify==6.1.1 196 | - pytz==2020.4 197 | - pytz-deprecation-shim==0.1.0.post0 198 | - pywin32==227 199 | - pywinpty==1.1.5 200 | - pyyaml==6.0 201 | - pyzmq==22.3.0 202 | - repackage==0.7.3 203 | - requests==2.26.0 204 | - requests-oauthlib==1.3.0 205 | - rsa==4.6 206 | - ruamel-yaml==0.16.12 207 | - ruamel-yaml-clib==0.2.2 208 | - scikit-learn==0.22.2.post1 209 | - scipy==1.4.1 210 | - seaborn==0.11.2 211 | - secretstorage==3.2.0 212 | - selenium==4.0.0 213 | - send2trash==1.8.0 214 | - setuptools-git==1.2 215 | - shapely==1.7.0 216 | - six==1.15.0 217 | - sklearn==0.0 218 | - smmap==5.0.0 219 | - sniffio==1.2.0 220 | - sortedcontainers==2.4.0 221 | - streamlit==1.1.0 222 | - tabulate==0.8.9 223 | - tenacity==8.0.1 224 | - tensorboard==2.2.2 225 | - tensorboard-plugin-wit==1.7.0 226 | - tensorflow==2.2.0 227 | - tensorflow-estimator==2.2.0 228 | - tensorflow-gpu==2.2.0 229 | - tensorflow-gpu-estimator==2.2.0 230 | - termcolor==1.1.0 231 | - terminado==0.12.1 232 | - testpath==0.5.0 233 | - text-unidecode==1.3 234 | - toml==0.10.2 235 | - toolz==0.11.1 236 | - tornado==6.1 237 | - tqdm==4.62.3 238 | - traitlets==5.1.0 239 | - trio==0.19.0 240 | - trio-websocket==0.9.2 241 | - typing-extensions==4.1.1 242 | - tzdata==2021.5 243 | - tzlocal==4.1 244 | - ujson==5.1.0 245 | - urllib3==1.26.7 246 | - validators==0.18.2 247 | - watchdog==2.1.6 248 | - wcwidth==0.2.5 249 | - webencodings==0.5.1 250 | - websocket-client==0.57.0 251 | - werkzeug==1.0.1 252 | - widgetsnbextension==3.5.2 253 | - wrapt==1.13.1 254 | - wsproto==1.0.0 255 | - xgboost==1.4.2 256 | - xlrd==2.0.1 257 | - xlsxwriter==3.0.1 258 | - zipp==3.4.0 259 | prefix: C:\Users\mabellani\.conda\envs\forecasting_energy 260 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /Notebooks/EnergyClusteringRegular.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "dc3e4402", 6 | "metadata": {}, 7 | "source": [ 8 | "# Implementation" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "sWbXCGozBRNW", 14 | "metadata": { 15 | "id": "sWbXCGozBRNW" 16 | }, 17 | "source": [ 18 | "## Packages" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "kmxpysFu7zjH", 25 | "metadata": { 26 | "colab": { 27 | "base_uri": "https://localhost:8080/" 28 | }, 29 | "id": "kmxpysFu7zjH", 30 | "outputId": "db2717d5-22be-4fa8-99fb-3f9ea90e7e1b" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# data elaboration functions\n", 35 | "import pandas as pd\n", 36 | "from six.moves import collections_abc\n", 37 | "import string\n", 38 | "import numpy as np\n", 39 | "\n", 40 | "# datetime functions\n", 41 | "import datetime as dt\n", 42 | "\n", 43 | "# file management functions\n", 44 | "import os\n", 45 | "import sys\n", 46 | "import opendatasets as od\n", 47 | "import pickle\n", 48 | "from pathlib import Path\n", 49 | "\n", 50 | "# plot functions\n", 51 | "import matplotlib.pyplot as plt\n", 52 | "%matplotlib inline\n", 53 | "\n", 54 | "# data science functions\n", 55 | "import matplotlib.pyplot as plt\n", 56 | "from kneed import KneeLocator\n", 57 | "from sklearn.datasets import make_blobs\n", 58 | "from sklearn.cluster import KMeans\n", 59 | "from sklearn.metrics import silhouette_score\n", 60 | "from sklearn.preprocessing import StandardScaler, scale\n", 61 | "from sklearn.metrics import mean_absolute_error\n", 62 | "import joblib\n", 63 | "from sklearn.linear_model import LinearRegression\n", 64 | "from sklearn.ensemble import RandomForestRegressor\n", 65 | "from sklearn.model_selection import train_test_split\n", 66 | "import xgboost as xgb\n", 67 | "\n", 68 | "# statistical functions\n", 69 | "from scipy.stats.mstats import winsorize\n", 70 | "\n", 71 | "# configuration file\n", 72 | "module_path = os.path.abspath(os.path.join('..'))\n", 73 | "if module_path not in sys.path:\n", 74 | " sys.path.append(module_path)\n", 75 | "\n", 76 | "# custom functions\n", 77 | "from Code.Profiling.Intermittent.intermittent import Intermittent\n", 78 | "from Code.Utils.utils import Utils\n", 79 | "from Code.Scoring.kpi import Kpi\n", 80 | "from Code.Scoring.forecast import Forecasting\n", 81 | "from Code.Scoring.train import Training\n", 82 | "from Code.Scoring.train_test import TrainTest\n", 83 | "from Code.Scoring.scoring import Scoring\n", 84 | "from Code.Regressors.regressors import Regressors\n", 85 | "from Code.Plotting.plots import Plots\n", 86 | "from Configuration.config import cfg_path" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "8dc26b7b", 92 | "metadata": {}, 93 | "source": [ 94 | "## Setup" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "458162d0", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# od.download(\"https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download\")\n", 105 | 
"root = Path(os.getcwd()).parent\n", 106 | "dataset_path = os.path.join(root, cfg_path.data_dir.input_path)\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "86bb0e13", 112 | "metadata": {}, 113 | "source": [ 114 | "## Load Data" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "09358d6d", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "dict_profiling = pd.read_pickle(os.path.join(root, cfg_path.data_dir.output_path, 'dict_profiling.pkl'))\n", 125 | "df_final = pd.read_pickle(os.path.join(\n", 126 | " root, cfg_path.data_dir.output_path, 'df_final.pkl'))\n", 127 | "df_final.head()\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "f23ed7fb", 133 | "metadata": {}, 134 | "source": [ 135 | "## Parameter setup" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "id = 'site_id'\n", 145 | "list_unique_id = ['site_id', 'timestamp']\n", 146 | "list_temp = ['temp']\n", 147 | "y = 'value'\n", 148 | "date_var = Utils.find_date(df_final)\n", 149 | "\n", 150 | "# Winsorizing parameters\n", 151 | "highest = 0.05\n", 152 | "lowest = 0.05" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "# Clustering regular time series" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# Define regular ids list\n", 169 | "list_id_clustering = list(dict_profiling['regular'])\n", 170 | "mask = df_final[id].isin(list(dict_profiling['regular']))\n", 171 | "df = df_final.loc[mask, [date_var, id, y]]\n", 172 | "\n", 173 | "# Set seed\n", 174 | "sample_seed_kmeans = 789\n", 175 | "# Standardizing data\n", 176 | "df_win_sum = df.loc[:, [id, y]].groupby(id).apply(\n", 177 | " lambda x: np.sum(winsorize(x, (highest, lowest)))).reset_index()\n", 178 | "df_win_sum.columns = [id, \"sum_\" + y]\n", 179 | "\n", 180 | "# Checking if some ids have 0 values after winsorizing\n", 181 | "if len(set(list_id_clustering) - set(list(df_win_sum[id].unique()))) > 0:\n", 182 | " list_id_clustering = list(set(list_id_clustering) - set(list(df_win_sum[id].unique())))\n", 183 | " print(id, list_id_clustering, \"has/have 0\", y, \"after winsorizing\")\n", 184 | " mask = (df[y]!=np.nan) & (~df[id].isin(list_id_clustering))\n", 185 | " df_std = df.loc[mask, ].pivot(index=date_var, columns=id, values=y).reset_index()\n", 186 | " charvec = df_std[date_var].dt.strftime('%Y-%m-%d')\n", 187 | " df_std.set_index(date_var, inplace=True)\n", 188 | "else:\n", 189 | " mask = (df[y]!=np.nan)\n", 190 | " df_std = df.loc[mask, ].pivot(index=date_var, columns=id, values=y).reset_index()\n", 191 | " charvec = df_std[date_var].dt.strftime('%Y-%m-%d')\n", 192 | " df_std.set_index(date_var, inplace=True)\n", 193 | " print(\"NO\", id, \"has/have 0\", y, \"after winsorizing\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Defining a set of ids to cluster with NO nan" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### In order to perform cluster analysis, one need to have a matrix with no nan value and set the index of the dataframe with date_var" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | 
"df_std_no_nan = df_std.dropna()\n", 217 | "if len(df_std_no_nan)==0:\n", 218 | " list_id_cluster = [16, 21,22,25,26, 27, 29, 33, 40, 49]\n", 219 | " df_cluster = df_std.loc[:, list_id_cluster].dropna()\n", 220 | "else:\n", 221 | " list_id_cluster = list(set(list(df_std.columns)) - set(list(date_var)))\n", 222 | " df_cluster = df_std.loc[:, list_id_cluster].dropna()\n", 223 | "print('Clustering regular profiles on ids', list_id_cluster)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Set the number of cluster you want to try" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Total sum of squares\n", 240 | "tot_ss = pd.DataFrame(df_cluster.apply(scale, axis=1)**2).sum(axis=0, skipna=True)\n", 241 | "\n", 242 | "# Setting up charvec\n", 243 | "start_date = min(df_cluster.index)\n", 244 | "end_date = max(df_cluster.index)\n", 245 | "\n", 246 | "# Define the number of clusters\n", 247 | "try_clusters = 11\n", 248 | "\n", 249 | "# K-means setup\n", 250 | "kmeans_kwargs = { \n", 251 | " \"init\": \"random\",\n", 252 | " \"n_init\": 10,\n", 253 | " \"max_iter\": 300,\n", 254 | " \"random_state\": 42,\n", 255 | "}" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### Choosing the Appropriate Number of Clusters\n", 263 | "In this section, you’ll look at two methods that are commonly used to evaluate the appropriate number of clusters:\n", 264 | "\n", 265 | "- The elbow method\n", 266 | "- The silhouette coefficient\n", 267 | "\n", 268 | "These are often used as complementary evaluation techniques" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "#### The elbow method" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "#X = np.array(df_cluster.transpose())\n", 285 | "X = np.array(df_cluster)\n", 286 | "\n", 287 | "# A list holds the SSE values for each k\n", 288 | "\n", 289 | "sse = []\n", 290 | "for k in range(1, try_clusters):\n", 291 | " kmeans = KMeans(n_clusters = k, **kmeans_kwargs)\n", 292 | " kmeans.fit(X)\n", 293 | " sse.append(kmeans.inertia_)\n", 294 | "\n", 295 | "plt.style.use(\"fivethirtyeight\")\n", 296 | "plt.plot(range(1, try_clusters), sse)\n", 297 | "plt.xticks(range(1, try_clusters))\n", 298 | "plt.xlabel(\"Number of Clusters\")\n", 299 | "plt.ylabel(\"SSE\")\n", 300 | "plt.show()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "kl = KneeLocator(range(1, 11), sse, curve=\"convex\", direction=\"decreasing\")\n", 310 | "print(\"Elbow method: optimal number of clusters is\", kl.elbow)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "#### The silhouette coefficient\n", 318 | "The silhouette coefficient is a measure of cluster cohesion and separation. It quantifies how well a data point fits into its assigned cluster based on two factors:\n", 319 | "\n", 320 | "- How close the data point is to other points in the cluster\n", 321 | "- How far away the data point is from points in other clusters\n", 322 | "\n", 323 | "Silhouette coefficient values range between -1 and 1. 
Larger numbers indicate that samples are closer to their clusters than they are to other clusters." 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# A list holds the silhouette coefficients for each k\n", 333 | "silhouette_coefficients = []\n", 334 | "\n", 335 | "# Notice you start at 2 clusters for silhouette coefficient\n", 336 | "for k in range(2, try_clusters):\n", 337 | " kmeans = KMeans(n_clusters=k, **kmeans_kwargs)\n", 338 | " kmeans.fit(X)\n", 339 | " score = silhouette_score(X, kmeans.labels_)\n", 340 | " silhouette_coefficients.append(score)\n", 341 | " \n", 342 | "pd.DataFrame(silhouette_coefficients)\n", 343 | " \n", 344 | "plt.style.use(\"fivethirtyeight\")\n", 345 | "plt.plot(range(2, try_clusters), silhouette_coefficients)\n", 346 | "plt.xticks(range(2, try_clusters))\n", 347 | "plt.xlabel(\"Number of Clusters\")\n", 348 | "plt.ylabel(\"Silhouette Coefficient\")\n", 349 | "plt.show()\n", 350 | "\n", 351 | "df_sil_coeff = pd.DataFrame(silhouette_coefficients).reset_index()\n", 352 | "optimal_silhouette_coefficients = df_sil_coeff.loc[df_sil_coeff[0]==max(silhouette_coefficients), 'index'][0] + 2\n", 353 | "print(\"Silhouette coefficients: optimal number of clusters is\", optimal_silhouette_coefficients)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Clustering using the optimal number of clusters chosen" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "chosen_clusters = 4" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "kmeans = KMeans(n_clusters=chosen_clusters, **kmeans_kwargs)\n", 379 | "identified_clusters = kmeans.fit_predict(X)\n", 380 | "\n", 381 | "df_cluster.loc[:, 'cluster'] = identified_clusters \n", 382 | "\n", 383 | "# Updating profiling dictionary\n", 384 | "dict_profiling['regular']['cluster'] = {}\n", 385 | "for c in range(0, len(dict_profiling['regular'])):\n", 386 | " dict_profiling['cluster'] = {dict_profiling['regular'][c]: df_cluster.loc[df_cluster.index==dict_profiling['regular'][c], 'cluster'].unique()[0]}\n", 387 | " print(id, c, dict_profiling[c])" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "### Plotting clustered regular series" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "df_to_plot = pd.melt(df_cluster.reset_index(), id_vars=[date_var, 'cluster'])\n", 404 | "for cluster in list(df_cluster['cluster'].unique()):\n", 405 | " count = 1\n", 406 | " for i in list(df_to_plot[id].unique()):\n", 407 | " print('Plotting id:', i, 'as', count, 'of',\n", 408 | " len(list(df_to_plot[id].unique())))\n", 409 | " chart_title = id + ' ' + str(i) + \" - Profile regular cluster \" + str(cluster)\n", 410 | " plot = Plots.sliding_line_plot(df_to_plot, y, id, i, chart_title)\n", 411 | " plot.write_html(os.path.join(root, cfg_path.data_dir.plot_path,\n", 412 | " id + '_' + str(i) + '_profile_regular_cluster_' + str(cluster) + \".html\"))\n", 413 | " count = count + 1\n" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "# Saving" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | 
"execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# create a binary pickle file \n", 430 | "f = open(os.path.join(root, cfg_path.data_dir.output_path, 'dict_profiling.pkl'),\"wb\")\n", 431 | "# write the python object (dict) to pickle file\n", 432 | "pickle.dump(dict_profiling,f)\n", 433 | "# close file\n", 434 | "f.close()\n" 435 | ] 436 | } 437 | ], 438 | "metadata": { 439 | "interpreter": { 440 | "hash": "bde6963c5f9d136d1b0963ec6638d0588f83e0d56652a4cd4ef0ca62bda372aa" 441 | }, 442 | "kernelspec": { 443 | "display_name": "Python 3.7.7 ('forecasting_energy')", 444 | "language": "python", 445 | "name": "python3" 446 | }, 447 | "language_info": { 448 | "codemirror_mode": { 449 | "name": "ipython", 450 | "version": 3 451 | }, 452 | "file_extension": ".py", 453 | "mimetype": "text/x-python", 454 | "name": "python", 455 | "nbconvert_exporter": "python", 456 | "pygments_lexer": "ipython3", 457 | "version": "3.7.7" 458 | }, 459 | "orig_nbformat": 4 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 2 463 | } 464 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![banner](Docs/Images/banner.jpg) 2 | 3 | # Forecasting 2.0 Accelerator 4 | [Forecasting 2.0 accelerator presentation](Docs/Slides/ds_toolkit_forecasting_2.0_memo.pdf) 5 | 6 | - [Forecasting 2.0 Accelerator](#forecasting-20-accelerator) 7 | - [Overview](#overview) 8 | - [I am a data scientist new to demand forecasting. How can this accelerator help me? What should I do to use it?](#i-am-a-data-scientist-new-to-demand-forecasting-how-can-this-accelerator-help-me-what-should-i-do-to-use-it) 9 | - [What do I need in terms of time series data to use this accelerator?](#what-do-i-need-in-terms-of-time-series-data-to-use-this-accelerator) 10 | - [Why this accelerator might be useful for you](#why-this-accelerator-might-be-useful-for-you) 11 | - [How to use this accelerator as guideline](#how-to-use-this-accelerator-as-guideline) 12 | - [Notebooks](#notebooks) 13 | - [1. EnergyDataExploration](#1-energydataexploration) 14 | - [2. EnergyPredictionDataPreparation](#2-energypredictiondatapreparation) 15 | - [3. EnergyProfilingIntermittent](#3-energyprofilingintermittent) 16 | - [4. EnergyClusteringRegular](#4-energyclusteringregular) 17 | - [5. 
EnergyPredictionScoring](#5-energypredictionscoring) 18 | - [How should I validate a model?](#how-should-i-validate-a-model) 19 | - [Interpreting errors](#interpreting-errors) 20 | - [Profiling (clustering) Time Series:](#profiling-clustering-time-series) 21 | - [Identifying intermittent time series:](#identifying-intermittent-time-series) 22 | - [How to identify intermittent time series:](#how-to-identify-intermittent-time-series) 23 | - [Intermittent indicators parameters](#intermittent-indicators-parameters) 24 | - [What if I am working with data that are not related to energy consumption?](#what-if-i-am-working-with-data-that-are-not-related-to-energy-consumption) 25 | - [References on intermittent time series: ](#references-on-intermittent-time-series-) 26 | - [Methods to forecast intermittent time series (not yet implemented in this accelerator):](#methods-to-forecast-intermittent-time-series-not-yet-implemented-in-this-accelerator) 27 | - [Constant](#constant) 28 | - [Constant at zero](#constant-at-zero) 29 | - [Unforecastable time and unforecastable quantity](#unforecastable-time-and-unforecastable-quantity) 30 | - [Spikes, lumpy, erratic](#spikes-lumpy-erratic) 31 | - [Clustering profiles](#clustering-profiles) 32 | - [Methods to forecast regular time series](#methods-to-forecast-regular-time-series) 33 | - [Getting Started](#getting-started) 34 | - [config.yaml file example](#configyaml-file-example) 35 | - [Default Directory Structure](#default-directory-structure) 36 | - [Build and Test](#build-and-test) 37 | - [Functions](#functions) 38 | - [Plotting](#plotting) 39 | - [Profiling](#profiling) 40 | - [Regressors](#regressors) 41 | - [Scoring](#scoring) 42 | - [Kpi](#kpi) 43 | - [Utils](#utils) 44 | - [Contributing](#contributing) 45 | - [As data scientist, how can I contribute?](#as-data-scientist-how-can-i-contribute) 46 | - [How to contribute to profiling?](#how-to-contribute-to-profiling) 47 | - [Insurance Claims data](#insurance-claims-data) 48 | - [How to contribute to data preparation and scoring?](#how-to-contribute-to-data-preparation-and-scoring) 49 | - [Trademarks](#trademarks) 50 | # Overview 51 | This accelerator provides code and guidance to produce time series forecasts and time series profiling. The aim of this accelerator is to help data scientists forecast multiple time series by building models based on time-series profiling, by performing accurate data preparation, and by training and forecasting the series with models created ad hoc for each profile. 52 | 53 | Time series modelling is defined as the combination of: 54 | 1. Choice of explanatory variables or regressors - which variables help me explain the target variable I want to forecast? 55 | 2. Choice of forecasting algorithm - which algorithm do I use to produce my forecast? ARIMA, linear regression, a boosting model? 56 | 3. Choice of train set - how many observations do I use to train my model and produce my forecast? 57 | 58 | Each model is optimized to better fit the training dataset and forecast the target variable: from energy consumption to spare parts demand. Classifying or clustering the profiles of time series data helps in defining the best-fitting model in terms of choice of regressors (calendar variables or temperatures), forecasting algorithm (ARIMA vs exponential smoothing) and train set (one year or just a few days of data). 59 | 60 | # I am a data scientist new to demand forecasting. How can this accelerator help me? What should I do to use it? 
61 | ## What do I need in terms of time series data to use this accelerator? 62 | This accelerator deals with so-called **panel data**. In statistics and econometrics, panel data or longitudinal data is a collection of data that contains observations about different cross sections (groups or ids), assembled over intervals in time and ordered chronologically. Examples of groups that may make up panel data series include countries, firms, individuals, or demographic groups. 63 | 64 | ![Alt text](Docs/Images/panel_data.png?raw=true "Panel data") 65 | 66 | Specifically: 67 | 68 | | Group or Id | Time period | Notation | 69 | | :--- | :--- | :--- | 70 | | 1 | 1 | $Y_{11}$ | 71 | | 1 | 2 | $Y_{12}$ | 72 | | 1 | T | $Y_{1T}$ | 73 | | $\vdots$ | $\vdots$ | $\vdots$ | 74 | | N | 1 | $Y_{N1}$ | 75 | | N | 2 | $Y_{N2}$ | 76 | | N | T | $Y_{NT}$ | 77 | 78 | Example datasets: 79 | 80 | | Field | Topics | Example dataset | 81 | | :--- | :--- | :--- | 82 | | Microeconomics | GDP across multiple countries, unemployment across different states, income dynamics studies, international current account balances | [Panel Study of Income Dynamics (PSID)](https://psidonline.isr.umich.edu/) | 83 | | Macroeconomics | International trade tables, world socioeconomic tables, currency exchange rate tables | [Penn World Tables](https://www.rug.nl/ggdc/productivity/pwt/) | 84 | | Epidemiology and Health Statistics | Public health insurance data, disease survival rate data, child development and well-being data | [Medical Expenditure Panel Survey](https://www.meps.ahrq.gov/mepsweb/) | 85 | | Finance | Stock prices by firm, market volatilities by country or firm | [Global Market Indices](https://finance.yahoo.com/world-indices/) | 86 | 87 | If you have a **single time series**, it can be thought of as a special case of panel data with one dimension only (one panel member or individual), so you can still take advantage of the accelerator, although it is not useful to run the profiler, since you will have just one profile by default. 88 | 89 | ## Why this accelerator might be useful for you 90 | 1. It provides you with guidelines in the form of notebooks that help you take into account all the steps necessary to perform good data preparation, which is crucial in forecasting 91 | 2. It provides you with a library of functions you might need when dealing with demand forecasting, such as: 92 | - Sliding plots like the one below: 93 | ![Alt text](Docs/Images/sliding_plot.png?raw=true "Sliding plot") 94 | - Adding holidays by country or other regressors such as months, weekdays and interaction terms 95 | - Creating normal temperature future scenarios to generate years-ahead forecasts 96 | - Filling missing data using similar-day or similar-week values 97 | - Computing errors like mean absolute error and mean absolute percentage error (also in case of a zero denominator...) 98 | - Wrapping up results in Excel or csv files 99 | 3. If you have several time series to forecast, the **Profiling** module allows you to quickly understand how "difficult" to forecast the time series you are dealing with are, by classifying them as intermittent or regular. Be aware that if profiling classifies a series as intermittent, you might not get consistent accuracy. This is crucial to set the right customer expectations on forecast accuracy. 
Profiling also helps you accelerate the production of forecasts when dealing with a large number of time series (more than 10 and fewer than 100): by grouping time series, for example into 2 intermittent + 4 regular consumption profiles, you can develop 6 models that are applied by category, thus reducing workload and increasing accuracy 100 | 4. It helps you quickly run backtesting with multiple models and choose the best model in terms of mean absolute error 101 | 102 | ## How to use this accelerator as guideline 103 | This accelerator provides you with 5 Notebooks that drive you through the essential steps you need to obtain a good forecast. 104 | 105 | ### Notebooks 106 | Notebooks are available in the Notebooks folder and provide guidance to use the Forecast 2.0 functions. 107 | #### 1. EnergyDataExploration 108 | [A notebook](./Notebooks/EnergyDataExploration.ipynb) that provides an exploratory data analysis in order to understand the type of time series you are dealing with 109 | #### 2. EnergyPredictionDataPreparation 110 | [A notebook](./Notebooks/EnergyPredictionDataPreparation.ipynb) that helps with Time Series Data Preparation, in particular how to deal with NAs, how to aggregate time series and how to create useful regressors (e.g. calendar variables) 111 | #### 3. EnergyProfilingIntermittent 112 | [A notebook](./Notebooks/EnergyProfilingIntermittent.ipynb) that profiles time series as regular, intermittent, lumpy, erratic, unforecastable in terms of time, unforecastable in terms of quantity, constant and constant at zero 113 | #### 4. EnergyClusteringRegular 114 | [A notebook](./Notebooks/EnergyClusteringRegular.ipynb) that performs a k-means flat cluster analysis on those time series that were classified as regular 115 | #### 5. EnergyPredictionScoring 116 | [A notebook](./Notebooks/EnergyPredictionScoring.ipynb) that helps you produce a forecast, plot the results and compute KPIs on a panel dataframe, where you have multiple time series identified by a given group or id (e.g. multiple sensor time series, multiple plants or site-id energy consumption, etc.) 117 | 118 | ## How should I validate a model? 119 | You can validate your model using the following KPIs (implemented; please refer to the EnergyPredictionScoring Notebook and to the Functions section below): 120 | 1. `Mean Error`: average of all errors (forecast - actual) 121 | 2. `Mean Absolute Error`: average of all absolute errors |forecast - actual| 122 | 3. `Mean Absolute Percentage Error`: average of all absolute errors divided by the actuals 123 | 124 | ### Interpreting errors 125 | As you can infer, the above KPI values depend on: 126 | - **Seasonality** 127 | This means that when you have, for example, yearly seasonality, you might have periods of the year where the model performs better and periods where it performs worse. Make sure you know which periods matter most for your use case. 128 | - **Low demand values** 129 | This means that when you have, for example, a lot of low-demand actual values and your forecast is in the neighbourhood of those values, your Absolute Percentage Error will easily end up very close to 1, significantly worsening your MAPE. Make sure to interpret your error results accordingly. 130 | 131 | Other important factors that can affect your error: 132 | - **Auto-regressive components** 133 | If you have data that allows you to employ auto-regressive components, i.e. the lagged values of the variable you want to forecast, this will improve your accuracy significantly. 
134 | - **Length of forecast horizon** 135 | If you need to forecast over a long horizon (i.e. you start from daily data granularity and you need to forecast years ahead), your accuracy will decrease 136 | - **Measurement error** 137 | If your data has a lot of outliers, missing data or measurement errors (e.g. sensor data), this will reduce your accuracy 138 | - **Collinearity** 139 | Multicollinearity is a statistical concept where several independent variables in a model are correlated. Two variables are considered to be perfectly collinear if their correlation coefficient is +/- 1.0. Multicollinearity among independent variables will result in less reliable statistical inferences. You might consider using techniques such as Principal Component Analysis in order to deal with the issue. 140 | 141 | # Profiling (clustering) Time Series: 142 | The **goal** is to identify consumption patterns that are similar to each other in order to assign the optimal model in terms of minimum MAE or MSE. 143 | 144 | The **first step** is to separate the series classified as "intermittent" from the "regular" ones, and **then** proceed to perform a k-means cluster analysis only on the latter. 145 | 146 | The **expected output** is a label for each time series as either intermittent or regular. 147 | 148 | ### Identifying intermittent time series: 149 | Definition of intermittent time series: intermittent time series or demand comes about when a product or a time series experiences several periods of zero demand. Often in these situations, when demand occurs it is small and sometimes highly variable in size. 150 | 151 | #### How to identify intermittent time series: 152 | Compute the following indicators: 153 | 1. Average Inter-demand Interval (ADI): this parameter is period based and is calculated as the average interval time between two demand occurrences 154 | 2. Coefficient of Variation Squared (CV2): this statistical parameter is calculated as the standard deviation of demand divided by the average demand for non-zero demand periods, squared. The squared coefficient of variation represents the variability of demand size. 155 | 3. Standard Deviation of Inter-demand Interval (SDDI) 156 | 157 | Based on their values, it is possible to classify time series as: 158 | - spikes 159 | - lumpy 160 | - erratic 161 | - unforecastable in terms of time volatility 162 | - unforecastable in terms of quantity volatility 163 | - constant 164 | - constant at zero 165 | - regular time series 166 | 167 | ![Alt text](Docs/Images/intermittent_TS.png?raw=true "Intermittent time series") 168 | 169 | #### Intermittent indicators parameters 170 | Intermittent indicator parameters vary depending on the type of time series (i.e. the data generation process of the time series), such as energy consumption in kWh or insurance claims in USD. Therefore, the intermittent indicators must be set each time depending on the type of time series, and they are validated by looking at the time series charts produced by the profiling Notebook. A minimal sketch of how the three indicators can be computed is shown below. 
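The sketch below is an illustration only, assuming a pandas Series of demand values at a fixed frequency; it is not the accelerator's `Intermittent` class (which works on grouped dataframes and also applies winsorization), but it shows one way the three indicators can be computed for a single series:

```python
# Illustrative only: computing ADI, CV2 and SDDI for one demand series.
# Assumes `demand` is a pandas Series at a fixed frequency.
import numpy as np
import pandas as pd

def intermittent_indicators(demand: pd.Series):
    positions = np.flatnonzero(demand.values > 0)   # periods with non-zero demand
    intervals = np.diff(positions)                  # inter-demand intervals (in periods)
    nonzero = demand[demand > 0]
    adi = intervals.mean() if intervals.size else np.nan        # Average Inter-demand Interval
    cv2 = (nonzero.std() / nonzero.mean()) ** 2 if len(nonzero) > 1 else np.nan  # squared CV of demand size
    sddi = intervals.std() if intervals.size else np.nan        # Std. Dev. of Inter-demand Intervals
    return adi, cv2, sddi

demand = pd.Series([0, 0, 5, 0, 0, 0, 7, 0, 6, 0, 0, 8],
                   index=pd.date_range("2022-01-01", periods=12, freq="D"))
print(intermittent_indicators(demand))  # high ADI/CV2 values point towards an intermittent profile
```

These values are then compared against the thresholds listed next (thres_adi, thres_cv2, thres_sddi, ...) to assign a profile.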
171 | 172 | Intermittent indicators are the following: 173 | - **thres_cv2_constant** defines the threshold value used to identify constant time series with respect to constant-at-zero time series 174 | - **thres_cv2** defines the threshold value between low CV2 and high CV2 175 | - **thres_adi** defines the threshold value between low ADI and high ADI 176 | - **thres_sddi** defines the threshold value between low SDDI and high SDDI 177 | - **min_time_cons** defines the minimum time between two demand entries (demand on versus demand off) 178 | 179 | Parameters for electricity consumption in kWh, daily data. 180 | - thres_cv2_constant = 0.06 181 | - thres_cv2 = 2 182 | - thres_adi = 3 183 | - thres_sddi = 6.2 184 | - min_time_cons = 2 185 | 186 | Parameters for insurance claims data in USD, daily data. Claims from work accidents in the mining industry. 187 | - thres_cv2_constant = 0.01 188 | - thres_cv2 = 0.2 189 | - thres_adi = 1.2 190 | - thres_sddi = 6.0 191 | - min_time_cons = 25 192 | 193 | ##### What if I am working with data that are not related to energy consumption? 194 | You can still use the accelerator and the profiler, but you need to set up new intermittent indicator parameters. To do so, create a copy of the DataPreparation and ProfilingIntermittent Notebooks, run the DataPreparation first and save your data. Load the data into the ProfilingIntermittent and, keeping in mind the [Intermittent Classification Chart](Docs/Images/intermittent_TS.png?raw=true "Intermittent time series"), set new values for thres_cv2_constant, thres_cv2, thres_adi, thres_sddi and min_time_cons, then check whether the resulting classification makes sense. 195 | 196 | ### References on intermittent time series: 197 | 198 | - [Lancaster Centre For Marketing Analytics and Forecasting](https://www.lancaster.ac.uk/lums/research/areas-of-expertise/centre-for-marketing-analytics-and-forecasting/) 199 | 200 | - [Methods for Intermittent Demand Forecasting](https://www.lancaster.ac.uk/pg/waller/pdfs/Intermittent_Demand_Forecasting.pdf) 201 | 202 | #### Methods to forecast intermittent time series (not yet implemented in this accelerator): 203 | ##### Constant 204 | - Moving average 205 | ##### Constant at zero 206 | - Moving average or actual zero value 207 | ##### Unforecastable time and unforecastable quantity 208 | - Do not use a statistical model; it is better to develop a deterministic model (i.e. 
based on if/then rules) 209 | ##### Spikes, lumpy, erratic 210 | - Croston’s method: Implementation in [sktime](https://www.sktime.org/en/v0.8.0/api_reference/auto_generated/sktime.forecasting.croston.Croston.html) 211 | - Adjusted Croston methods 212 | - Model-based forecasting methods 213 | - ARMA models 214 | - DARMA models -> Discrete ARMA 215 | - INARMA models -> Integer-valued ARMA (INARMA) 216 | 217 | ### Clustering profiles 218 | - Clustering regular time series using K-Means flat 219 | - Choose the optimal number of clusters 220 | - As a method to choose the optimal number of clusters, use the maximum explained variance at the minimum number of clusters -> Elbow Method 221 | ![Alt text](Docs/Images/elbow.png?raw=true "Elbow method") 222 | - Check whether identified profiles have a business meaning 223 | - Define and assign a best model: 224 | - use temperatures if heating or cooling is present in an energy consumption use case 225 | ![Alt text](Docs/Images/thermal.png?raw=true "Thermal time series") 226 | - use calendar variable correlations when temperature is not present 227 | ![Alt text](Docs/Images/calendar.png?raw=true "Calendar time series") 228 | 229 | #### Methods to forecast regular time series 230 | |# | Model | Library | Status | Notes | 231 | | :--- | :----: | ---: | ---: |---: | 232 | | 1 | Linear regression | [statsmodel](https://www.statsmodels.org/stable/api.html#univariate-time-series-analysis) |Implemented | | 233 | | 2 | Gradient boosting | [xgboost](https://xgboost.readthedocs.io/en/stable/) |Implemented | | 234 | | 3 | Random forest | [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) |Implemented | | 235 | | 4 | Kats |[Kats](https://facebookresearch.github.io/Kats/api/) |Not yet implemented | | 236 | | 5 | Prophet | [Prophet](https://facebook.github.io/prophet/docs/quick_start.html#python-api)|Not yet implemented |Decompose into trend + season + holiday, etc | 237 | | 6 |Neural networks|[Neural prophet](https://neuralprophet.com/html/index.html) |Not yet implemented | | 238 | | 7 |Probabilistic model|[PyFlux](https://github.com/RJT1990/pyflux) |Not yet implemented | | 239 | | 8|Scikit-learn wrapper|[Sktime](https://www.sktime.org/en/stable/) |Not yet implemented | | 240 | | 9|Automatic time series|[AutoTimeSeries](https://github.com/AutoViML/Auto_TS) |Not yet implemented | | 241 | | 10 |Create synthetic time series for model testing|[TimeSynth](https://github.com/TimeSynth/TimeSynth) |Not yet implemented | | 242 | | 11 |Computes series characteristics|[Tsfresh](https://github.com/blue-yonder/tsfresh) |Not yet implemented | | 243 | | 12 |ARIMA and deep NN|[Darts](https://github.com/unit8co/darts) |Not yet implemented | | 244 | | 13 |Uber forecasting package|[Orbit](https://github.com/uber/orbit) |Not yet implemented | pystan backend | 245 | | 14 |Converting dates|[Arrow](https://github.com/arrow-py/arrow) |Not yet implemented | | 246 | | 15 |Hydro(geo)logical time series analysis|[Pastas](https://github.com/pastas/pastas) |Not yet implemented | | 247 | | 16|Deep learning|[Flow forecast](https://github.com/AIStream-Peelout/flow-forecast) |Not yet implemented | | 248 | | 17 |Automating iterative tasks of machine learning model development|[AutoML in Azure ML](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#supported-models) |Not yet implemented | | 249 | | 18 |Netflix forecasting package | [Metaflow](https://docs.metaflow.org/introduction/what-is-metaflow) |Not yet implemented | | 250 | 
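To illustrate how one of the implemented model families can be applied to a regular series, here is a minimal end-to-end sketch. It is not the accelerator's Training/Forecasting classes (those are documented in the Functions section below); the toy dataframe, column names and hyperparameters are made up for the example, which fits gradient boosting via xgboost on calendar regressors:

```python
# Illustrative sketch only (not the accelerator's Training/Forecasting classes):
# fit gradient boosting on calendar regressors and score a hold-out period with MAE.
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Toy daily series: higher consumption on weekdays plus a slow trend
df = pd.DataFrame({"timestamp": pd.date_range("2021-01-01", periods=400, freq="D")})
df["value"] = 100 + 10 * (df["timestamp"].dt.dayofweek < 5) + 0.05 * pd.Series(range(len(df)))

# Calendar regressors: weekday and month dummies
X = pd.get_dummies(df["timestamp"].dt.dayofweek, prefix="wd", dtype=int).join(
    pd.get_dummies(df["timestamp"].dt.month, prefix="month", dtype=int))

train_mask = df["timestamp"] < "2022-01-01"   # train on 2021, test on the rest
model = xgb.XGBRegressor(n_estimators=200, max_depth=3, learning_rate=0.1)
model.fit(X[train_mask], df.loc[train_mask, "value"])

forecast = model.predict(X[~train_mask])
print("MAE:", mean_absolute_error(df.loc[~train_mask, "value"], forecast))
```

In the accelerator itself, the equivalent steps are handled by the TrainTest, Training, Forecasting and Kpi classes described in the Functions section below.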
251 | # Getting Started 252 | 1. Create a new conda environment named forecasting_energy using the `forecasting_energy.yml` file in the `Environment` folder of the repository. To install a new environment using conda, you can open Anaconda Navigator, click on Import, name the new environment forecasting_energy, select Python 3.8 and use the path to forecasting_energy.yml to install the new environment. Or you can use the following command: 253 | ```sh 254 | conda env create -f ./Environment/forecasting_energy.yml 255 | ``` 256 | 257 | 2. To get an idea of the software dependencies, read `requirements.txt` 258 | 3. Create a `config.yaml` in the `Configuration` folder in order to run the code on your local machine/virtual machine. This is an example of the file (`config.yaml`): 259 | ```sh 260 | data_dir: 261 | input_path: "Data/Input" 262 | output_path: "Data/Output" 263 | plot_path: "Data/Plots" 264 | ``` 265 | 266 | 4. Create your input, output and plot paths 267 | 5. Load the [test dataset from Kaggle](https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download), unzip it and save it in your input folder 268 | 269 | ## Default Directory Structure 270 | 271 | ```bash 272 | ├───Code # Folder containing all the custom functions created for this accelerator 273 | │ ├───Plotting # Plotting functions 274 | │ └───Profiling # Profiling time series functions 275 | │ ├───Intermittent # Identification and classification of intermittent time series functions 276 | │ └───Regressors # Create useful time series regressors, such as calendar variables or temperature transformations 277 | │ └───Scoring # Create train and test sets, training, forecasting and computing KPIs functions 278 | │ └───Utils # Several utils functions called in the notebooks 279 | ├── Configuration # config.py that loads config.yaml with the configuration 280 | ├───Docs # Additional documents 281 | ├───Notebooks # Notebooks to do Profiling, Data Preparation, Scoring and Forecasting 282 | ├───Tests # Test Notebooks to do Profiling, Data Preparation, Scoring and Forecasting on various use cases 283 | ├── .gitignore 284 | ├── CODE_OF_CONDUCT.md 285 | ├── LICENSE.md 286 | ├── README.md 287 | ├── requirements.txt 288 | ├── SECURITY.md 289 | └── SUPPORT.md 290 | ``` 291 | 292 | ## Build and Test 293 | 1. Create a config.yaml as described above and fill it in as follows: 294 | - In data_dir set your folder tree for the input, output and plot folders 295 | - In saving choose your saving preferences 296 | 297 | # Functions 298 | Functions are available in the Code folder. 299 | 300 | ## Plotting 301 | - Class Plots 302 | 303 | ```sh 304 | sliding_line_plot(df, serie_to_plot, id, i, chart_title="") 305 | ``` 306 | 307 | - Creates a sliding time series chart 308 | 309 | ```sh 310 | sliding_fcst_plot(df, predict_col, expected_values, chart_title="", kpi=True) 311 | ``` 312 | 313 | Creates a forecast vs actual sliding time series chart, with KPI option 314 | 315 | 316 | ## Profiling 317 | - Class Intermittent 318 | ```sh 319 | cv2_by_group(df, y, grouping_var, highest=0.05, lowest=0.05): 320 | ``` 321 | - Computes cv2 by group 322 | ```sh 323 | cv2(array, highest=0.05, lowest=0.05): 324 | ``` 325 | - Computes cv2 (the highest and lowest arguments control winsorization). Winsorization is the process of replacing the extreme values of statistical data in order to limit 326 | the effect of the outliers on the calculations or the results obtained by using that data. 327 | The mean value calculated after such replacement of the extreme values is called winsorized mean. 
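To make the winsorized CV2 idea concrete, here is a minimal, illustrative sketch (assumed behaviour, not the exact implementation of the accelerator's `cv2()` function) using `scipy.stats.mstats.winsorize`, which the accelerator already imports:

```python
# Illustrative sketch of a winsorized CV2 (assumed behaviour, not the exact
# implementation in Code/Profiling/Intermittent/intermittent.py).
import numpy as np
from scipy.stats.mstats import winsorize

def cv2_winsorized(array, highest=0.05, lowest=0.05):
    # trim the bottom `lowest` and top `highest` fractions of values
    w = np.asarray(winsorize(np.asarray(array, dtype=float), limits=(lowest, highest)))
    nonzero = w[w > 0]                      # CV2 is computed on non-zero demand periods
    return (nonzero.std() / nonzero.mean()) ** 2

print(cv2_winsorized([0, 3, 4, 5, 120, 4, 0, 6], highest=0.2, lowest=0.2))  # 120 is clipped to the next-largest value
```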
328 | ```sh 329 | adi(array, highest=0.05, lowest=0.05): 330 | ``` 331 | - Computes ADI (Average Inter-demand Interval) 332 | ```sh 333 | sddi(array, highest=0.05, lowest=0.05): 334 | ``` 335 | - Computes SDDI (Standard Deviation of Inter-demand Interval) 336 | ```sh 337 | compute_indicator_values(vect, threshold, perc, quant, highest, lowest): 338 | ``` 339 | - Computes indicator values 340 | ```sh 341 | enh_compute_indicator_values(vect, threshold, perc, quant, highest, lowest): 342 | ``` 343 | Computes indicator values (enhanced) 344 | 345 | ## Regressors 346 | - Class Regressors 347 | ```sh 348 | create_interactions(df, var1, var2) 349 | ``` 350 | Adds interaction terms between two variables as var1*var2 to the dataframe 351 | 352 | ```sh 353 | create_non_linear_terms(df, var, n) 354 | ``` 355 | Adds non-linear terms such as var^2 to the dataframe 356 | 357 | ```sh 358 | add_holidays_by_country(df, date_var, country) 359 | ``` 360 | Adds holidays as a dummy variable (0/1) to the dataframe 361 | 362 | ```sh 363 | add_weekdays(df, date_var) 364 | ``` 365 | Adds a dummy variable (0/1) for each weekday to the dataframe 366 | 367 | ```sh 368 | add_months(df, date_var) 369 | ``` 370 | Adds a dummy variable (0/1) for each month to the dataframe 371 | 372 | ```bash 373 | calculate_degree_days(df, base_temperature, temperature) 374 | ``` 375 | Calculates the Heating and Cooling Degree Days values 376 | 377 | ```bash 378 | merge_holidays_by_date(df, df_holidays, id) 379 | ``` 380 | Merges the holiday df with the train df 381 | 382 | ```bash 383 | merge_additional_days_off(df, df_metadata, id, dict_days_off) 384 | ``` 385 | Merges site weekend data with the train df 386 | 387 | ```bash 388 | merge_weather(df, weather, date_var, id) 389 | ``` 390 | Merges weather data into the train df 391 | 392 | - Class SimilarDay: 393 | ```bash 394 | get_similar_days_in_previous_year(dates, country) 395 | ``` 396 | Retrieves the similar day in the previous year for each given date 397 | 398 | ```bash 399 | get_similar_days_in_previous_week(dates, country) 400 | ``` 401 | Retrieves the similar day in the previous week for each given date 402 | 403 | ```bash 404 | get_similar_day_in_previous_year(d, holiday_calendar) 405 | ``` 406 | Retrieves the similar day for a given date. If the given date is not a holiday, the similar day is the closest day of the previous year in terms of calendar position which shares the weekday. If such a date is a holiday, the same weekday of the week before is considered. 407 | 408 | ```bash 409 | get_similar_day_in_previous_week(d, holiday_calendar) 410 | ``` 411 | Retrieves the similar day for a given date. If the given date is not a holiday, the similar day is the closest day of the previous year in terms of calendar position which shares the weekday. If such a date is a holiday, the same weekday of the week before is considered. If the given date is a holiday, its similar day is the closest holiday to the given date in the previous year. 
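The similar-day rule described above can be sketched as follows. This is an assumed, simplified interpretation that covers only the non-holiday branch; it is not the accelerator's `SimilarDay` implementation, and it uses the `holidays` Python package (the same package the holiday regressors are based on):

```python
# Simplified, assumed interpretation of the similar-day rule above
# (not the accelerator's SimilarDay code); handles the non-holiday case only.
import datetime as dt
import holidays

def similar_day_previous_year(d: dt.date, holiday_calendar) -> dt.date:
    candidate = d - dt.timedelta(days=364)   # 52 weeks back: same weekday, close calendar position
    while candidate in holiday_calendar:     # if that day is a holiday, fall back one more week
        candidate -= dt.timedelta(days=7)
    return candidate

us_holidays = holidays.UnitedStates()
print(similar_day_previous_year(dt.date(2022, 7, 6), us_holidays))   # -> a Wednesday in July 2021
```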
412 | 413 | - Class StandardConsumption: 414 | ```bash 415 | get_standard_consumption_as_mean(df, id, date_var, var, country) 416 | ``` 417 | Retrieves the standard consumption for a given date as hourly monthly mean differentiated by holiday, weekend, weekdays 418 | 419 | - Class Temperatures: 420 | ```bash 421 | ten_year(df, id, date_var = 'date_daily', start_date ='', end_date='31/12/2050') 422 | ``` 423 | Computes ten year averages temperatures and As-Is temperatures: where available use actual temp, if not use ten year averages 424 | 425 | ```bash 426 | get_minimum_consumption(df, date_var, var, country) 427 | ``` 428 | Retrieves the minimum consumption for a given date as hourly monthly minimum value differentiated by holiday, weekend, night 429 | 430 | ## Scoring 431 | - Class Training 432 | ```bash 433 | train(dict_model_to_train, model) 434 | ``` 435 | Generate train 436 | 437 | - Class Forecasting 438 | ```bash 439 | forecast(dict_test, trained_model) 440 | ``` 441 | Generate forecast 442 | 443 | - Class Scoring 444 | ```bash 445 | find_best_algorithm(y, dict_train, dict_test, dict_algorithms, out_of_sample) 446 | ``` 447 | Finds the best performing algorithm in terms of min mean absolute error 448 | 449 | ```bash 450 | stats_per_site(df, id, date_var) 451 | ``` 452 | Helper function to identify amount of data per site 453 | 454 | ```bash 455 | resample_train_data(df, date_var, id, predict_col, sampling="D") 456 | ``` 457 | Resample the data to a particular frequency 458 | 459 | - Class TrainTest 460 | ```bash 461 | define_train_test_set_dates(df, y, train_start_date, train_end_date, test_start_date, test_end_date, test_size=0.33) 462 | ``` 463 | Defines train and test dates if left blank 464 | 465 | ```bash 466 | def_train(df, y, list_id, train_start_date='', train_end_date='') 467 | ``` 468 | Define train dataset 469 | 470 | ```bash 471 | def_test(df, y, list_id, test_start_date='', test_end_date='') 472 | ``` 473 | Define test dataset 474 | 475 | ## Kpi 476 | - Class Kpi 477 | ```bash 478 | find_mae(y, dict_train, dict_test, dict_models): 479 | ``` 480 | Compute mean absolute error 481 | ```bash 482 | compute_error(df, fcst, y): 483 | ``` 484 | Compute error as forecast-actual 485 | ```bash 486 | compute_absolute_error(df, fcst, y): 487 | ``` 488 | Compute absolute error as abs(forecast-actual) 489 | ```bash 490 | compute_absolute_percentage_error(df, fcst, y): 491 | ``` 492 | Compute absolute % error 493 | ```bash 494 | compute_mean_error(df, fcst, y): 495 | ``` 496 | Compute mean error 497 | ```bash 498 | compute_mae(df, fcst, y): 499 | ``` 500 | Compute mean absolute error 501 | ```bash 502 | compute_mape(df, fcst, y): 503 | ``` 504 | Compute mean absolute % error 505 | 506 | ## Utils 507 | - Class Utils 508 | ```bash 509 | def camel_to_snake(name) 510 | ``` 511 | Changes string from camel case to snake case 512 | ```bash 513 | columns_camel_to_snake(df) 514 | ``` 515 | Changes dataframe columns from camel case to snake case 516 | ```bash 517 | find_date(df) 518 | ``` 519 | Finds date columns in a dataframe 520 | ```bash 521 | find_match_in_list(list_to_match, match_to_find): 522 | ``` 523 | Finds a match in a list given a list of possible words to match 524 | ```bash 525 | delta_format(delta: np.timedelta64) -> str: 526 | ``` 527 | Identifies frequency in numpy timedelta 528 | ```bash 529 | find_freq(timedelta): 530 | ``` 531 | Finds frequency in numpy timedelta 532 | ```bash 533 | find_freq_in_dataframe(df, date_var) 534 | ``` 535 | Finds frequency in pandas dataframe 536 
| ```bash 537 | create_folder_tree(folder_name) 538 | ``` 539 | Creates a folder tree 540 | ```bash 541 | get_project_root(Path): 542 | ``` 543 | Finds the parent folder of the project 544 | ```bash 545 | add_daily_date(df): 546 | ``` 547 | Adds a date variable at daily frequency to the dataframe 548 | ```bash 549 | find_categorical_variables(df): 550 | ``` 551 | Finds categorical variables in a pandas dataframe 552 | ```bash 553 | resample_data(df, id, date_var, sampling, dict_grouping) 554 | ``` 555 | Resamples by aggregating the data to a particular frequency, applying the functions defined in dict_grouping as {variable_to_resample: 'function_to_apply'}, e.g. {value: 'sum'} 556 | ```bash 557 | resample_data(df, id, date_var, sampling, dict_grouping) 558 | ``` 559 | Resamples by aggregating the data to a particular frequency (x-m, x-h, x-D, e.g. 3-M), with the aggregation per column defined as {variable_to_resample: 'function_to_apply'}, e.g. {value: 'sum'} 560 | ```bash 561 | add_seq(df, date_var, serie, freq, end_date='', start_date='') 562 | ``` 563 | Adds a sequence of complete dates/hours to a dataframe 564 | ```bash 565 | check_length_time_serie(df, date_var, index) 566 | ``` 567 | Checks the length that a time series of complete dates/hours should have, so that it can be compared 568 | with the actual observations 569 | ```bash 570 | match_to_find(serie_to_find) 571 | ``` 572 | Finds a match in a list of possible words to match 573 | ```bash 574 | find_match(df, serie_name, match_to_find): 575 | ``` 576 | Finds a match in a dataframe series given a list of possible words to match 577 | ```bash 578 | find_match_in_list(list_to_match, match_to_find) 579 | ``` 580 | Finds a match in a list given a list of possible words to match 581 | ```bash 582 | id_outliers_IQR(df, q1, q3, date_var, id, var, freq_var) 583 | ``` 584 | Identifies outliers by creating a dummy variable (0/1) called outlier using the IQR method, where the quantile values can be set 585 | 586 | - Class AlphabeticalCombinations 587 | ```bash 588 | write_neat_csv(saving_file, df_fcst) 589 | ``` 590 | Writes a neat csv 591 | ```bash 592 | convert(string) 593 | ``` 594 | Converts a string to a list 595 | ```bash 596 | excel_columns() 597 | ``` 598 | Counts Excel columns 599 | ```bash 600 | write_beautiful_excel(saving_file, dict_df_to_write) 601 | ``` 602 | Writes a nicely formatted Excel file 603 | ```bash 604 | write_beautiful_excel_table(saving_file, dict_df_to_write) 605 | ``` 606 | Writes nicely formatted Excel tables 607 | 608 | # Contributing 609 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 610 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 611 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 612 | 613 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 614 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 615 | provided by the bot. You will only need to do this once across all repos using our CLA. 616 | 617 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 618 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 619 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 620 | 621 | ## As data scientist, how can I contribute? 
622 | You can contribute both in extending the **Profiling** tool and in the data preparation and scoring part of this accelerator. 623 | 624 | ### How to contribute to profiling? 625 | What needs to be done is to test and define intermittent indicators (thres_cv2_constant, thres_cv2, thres_adi, thres_sddi, min_time_cons) for other types of data than electricity consumption, as reported below. 626 | 627 | #### Insurance Claims data 628 | Insurance claims data in USD, daily data. Claims from work accidents in mining industry. 629 | 630 | - thres_cv2_constant = 0.01 631 | - thres_cv2 = 0.2 632 | - thres_adi = 1.2 633 | - thres_sddi = 6.0 634 | - min_time_cons = 25 635 | 636 | ### How to contribute to data preparation and scoring? 637 | What needs to be done is to improve the code to make it scalable and more efficient when working with big datasets (e.g. more than 100 id). 638 | 639 | # Trademarks 640 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 641 | trademarks or logos is subject to and must follow 642 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 643 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 644 | Any use of third-party trademarks or logos are subject to those third-party's policies. 645 | 646 | 647 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 
26 | -------------------------------------------------------------------------------- /Tests/InsuranceClaimsDataPreparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "dc3e4402", 6 | "metadata": {}, 7 | "source": [ 8 | "# Implementation" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "sWbXCGozBRNW", 14 | "metadata": { 15 | "id": "sWbXCGozBRNW" 16 | }, 17 | "source": [ 18 | "## Packages" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "kmxpysFu7zjH", 25 | "metadata": { 26 | "colab": { 27 | "base_uri": "https://localhost:8080/" 28 | }, 29 | "id": "kmxpysFu7zjH", 30 | "outputId": "db2717d5-22be-4fa8-99fb-3f9ea90e7e1b" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# data elaboration functions\n", 35 | "import pandas as pd\n", 36 | "import string\n", 37 | "import numpy as np\n", 38 | "\n", 39 | "# datetime functions\n", 40 | "import datetime as dt\n", 41 | "\n", 42 | "# file management functions\n", 43 | "import os\n", 44 | "import sys\n", 45 | "import opendatasets as od\n", 46 | "import pickle\n", 47 | "from pathlib import Path\n", 48 | "\n", 49 | "# plot functions\n", 50 | "import matplotlib.pyplot as plt\n", 51 | "%matplotlib inline\n", 52 | "\n", 53 | "# data science functions\n", 54 | "import xgboost as xgb\n", 55 | "from sklearn.model_selection import train_test_split\n", 56 | "from sklearn.ensemble import RandomForestRegressor\n", 57 | "from sklearn.linear_model import LinearRegression\n", 58 | "import joblib\n", 59 | "from sklearn.metrics import mean_absolute_error\n", 60 | "\n", 61 | "# configuration file\n", 62 | "module_path = os.path.abspath(os.path.join('..'))\n", 63 | "if module_path not in sys.path:\n", 64 | " sys.path.append(module_path)\n", 65 | "from Configuration.config import cfg_path\n", 66 | "\n", 67 | "# custom functions\n", 68 | "from Code.Plotting.plots import Plots\n", 69 | "from Code.Regressors.regressors import Regressors\n", 70 | "from Code.Regressors.temperatures import Temperatures\n", 71 | "from Code.Scoring.scoring import Scoring\n", 72 | "from Code.Scoring.train_test import TrainTest\n", 73 | "from Code.Scoring.train import Training\n", 74 | "from Code.Scoring.forecast import Forecasting\n", 75 | "from Code.Scoring.kpi import Kpi\n", 76 | "from Code.Scoring.scoring import Scoring\n", 77 | "from Code.Utils.utils import Utils\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "8dc26b7b", 83 | "metadata": {}, 84 | "source": [ 85 | "## Setup" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "458162d0", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "root = Path(os.getcwd()).parent\n", 96 | "dataset_path = os.path.join(root, cfg_path.data_dir.input_path)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "4Q-4BToWB7LC", 102 | "metadata": { 103 | "id": "4Q-4BToWB7LC" 104 | }, 105 | "source": [ 106 | "## Load Data\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "d7e24623", 113 | "metadata": { 114 | "colab": { 115 | "base_uri": "https://localhost:8080/" 116 | }, 117 | "id": "d7e24623", 118 | "outputId": "30507a03-42e3-4f9e-8b2b-bccb623a06c9" 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "df_train_data = pd.read_csv(os.path.join(\n", 123 | " root, cfg_path.data_dir.input_path, 'insurance-claims.csv'))\n", 124 | "df_train_data.head()" 125 | ] 126 | }, 127 | { 
128 | "cell_type": "markdown", 129 | "id": "1ShqG6YJGmBk", 130 | "metadata": { 131 | "id": "1ShqG6YJGmBk" 132 | }, 133 | "source": [ 134 | "# Data Preparation\n" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "f23ed7fb", 140 | "metadata": {}, 141 | "source": [ 142 | "## Parameter setup" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "0ddada30", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "id = 'ICD10Description'\n", 153 | "list_unique_id = ['ICD10Description', 'DateOfAccident']\n", 154 | "list_temp = []\n", 155 | "y = 'Sum of PaidDaysValue'" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "78309614", 161 | "metadata": {}, 162 | "source": [ 163 | "#### Setting date as datetime" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "51b01c28", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "df_train_data['DateOfAccident'] = pd.to_datetime(df_train_data['DateOfAccident'], format = '%d-%m-%y %H:%M:%S %p')" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "2799c9d5", 179 | "metadata": {}, 180 | "source": [ 181 | "#### Setting forecast end date" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "5f55942a", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "# Make sure to have all regressors available until forecast_end_date (temperatures, etc)\n", 192 | "forecast_end_date = '2022-12-31'" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "08af66c3", 198 | "metadata": {}, 199 | "source": [ 200 | "## Plotting y series" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "23685319", 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# Print available ids and choose which one to plot \n", 211 | "print(list(df_train_data[id].unique())[0:20])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "a1fabf6b", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Adjusting id names by removing special characters\n", 222 | "import re\n", 223 | "df_train_data.loc[:, id] = df_train_data.loc[:, id].apply(lambda x: re.sub('[^A-Za-z0-9]+', '_', x))\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "6e669264", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# Selecting 100 ids to plot\n", 234 | "list_ids_to_plot = list(df_train_data[id].unique()[0:100])" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "109aaf82", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "count = 1\n", 245 | "for i in list_ids_to_plot:\n", 246 | " print('Plotting id:', i, 'as', count, 'of', len(list_ids_to_plot))\n", 247 | " plot = Plots.sliding_line_plot(df_train_data, y, id, i, chart_title=\"\")\n", 248 | " plot.write_html(os.path.join(root, cfg_path.data_dir.plot_path, id + '_' + str(i) + \".html\"))\n", 249 | " count = count + 1 " 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "id": "c0be27d0", 255 | "metadata": {}, 256 | "source": [ 257 | "## Dealing with NAs and aggregating at a chosen frequency" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "0e88444e", 263 | "metadata": {}, 264 | "source": [ 265 | "Create a full time sequence on a chosen frequency and aggregate" 266 | ] 267 | }, 268 
| { 269 | "cell_type": "markdown", 270 | "id": "26de9cc1", 271 | "metadata": {}, 272 | "source": [ 273 | "#### Consumption data (y)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "77429654", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# Selecting 100 ids to elaborate\n", 284 | "df_train_data = df_train_data.loc[df_train_data[id].isin(list_ids_to_plot), ]\n", 285 | "date_var = Utils.find_date(df_train_data)\n", 286 | "print('List ids:', list_ids_to_plot)\n", 287 | "len(list_ids_to_plot)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "f711e287", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "# Resampling function aggregates data in a dataframe with a chosen function, that can vary depending on the variable\n", 298 | "# i.e. temperatures when aggregated should be averaged, consumption should be summed, dummy variables should be pick as 'first'\n", 299 | "\n", 300 | "df_train_data[date_var].apply(lambda x: x.tz_localize(None))\n", 301 | "sampling = dt.timedelta(days=1)\n", 302 | "dict_grouping = {'RmaRegionDesc': 'first', 'Product': 'first', 'Sum of PaidDaysValue': 'sum'}\n", 303 | "df_resampled = Utils.resample_data(df_train_data, id, date_var, sampling, dict_grouping)\n", 304 | "print('List ids after resampling:', list(df_resampled[id].unique()))" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "id": "fecd0d49", 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# Adding a full time sequence\n", 315 | "df_train_data = Utils.add_seq(df_resampled, date_var, serie = id, freq = sampling, end_date=forecast_end_date, start_date='')" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "650cf7b7", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "# This function count the number of obs you should have if you had a full time sequence\n", 326 | "Utils.check_length_time_serie(df_train_data, date_var, index = id).head()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "id": "7d18510c", 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "df_train_data.head()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "id": "42bc870d", 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "print('List ids after resampling and adding full time sequence:', list(df_train_data[id].unique()))" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "id": "8a56ffd2", 352 | "metadata": {}, 353 | "source": [ 354 | "## Creating working dataset" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "id": "6mGY36qeLgvf", 361 | "metadata": { 362 | "id": "6mGY36qeLgvf" 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "# Final df\n", 367 | "df_final = df_train_data.copy()\n", 368 | "\n", 369 | "# Date\n", 370 | "date_var = Utils.find_date(df_final)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "id": "53a5656c", 376 | "metadata": {}, 377 | "source": [ 378 | "#### Count NAs in y by id" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "59ba6bca", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "df_final.head()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "6a3889e4", 395 | "metadata": {}, 396 
| "outputs": [], 397 | "source": [ 398 | "pivotna = pd.pivot_table(df_final[df_final[y].isna()], index=id, values = y, aggfunc='count').reset_index()\n", 399 | "pivotna.rename(columns={y: y + '_count_NA'})\n", 400 | "pivotna" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "id": "6740bfb1", 406 | "metadata": {}, 407 | "source": [ 408 | "### Adding regressors to final dataframe" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "id": "e5c112c5", 414 | "metadata": {}, 415 | "source": [ 416 | "#### Holidays" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "id": "2830270a", 422 | "metadata": {}, 423 | "source": [ 424 | "If you don't have specific holiday dataset, you can use the following general function by country that uses the holiday python package and adds to your dataframe a columns with a holiday dummy variable (0/1):\n", 425 | "\n", 426 | " df_final = Regressors.add_holidays_by_country(df_final, date_var, country = 'France')" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "id": "805ebacf", 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "df_final = Regressors.add_holidays_by_country(df_final, date_var, country='United States')\n", 437 | "print('Min date:', df_final[date_var].min())\n", 438 | "print('Max date:', df_final[date_var].max())" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "id": "395e6780", 444 | "metadata": {}, 445 | "source": [ 446 | "#### Other calendar variables" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "id": "196089f6", 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "df_final = Regressors.add_weekdays(df_final, date_var)\n", 457 | "df_final = Regressors.add_months(df_final, date_var)\n", 458 | "print('Min date:', df_final[date_var].min())\n", 459 | "print('Max date:', df_final[date_var].max())" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "id": "6743f041", 465 | "metadata": {}, 466 | "source": [ 467 | "#### Remove duplicates" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "id": "fbcb2765", 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "df_final = df_final.drop_duplicates()\n", 478 | "print('List ids in df_final after removing duplicates:', list(df_final[id].unique()))\n", 479 | "assert df_final[df_final.duplicated()].count().sum() == 0, \"y should not contain duplicates\"\n", 480 | "print('Min date:', df_final[date_var].min())\n", 481 | "print('Max date:', df_final[date_var].max())" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "id": "a7809c54", 487 | "metadata": {}, 488 | "source": [ 489 | "#### Check regressor availability" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "id": "4ea99f83", 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "df_final.columns" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "id": "e7945831", 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "# Temperatures have been filled, only temperature asis that is the composition between the actual temperature and ten year averages\n", 510 | "regressors_list = [ 'holidays','RmaRegionDesc', 'Product',\n", 511 | " 'holidays', 'wd_mon', 'wd_tue', 'wd_wed',\n", 512 | " 'wd_thu', 'wd_fri', 'wd_sat', 'wd_sun', 'month_01', 'month_02',\n", 513 | " 'month_03', 'month_04', 'month_05', 'month_06', 'month_07', 
'month_08',\n", 514 | " 'month_09', 'month_10', 'month_11', 'month_12']\n", 515 | "\n", 516 | "try:\n", 517 | " Utils.check_regressors_availability(df_final, date_var, regressors_list, forecast_end_date)\n", 518 | "except:\n", 519 | " Utils.remove_regressors_with_nan(df_final, date_var, regressors_list, forecast_end_date)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "id": "f6dff377", 525 | "metadata": {}, 526 | "source": [ 527 | "# Saving" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "id": "4715ab4e", 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "df_final.to_pickle(os.path.join(root, cfg_path.data_dir.output_path, 'insurance_claims_final.pkl'))" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "id": "bd0951d8", 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "print('Min date:', df_final[date_var].min())\n", 548 | "print('Max date:', df_final[date_var].max())\n", 549 | "df_final.head()\n" 550 | ] 551 | } 552 | ], 553 | "metadata": { 554 | "colab": { 555 | "collapsed_sections": [ 556 | "AbKOiffyAql8", 557 | "6YxUycDC9p0h" 558 | ], 559 | "name": "Analysis (1).ipynb", 560 | "provenance": [] 561 | }, 562 | "interpreter": { 563 | "hash": "2b8f5b14411d0017ed363cef4929504a7281087d06f1b18c01da6e951b937e80" 564 | }, 565 | "kernelspec": { 566 | "display_name": "Python 3.7.7 ('forecasting_energy')", 567 | "language": "python", 568 | "name": "python3" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 3 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython3", 580 | "version": "3.7.7" 581 | } 582 | }, 583 | "nbformat": 4, 584 | "nbformat_minor": 5 585 | } 586 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | adal==1.2.5 3 | adjustText==0.7.3 4 | altair==4.1.0 5 | antlr4-python3-runtime==4.8 6 | applicationinsights==0.11.9 7 | argcomplete==1.12.3 8 | argon2-cffi==21.1.0 9 | astor==0.8.1 10 | astunparse==1.6.3 11 | async-generator==1.10 12 | attrs==21.2.0 13 | autopep8==1.5.7 14 | azure-cognitiveservices-vision-customvision==3.0.0 15 | azure-common==1.1.26 16 | azure-core==1.23.0 17 | azure-graphrbac==0.61.1 18 | azure-identity==1.4.1 19 | azure-keyvault-secrets==4.4.0 20 | azure-mgmt-authorization==0.61.0 21 | azure-mgmt-containerregistry==2.8.0 22 | azure-mgmt-keyvault==2.2.0 23 | azure-mgmt-resource==10.3.0 24 | azure-mgmt-storage==11.2.0 25 | azure-storage-blob==12.10.0 26 | azureml-automl-core==1.18.0.post1 27 | azureml-core==1.17.0 28 | azureml-dataprep==2.4.4 29 | azureml-dataprep-native==24.0.0 30 | azureml-dataprep-rslex==1.2.3 31 | azureml-dataset-runtime==1.18.0 32 | azureml-defaults==1.18.0 33 | azureml-model-management-sdk==1.0.1b6.post1 34 | azureml-pipeline==1.18.0 35 | azureml-pipeline-core==1.18.0 36 | azureml-pipeline-steps==1.18.0 37 | azureml-sdk==1.18.0 38 | azureml-telemetry==1.18.0 39 | azureml-train==1.18.0 40 | azureml-train-automl-client==1.18.0 41 | azureml-train-core==1.18.0.post1 42 | azureml-train-restclients-hyperdrive==1.18.0 43 | backcall==0.2.0 44 | backports.tempfile==1.0 45 | backports.weakref==1.0.post1 46 | backports.zoneinfo==0.2.1 47 | base58==2.1.1 48 | bleach==4.1.0 49 | blinker==1.4 50 | 
cached-property==1.5.2 51 | cachetools==4.1.1 52 | certifi==2020.6.20 53 | cffi==1.14.3 54 | charset-normalizer==2.0.6 55 | click==7.1.2 56 | cloudpickle==1.6.0 57 | cmdstanpy==0.9.5 58 | colorama==0.4.4 59 | configparser==3.7.4 60 | contextlib2==0.6.0.post1 61 | convertdate==2.3.2 62 | cryptography==3.3.2 63 | cycler==0.10.0 64 | Cython==0.29.26 65 | databricks-cli==0.16.2 66 | databricks-connect==7.3.30 67 | dateinfer==0.2.0 68 | debugpy==1.5.0 69 | decorator==5.1.0 70 | defusedxml==0.7.1 71 | dill==0.3.3 72 | distro==1.5.0 73 | docker==4.3.1 74 | dotnetcore2==2.1.19 75 | entrypoints==0.3 76 | ephem==4.1.3 77 | et-xmlfile==1.1.0 78 | Flask==1.0.3 79 | fusepy==3.0.1 80 | gast==0.3.3 81 | gitdb==4.0.9 82 | GitPython==3.1.24 83 | google-auth==1.23.0 84 | google-auth-oauthlib==0.4.2 85 | google-pasta==0.2.0 86 | grpcio==1.33.2 87 | gunicorn==19.9.0 88 | h11==0.12.0 89 | h5py==3.1.0 90 | hijri-converter==2.2.2 91 | holidays==0.11.3.1 92 | idna==3.2 93 | importlib-metadata==2.0.0 94 | imutils==0.5.3 95 | ipykernel==6.4.1 96 | ipython==7.31.1 97 | ipython-genutils==0.2.0 98 | ipywidgets==7.6.5 99 | isodate==0.6.0 100 | itsdangerous==1.1.0 101 | jedi==0.18.0 102 | jeepney==0.6.0 103 | Jinja2==2.11.3 104 | jmespath==0.10.0 105 | joblib==0.17.0 106 | json-logging-py==0.2 107 | json5==0.8.5 108 | jsonpickle==1.4.1 109 | jsonschema==4.0.1 110 | jupyter-client==7.0.6 111 | jupyter-core==4.8.1 112 | jupyterlab-pygments==0.1.2 113 | jupyterlab-widgets==1.0.2 114 | kaggle==1.5.12 115 | Keras-Applications==1.0.8 116 | Keras-Preprocessing==1.1.0 117 | kiwisolver==1.3.2 118 | kneed==0.7.0 119 | korean-lunar-calendar==0.2.1 120 | liac-arff==2.5.0 121 | LunarCalendar==0.0.9 122 | Markdown==3.3.3 123 | MarkupSafe==1.1.1 124 | matplotlib==3.4.3 125 | matplotlib-inline==0.1.3 126 | mistune==0.8.4 127 | msal==1.6.0 128 | msal-extensions==0.2.2 129 | msrest==0.6.21 130 | msrestazure==0.6.2 131 | nbclient==0.5.4 132 | nbconvert==6.2.0 133 | nbformat==5.1.3 134 | nbimporter==0.3.4 135 | ndg-httpsclient==0.5.1 136 | nest-asyncio==1.5.1 137 | notebook==6.4.10 138 | numpy==1.21.0 139 | oauthlib==3.1.0 140 | omegaconf==2.1.2 141 | opencv-python==4.3.0.36 142 | opencv-python-headless==4.3.0.36 143 | opendatasets==0.1.20 144 | openpyxl==3.0.9 145 | opt-einsum==3.3.0 146 | outcome==1.1.0 147 | packaging==21.2 148 | pandas==1.3.5 149 | pandasql==0.7.3 150 | pandocfilters==1.5.0 151 | parso==0.8.2 152 | pathspec==0.8.1 153 | patsy==0.5.2 154 | pep8==1.7.1 155 | pickleshare==0.7.5 156 | Pillow==9.0.1 157 | plotly==5.3.1 158 | portalocker==1.7.1 159 | prometheus-client==0.12.0 160 | prompt-toolkit==3.0.20 161 | protobuf==3.15.0 162 | py4j==0.10.9 163 | pyarrow==1.0.1 164 | pyasn1==0.4.8 165 | pyasn1-modules==0.2.8 166 | pycodestyle==2.7.0 167 | pycparser==2.20 168 | pydeck==0.7.1 169 | Pygments==2.10.0 170 | PyJWT==2.4.0 171 | PyMeeus==0.5.11 172 | pyodbc==4.0.32 173 | pyOpenSSL==19.1.0 174 | pyparsing==2.4.7 175 | pyrsistent==0.18.0 176 | pystan==2.19.1.1 177 | python-box==5.4.1 178 | python-dateutil==2.8.1 179 | python-slugify==6.1.1 180 | pytz==2020.4 181 | pytz-deprecation-shim==0.1.0.post0 182 | pywin32==301 183 | pywinpty==1.1.5 184 | PyYAML==6.0 185 | pyzmq==22.3.0 186 | repackage==0.7.3 187 | requests==2.26.0 188 | requests-oauthlib==1.3.0 189 | rsa==4.7 190 | ruamel.yaml==0.16.12 191 | ruamel.yaml.clib==0.2.2 192 | scikit-learn==0.22.2.post1 193 | scipy==1.4.1 194 | seaborn==0.11.2 195 | SecretStorage==3.2.0 196 | selenium==4.0.0 197 | Send2Trash==1.8.0 198 | setuptools-git==1.2 199 | Shapely==1.7.0 200 | six==1.15.0 
201 | sklearn==0.0 202 | smmap==5.0.0 203 | sniffio==1.2.0 204 | sortedcontainers==2.4.0 205 | statsmodels==0.13.1 206 | streamlit==1.1.0 207 | tabulate==0.8.9 208 | tenacity==8.0.1 209 | tensorboard==2.2.2 210 | tensorboard-plugin-wit==1.7.0 211 | tensorflow==2.7.2 212 | tensorflow-estimator==2.2.0 213 | tensorflow-gpu==2.7.2 214 | tensorflow-gpu-estimator==2.2.0 215 | termcolor==1.1.0 216 | terminado==0.12.1 217 | testpath==0.5.0 218 | text-unidecode==1.3 219 | toml==0.10.2 220 | toolz==0.11.1 221 | tornado==6.1 222 | tqdm==4.62.3 223 | traitlets==5.1.0 224 | trio==0.19.0 225 | trio-websocket==0.9.2 226 | typing-extensions==4.1.1 227 | tzdata==2021.5 228 | tzlocal==4.1 229 | ujson==5.2.0 230 | urllib3==1.26.7 231 | validators==0.18.2 232 | watchdog==2.1.6 233 | wcwidth==0.2.5 234 | webencodings==0.5.1 235 | websocket-client==0.57.0 236 | Werkzeug==1.0.1 237 | widgetsnbextension==3.5.2 238 | wincertstore==0.2 239 | wrapt==1.13.1 240 | wsproto==1.0.0 241 | xgboost==1.4.2 242 | xlrd==2.0.1 243 | XlsxWriter==3.0.1 244 | zipp==3.4.0 245 | --------------------------------------------------------------------------------
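
As a quick reference for the preparation steps carried out in the insurance-claims notebook above (daily resampling with per-column aggregation rules, a 0/1 holiday dummy, and weekday/month dummies), here is a minimal pandas-only sketch. The `ClaimId` id column and the toy values are made up for illustration, the country code `'US'` is an assumption, and the snippet only mimics the behaviour described for the toolkit's `Utils.resample_data`, `Regressors.add_holidays_by_country`, `Regressors.add_weekdays` and `Regressors.add_months` helpers; it is not their implementation.

    import holidays
    import pandas as pd

    # Toy frame shaped like the training data; column names other than ClaimId
    # come from the notebook, the values are invented for illustration.
    df = pd.DataFrame({
        'ClaimId': [1, 1, 1, 2],
        'Date': pd.to_datetime(['2021-01-01 08:00', '2021-01-01 17:00',
                                '2021-01-04 09:00', '2021-01-01 12:00']),
        'RmaRegionDesc': ['North', 'North', 'North', 'South'],
        'Product': ['A', 'A', 'A', 'B'],
        'Sum of PaidDaysValue': [1.0, 2.0, 3.0, 4.0],
    })

    # Per-column aggregation rules: categorical columns keep their first value,
    # the target is summed within each daily bucket.
    dict_grouping = {'RmaRegionDesc': 'first', 'Product': 'first', 'Sum of PaidDaysValue': 'sum'}

    # Daily resampling per id: group by the id column and a daily time bucket.
    df_resampled = (df.groupby(['ClaimId', pd.Grouper(key='Date', freq='D')])
                      .agg(dict_grouping)
                      .reset_index())

    # Holiday dummy variable (0/1) for the chosen country, using the holidays
    # package pinned in requirements.txt.
    us_holidays = holidays.CountryHoliday('US')
    df_resampled['holidays'] = df_resampled['Date'].dt.date.map(lambda d: int(d in us_holidays))

    # Calendar dummies named like the notebook's regressors
    # (wd_mon ... wd_sun, month_01 ... month_12).
    wd = pd.get_dummies(df_resampled['Date'].dt.day_name().str.lower().str[:3], prefix='wd')
    months = pd.get_dummies(df_resampled['Date'].dt.strftime('%m'), prefix='month')
    df_resampled = pd.concat([df_resampled, wd, months], axis=1)

    print(df_resampled.head())

Note that `pd.get_dummies` only creates columns for the categories actually present, so with the toy data above only a few `wd_*` and `month_*` columns appear; on a full year of data the complete set of weekday and month dummies is produced.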