├── .gitignore ├── CODE_OF_CONDUCT.md ├── Code ├── Plotting │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── plots.cpython-37.pyc │ └── plots.py ├── Profiling │ ├── Intermittent │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── intermittent.cpython-37.pyc │ │ └── intermittent.py │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── Regressors │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── regressors.cpython-37.pyc │ ├── regressors.py │ ├── similar_day.py │ └── temperatures.py ├── Scoring │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── forecast.cpython-37.pyc │ │ ├── kpi.cpython-37.pyc │ │ ├── scoring.cpython-37.pyc │ │ ├── train.cpython-37.pyc │ │ └── train_test.cpython-37.pyc │ ├── forecast.py │ ├── kpi.py │ ├── scoring.py │ ├── train.py │ └── train_test.py ├── Utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── utils.cpython-37.pyc │ └── utils.py ├── __init__.py └── __pycache__ │ └── __init__.cpython-37.pyc ├── Configuration ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── config.cpython-37.pyc ├── config.py └── config.yaml ├── Dashboards └── EnergyDashboard.pbix ├── Docs ├── Images │ ├── banner.jpg │ ├── calendar.png │ ├── elbow.png │ ├── intermittent_TS.png │ ├── panel_data.png │ ├── sliding_plot.png │ └── thermal.png └── Slides │ └── ds_toolkit_forecasting_2.0_memo.pdf ├── Environment └── forecasting_energy.yml ├── LICENSE ├── Notebooks ├── EnergyClusteringRegular.ipynb ├── EnergyDataExploration.ipynb ├── EnergyPredictionDataPreparation.ipynb ├── EnergyPredictionScoring.ipynb └── EnergyProfilingIntermittent.ipynb ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── Tests ├── InsuranceClaimsDataPreparation.ipynb └── InsuranceClaimsProfilingIntermittent.ipynb └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | Data/ 2 | __pycache__ 3 | **/__pycache__ 4 | *.pyc 5 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /Code/Plotting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Plotting/__init__.py -------------------------------------------------------------------------------- /Code/Plotting/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Plotting/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Plotting/__pycache__/plots.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Plotting/__pycache__/plots.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Plotting/plots.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import pandas as pd 3 | import numpy as np 4 | import re 5 | 6 | # file management functions 7 | import os 8 | import glob 9 | 10 | # time management functions 11 | import datetime as dt 12 | 13 | # plot functions 14 | from matplotlib import pyplot as plt 15 | import matplotlib.dates as mdates 16 | import plotly.graph_objects as go 17 | 18 | # custom functions 19 | from Configuration.config import cfg_path 20 | from Code.Utils.utils import Utils 21 | from Code.Scoring.kpi import Kpi 22 | 23 | class Plots: 24 | 25 | def sliding_line_plot(df, serie_to_plot, id, i, chart_title=""): 26 | """ 27 | Creates a time series plot with sliding dates 28 | :params: df as pandas dataframe 29 | :return: html file with plot 30 | """ 31 | 32 | ### Setup 33 | date = Utils.find_date(df) 34 | 35 | ## Sort 36 | df.sort_values(date, inplace=True) 37 | 38 | ## Create figure 39 | fig = go.Figure() 40 | fig.add_trace(go.Scatter(x=list(df.loc[df[id] == i, date]), y=list(df.loc[df[id] == i, serie_to_plot]), name=str(i))) 41 | 42 | # Set title 43 | if chart_title!="": 44 | fig.update_layout( 45 | title_text=chart_title 46 | ) 47 | 48 | else: 49 | chart_title = serie_to_plot.capitalize() + ' ' + str(id) + ' ' + str(i) 50 | fig.update_layout( 51 | title_text=chart_title 52 | ) 53 | 54 | print('sliding_line_plot: plotting', chart_title) 55 | 56 | # Add range slider 57 | fig.update_layout( 58 | xaxis=dict( 59 | rangeselector=dict( 60 | buttons=list([ 61 | dict(count=1, 62 | label="1m", 63 | step="month", 64 | stepmode="backward"), 65 | dict(count=3, 66 | label="3m", 67 | step="month", 68 | stepmode="backward"), 69 | dict(count=6, 70 | label="6m", 71 | step="month", 72 | stepmode="backward"), 73 | dict(count=1, 74 | label="YTD", 75 | step="year", 76 | stepmode="todate"), 77 | dict(count=1, 78 | label="1y", 79 | step="year", 80 | stepmode="backward"), 81 | dict(step="all") 82 | ]) 83 | ), 84 | rangeslider=dict( 85 | visible=True 86 | ), 87 | type="date" 88 | ) 
89 | ) 90 | return fig 91 | 92 | 93 | def sliding_fcst_plot(df, predict_col, expected_values, chart_title="", kpi=True): 94 | """ 95 | Creates a time series plot with sliding dates 96 | :params: df as pandas dataframe, chart_title as string, kpi as boolean 97 | :return: html file with plot 98 | """ 99 | 100 | ### Setup 101 | date = Utils.find_date(df) 102 | 103 | if isinstance(date, list): 104 | date = list(set(Utils.find_date(df)) - set(['train_start_date', 'train_end_date', 'test_start_date', 'test_end_date']))[0] 105 | 106 | y = predict_col.copy() 107 | fcst = expected_values.copy() 108 | 109 | ## Sort 110 | df = df.sort_values(date).copy() 111 | 112 | ## Adding model info to chart title 113 | if 'best_model' in list(df.columns): 114 | model = df['best_model'].unique()[0] 115 | chart_title = str(chart_title) + ' - ' + model 116 | else: 117 | chart_title = str(chart_title) 118 | 119 | ## Checking KPI 120 | if kpi == True: 121 | try: 122 | mape = str(round(Kpi.compute_mape(df, 'fcst', y), 2)*100) 123 | min_mape_date = min(df.loc[~df.absolute_percentage_error.isnull(), date]).strftime("%d-%m-%Y") 124 | max_mape_date = max(df.loc[~df.absolute_percentage_error.isnull(), date]).strftime("%d-%m-%Y") 125 | chart_title = chart_title + ' - MAPE: ' + mape + "% from " + min_mape_date + ' to ' + max_mape_date 126 | except: 127 | chart_title = str(chart_title) 128 | else: 129 | chart_title = str(chart_title) 130 | 131 | ## Create figure 132 | fig = go.Figure() 133 | 134 | fig.add_trace(go.Scatter(x=list(df[date]), y=list(df[y]), name=y)) 135 | fig.add_trace(go.Scatter(x=list(df[date]), y=list(df[fcst]), name=fcst)) 136 | 137 | # Set title 138 | if chart_title!="": 139 | fig.update_layout( 140 | title_text=chart_title 141 | ) 142 | else: 143 | fig.update_layout( 144 | title_text="Forecasting " + y.capitalize() 145 | ) 146 | 147 | # Add annotations 148 | for col in ['train_start_date', 'train_end_date', 'test_start_date', 'test_end_date']: 149 | if col in list(df.columns) and col in ['train_end_date', 'test_end_date']: 150 | col_date = pd.to_datetime(str(df[col].unique()[0])).strftime('%Y-%m-%d') 151 | closest_date = df[col].unique()[0] 152 | x_value = pd.to_datetime(df.loc[df[date]==closest_date, date].reset_index(drop=True)[0], format='%Y-%m-%d') 153 | y_value = pd.to_numeric(df.loc[df[date]==closest_date, y].reset_index(drop=True)[0]) 154 | fig.add_annotation( 155 | x=x_value, 156 | y=y_value, 157 | text= col + ': ' + str(col_date), 158 | showarrow=True, 159 | arrowhead=1, 160 | arrowsize=1, 161 | arrowwidth=2, 162 | font = dict( 163 | color="black", 164 | size=16 165 | )) 166 | elif col in list(df.columns) and col in ['train_start_date']: 167 | col_date = pd.to_datetime(str(df[col].unique()[0])).strftime('%Y-%m-%d') 168 | closest_date = df[col].unique()[0] 169 | x_value = pd.to_datetime(df.loc[df[date]==closest_date, date].reset_index(drop=True)[0], format='%Y-%m-%d') 170 | y_value = pd.to_numeric(df.loc[df[date]==closest_date, y].reset_index(drop=True)[0]) 171 | fig.add_annotation( 172 | x=x_value, 173 | y=y_value*2, 174 | text= col + ': ' + str(col_date), 175 | showarrow=True, 176 | arrowhead=1, 177 | arrowsize=1, 178 | arrowwidth=2, 179 | font = dict( 180 | color="black", 181 | size=16 182 | )) 183 | elif col in list(df.columns) and col in ['test_start_date']: 184 | col_date = pd.to_datetime(str(df[col].unique()[0])).strftime('%Y-%m-%d') 185 | closest_date = df[col].unique()[0] 186 | x_value = pd.to_datetime(df.loc[df[date]==closest_date, date].reset_index(drop=True)[0], format='%Y-%m-%d') 187 | 
y_value = pd.to_numeric(df.loc[df[date]==closest_date, y].reset_index(drop=True)[0]) 188 | fig.add_annotation( 189 | x=x_value, 190 | y=y_value*1.5, 191 | text= col + ': ' + str(col_date), 192 | showarrow=True, 193 | arrowhead=1, 194 | arrowsize=1, 195 | arrowwidth=2, 196 | font = dict( 197 | color="black", 198 | size=16 199 | )) 200 | else: 201 | print('No annotation available for', col) 202 | 203 | # Add range slider 204 | fig.update_layout( 205 | xaxis=dict( 206 | rangeselector=dict( 207 | buttons=list([ 208 | dict(count=1, 209 | label="1m", 210 | step="month", 211 | stepmode="backward"), 212 | dict(count=3, 213 | label="3m", 214 | step="month", 215 | stepmode="backward"), 216 | dict(count=6, 217 | label="6m", 218 | step="month", 219 | stepmode="backward"), 220 | dict(count=1, 221 | label="YTD", 222 | step="year", 223 | stepmode="todate"), 224 | dict(count=1, 225 | label="1y", 226 | step="year", 227 | stepmode="backward"), 228 | dict(step="all") 229 | ]) 230 | ), 231 | rangeslider=dict( 232 | visible=True 233 | ), 234 | type="date" 235 | ) 236 | ) 237 | 238 | return fig 239 | 240 | -------------------------------------------------------------------------------- /Code/Profiling/Intermittent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/Intermittent/__init__.py -------------------------------------------------------------------------------- /Code/Profiling/Intermittent/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/Intermittent/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Profiling/Intermittent/__pycache__/intermittent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/Intermittent/__pycache__/intermittent.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Profiling/Intermittent/intermittent.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import pandas as pd 3 | import numpy as np 4 | 5 | # statistical functions 6 | from scipy.stats.mstats import winsorize 7 | 8 | class Intermittent: 9 | def cv2_by_group(df, y, grouping_var, highest=0.05, lowest=0.05): 10 | ''' Computes cv2 by group 11 | :params: df as pandas dataframe, y as string, grouping_var as list, highest and lowest as scalars 0<=x<=1 as winsorization percentages 12 | :return: a dataframe''' 13 | cv2_by_freq = df.loc[:, [grouping_var, y]].groupby(grouping_var).apply(lambda x: Intermittent.cv2(x, highest, lowest)).reset_index(level=grouping_var) 14 | cv2_by_freq.columns = [grouping_var, 'cv2_by_group'] 15 | return cv2_by_freq 16 | 17 | def cv2(array, highest=0.05, lowest=0.05): 18 | ''' Winsorization is the process of replacing the extreme values of statistical data in order to limit 19 | the effect of the outliers on the calculations or the results obtained by using that data. 20 | The mean value calculated after such replacement of the extreme values is called winsorized mean. 
21 | :params: array as numpy array, highest and lowest as scalars 0<=x<=1 as winsorization percentages 22 | :return: a scalar''' 23 | winsorized_array = winsorize(array,(highest,lowest)) 24 | cv2 = (np.std(winsorized_array)/np.mean(winsorized_array))**2 25 | return cv2 26 | 27 | def adi(array, highest=0.05, lowest=0.05): 28 | ''' Winsorization is the process of replacing the extreme values of statistical data in order to limit 29 | the effect of the outliers on the calculations or the results obtained by using that data. 30 | The mean value calculated after such replacement of the extreme values is called winsorized mean. 31 | :params: array as numpy array, highest and lowest as scalars 0<=x<=1 as winsorization percentages 32 | :return: a scalar''' 33 | winsorized_array = winsorize(array,(highest,lowest)) 34 | adi = np.mean(winsorized_array) 35 | return adi 36 | 37 | def sddi(array, highest=0.05, lowest=0.05): 38 | ''' Winsorization is the process of replacing the extreme values of statistical data in order to limit 39 | the effect of the outliers on the calculations or the results obtained by using that data. 40 | The mean value calculated after such replacement of the extreme values is called winsorized mean. 41 | :params: array as numpy array, highest and lowest as scalars 0<=x<=1 as winsorization percentages 42 | :return: a scalar''' 43 | winsorized_array = winsorize(array,(highest,lowest)) 44 | sddi = np.std(winsorized_array) 45 | return sddi 46 | 47 | def compute_indicator_values(vect, threshold, perc, quant, highest, lowest): 48 | ''' Computes indicator values 49 | :params: vect as numpy array, threshold as numeric, perc as numeric, quant as numeric, highest and lowest as scalars 0<=x<=1 as winsorization percentages 50 | :return: a dictionary 51 | ''' 52 | 53 | if isinstance(vect,(np.ndarray))==False: 54 | try: 55 | vect = np.array(vect) 56 | except: 57 | raise Exception("identify_intermittent: input vect is not numeric and could not be converted") 58 | if threshold=='': 59 | print("No threshold provided. 
Using vect[0] to compute scores with OFF threshold as percentage of threshold and excluding vect[0] from score computation for all OFF thresholds.") 60 | threshold = vect[0] 61 | vect = vect[1:len(vect)] 62 | print('Threshold:', threshold) 63 | 64 | ### Removing nan 65 | vect = vect.astype(float) 66 | vect = vect[~np.isnan(vect)] 67 | 68 | ### Create low demand list names 69 | list_low_demand = ["zero", "perc_threshold"] 70 | for ind in ["floor_perc_quant_", "perc_quant_"]: 71 | list_low_demand.append(ind + str(quant).replace('0.', '')) 72 | 73 | for LD in list_low_demand: 74 | if LD=="zero": 75 | low_demand = 0 76 | elif LD=="perc_threshold": 77 | low_demand = perc*threshold 78 | elif LD=="floor_perc_quant_"+ str(quant).replace('0.', ''): 79 | low_demand = max([0.250, 0.001*np.quantile(vect, quant)]) 80 | elif LD=="perc_quant_"+ str(quant).replace('0.', ''): 81 | low_demand = perc*np.quantile(vect, quant) 82 | 83 | nzd = vect[vect>low_demand] 84 | k = len(nzd) 85 | 86 | if (sum(vect[vect>low_demand])>=2) & (k>1): 87 | x = np.append([nzd[0]], [nzd[1:k] - nzd[0:(k-1)]]) 88 | 89 | cv2 = Intermittent.cv2(nzd, highest, lowest) 90 | adi = Intermittent.adi(x, highest, lowest) 91 | sddi = Intermittent.sddi(x, highest, lowest) 92 | else: 93 | cv2 = np.nan 94 | adi = np.nan 95 | sddi = np.nan 96 | 97 | res = pd.DataFrame.from_dict({'type': [LD], 'k': [k], 'low_demand': [low_demand], 'cv2': [cv2], 'adi': [adi], 'sddi': [sddi]}) 98 | 99 | return res 100 | 101 | def enhanced_compute_indicator_values(vect, threshold, perc, quant, highest, lowest): 102 | ''' Computes indicator values (enhanced) 103 | :params: vect as numpy array, threshold as numeric, perc as numeric, quant as numeric, highest and lowest as scalars 0<=x<=1 as winsorization percentages 104 | :return: a dictionary 105 | ''' 106 | if isinstance(vect,(np.ndarray))==False: 107 | try: 108 | vect = np.array(vect) 109 | except: 110 | raise Exception("identify_intermittent: input vect is not numeric and could not be converted") 111 | if threshold=='': 112 | print("No threshold provided.
Using vect[0] to compute scores with OFF threshold as percentage of threshold and excluding vect[0] from score computation for all OFF thresholds.") 113 | threshold = vect[0] 114 | vect = vect[1:len(vect)] 115 | print('Threshold:', threshold) 116 | 117 | ### Removing nan and selecting float 118 | vect = vect.astype(float) 119 | vect = vect[~np.isnan(vect)] 120 | 121 | ### Z function 122 | def Z(quant): 123 | cond1 = max([perc * np.quantile(vect, quant), 0.1*perc*np.quantile(vect, quant), 0.25]) 124 | cond2 = min([perc * np.quantile(vect, quant), 0.1*perc*np.quantile(vect, quant)]) 125 | if 0.25 >= cond1: 126 | return 0.25 127 | elif 0.25low_demand] 139 | k = len(nzd) 140 | 141 | if (sum(vect[vect>low_demand])>=2) & (k>1): 142 | x = np.append(np.append([nzd[0]], nzd[1:k] - nzd[0:(k-1)]), [len(vect)+1-nzd[k-1]]) 143 | 144 | cv2 = Intermittent.cv2(nzd, highest, lowest) 145 | adi = Intermittent.adi(x, highest, lowest) 146 | sddi = Intermittent.sddi(x, highest, lowest) 147 | else: 148 | cv2 = np.nan 149 | adi = np.nan 150 | sddi = np.nan 151 | 152 | res = pd.DataFrame.from_dict({'type': [LD], 'k': [k], 'low_demand': [low_demand], 'cv2': [cv2], 'adi': [adi], 'sddi': [sddi]}) 153 | 154 | return res 155 | 156 | def classify_intermittent(df, type, thres_cv2_constant, thres_cv2, thres_adi, thres_sddi, min_time_cons): 157 | ''' Classifies intermittent time series based on indicator values 158 | :params: df as pandas dataframe, type as string, thres_cv2_constant as numeric, thres_cv2 as numeric, thres_adi as numeric, thres_sddi as numeric, min_time_cons as numeric 159 | :return: a pandas dataframe 160 | ''' 161 | # Excluding the ids for which the indicators are np.nan 162 | score_no_nan = df.dropna() 163 | 164 | # Regular 165 | mask_regular = (score_no_nan.type == type) &\ 166 | (score_no_nan.k > min_time_cons) &\ 167 | (score_no_nan.cv2 >= thres_cv2_constant) &\ 168 | (score_no_nan.cv2 < thres_cv2) &\ 169 | (score_no_nan.cv2 < thres_adi) &\ 170 | (score_no_nan.cv2 < thres_sddi) 171 | df_regular = score_no_nan.loc[mask_regular, ] 172 | try: 173 | df_regular.loc[:, 'profile'] = 'regular' 174 | print('classify_intermittent: regular ids', len(df_regular)) 175 | except: 176 | print('classify_intermittent: no regular ids') 177 | 178 | # Constant at zero 179 | mask_constant_zero = (score_no_nan.type == type) &\ 180 | (score_no_nan.k <= min_time_cons) 181 | df_constant_zero = score_no_nan.loc[mask_constant_zero, ] 182 | try: 183 | df_constant_zero.loc[:, 'profile'] = 'constant_zero' 184 | print('classify_intermittent: constant_zero ids', len(df_constant_zero)) 185 | except: 186 | print('classify_intermittent: no constant_zero ids') 187 | 188 | # Constant 189 | mask_constant = (score_no_nan.type == type) &\ 190 | (score_no_nan.k > min_time_cons) &\ 191 | (score_no_nan.cv2 < thres_cv2_constant) &\ 192 | (score_no_nan.cv2 < thres_adi) &\ 193 | (score_no_nan.cv2 < thres_sddi) 194 | df_constant = score_no_nan.loc[mask_constant, ] 195 | try: 196 | df_constant.loc[:, 'profile'] = 'constant' 197 | print('classify_intermittent: constant ids', len(df_constant)) 198 | except: 199 | print('classify_intermittent: no constant ids') 200 | 201 | # Spikes 202 | mask_spikes = (score_no_nan.type == type) &\ 203 | (score_no_nan.k > min_time_cons) &\ 204 | (score_no_nan.cv2 < thres_cv2) &\ 205 | (score_no_nan.cv2 >= thres_adi) &\ 206 | (score_no_nan.cv2 < thres_sddi) 207 | df_spikes = score_no_nan.loc[mask_spikes, ] 208 | try: 209 | df_spikes.loc[:, 'profile'] = 'spikes' 210 | print('classify_intermittent: spikes ids',
len(df_spikes)) 211 | except: 212 | print('classify_intermittent: no spikes ids') 213 | 214 | # Lumpy 215 | mask_lumpy = (score_no_nan.type == type) &\ 216 | (score_no_nan.k > min_time_cons) &\ 217 | (score_no_nan.cv2 >= thres_cv2) &\ 218 | (score_no_nan.cv2 >= thres_adi) &\ 219 | (score_no_nan.cv2 < thres_sddi) 220 | df_lumpy = score_no_nan.loc[mask_lumpy, ] 221 | try: 222 | df_lumpy.loc[:, 'profile'] = 'lumpy' 223 | print('classify_intermittent: lumpy', len(df_lumpy)) 224 | except: 225 | print('classify_intermittent: no lumpy ids') 226 | 227 | # Erratic 228 | mask_erratic = (score_no_nan.type == type) &\ 229 | (score_no_nan.k > min_time_cons) &\ 230 | (score_no_nan.cv2 >= thres_cv2) &\ 231 | (score_no_nan.cv2 < thres_adi) &\ 232 | (score_no_nan.cv2 < thres_sddi) 233 | df_erratic = score_no_nan.loc[mask_erratic, ] 234 | try: 235 | df_erratic.loc[:, 'profile'] = 'erratic' 236 | print('classify_intermittent: erratic ids', len(df_erratic)) 237 | except: 238 | print('classify_intermittent: no erratic ids') 239 | 240 | # Unforecastable time 241 | mask_unforecastable_time = (score_no_nan.type == type) &\ 242 | (score_no_nan.k > min_time_cons) &\ 243 | (score_no_nan.cv2 < thres_cv2) &\ 244 | (score_no_nan.cv2 >= thres_sddi) 245 | df_unforecastable_time = score_no_nan.loc[mask_unforecastable_time, ] 246 | try: 247 | df_unforecastable_time.loc[:, 'profile'] = 'unforecastable_time' 248 | print('classify_intermittent: unforecastable_time ids', len(df_unforecastable_time)) 249 | except: 250 | print('classify_intermittent: no unforecastable_time ids') 251 | 252 | # Unforecastable quantity 253 | mask_unforecastable_quantity = (score_no_nan.type == type) &\ 254 | (score_no_nan.k > min_time_cons) &\ 255 | (score_no_nan.cv2 >= thres_cv2) &\ 256 | (score_no_nan.cv2 >= thres_sddi) 257 | df_unforecastable_quantity = score_no_nan.loc[mask_unforecastable_quantity, ] 258 | try: 259 | df_unforecastable_quantity.loc[:, 'profile'] = 'unforecastable_quantity' 260 | print('classify_intermittent: unforecastable_quantity ids', len(df_unforecastable_quantity)) 261 | except: 262 | print('classify_intermittent: no unforecastable_quantity ids') 263 | 264 | # df_profiling 265 | df_profiling = pd.concat([df_regular, df_constant_zero, df_constant, df_spikes, df_lumpy, df_erratic, df_unforecastable_time, df_unforecastable_quantity], axis=0) 266 | 267 | return df_profiling 268 | 269 | def call_intermittent_function(func, *args): 270 | from Code.Profiling.Intermittent.intermittent import Intermittent 271 | func_dict = {'enhanced_compute_indicator_values': Intermittent.enhanced_compute_indicator_values, 'compute_indicator_values': Intermittent.compute_indicator_values} 272 | result = func_dict.get(func)(*args) 273 | return result 274 | 275 | -------------------------------------------------------------------------------- /Code/Profiling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/__init__.py -------------------------------------------------------------------------------- /Code/Profiling/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Profiling/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- 
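The profiling API above is easiest to grasp with a concrete call sequence. The sketch below is illustrative only and is not part of the repository: the demo panel, the column names ('id', 'value') and every threshold value are assumptions to be replaced with real data and tuned cut-offs; only the Intermittent methods themselves come from Code/Profiling/Intermittent/intermittent.py.

# Illustrative only: demo data, column names and cut-offs are assumptions.
import numpy as np
import pandas as pd
from Code.Profiling.Intermittent.intermittent import Intermittent

# Hypothetical demand panel: two series, mostly zeros with occasional spikes.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'id': np.repeat(['site_a', 'site_b'], 200),
    'value': rng.poisson(0.3, 400) * rng.gamma(2.0, 5.0, 400),
})

# Score each series; with quant=0.5 the last low-demand label is 'perc_quant_5'.
scores = []
for i, group in df.groupby('id'):
    res = Intermittent.compute_indicator_values(
        group['value'].to_numpy(), threshold='', perc=0.1, quant=0.5,
        highest=0.05, lowest=0.05)
    res['id'] = i
    scores.append(res)
score_df = pd.concat(scores, ignore_index=True)

# Classify each series into regular / constant / spikes / lumpy / erratic /
# unforecastable profiles; the cut-offs below are placeholders to be tuned.
profiles = Intermittent.classify_intermittent(
    score_df, type='perc_quant_5', thres_cv2_constant=0.1, thres_cv2=0.49,
    thres_adi=1.32, thres_sddi=6.0, min_time_cons=10)
print(profiles)

The same scoring call can also be dispatched by name through Intermittent.call_intermittent_function('compute_indicator_values', ...), via the dispatcher defined at the end of the module.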
/Code/Regressors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Regressors/__init__.py -------------------------------------------------------------------------------- /Code/Regressors/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Regressors/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Regressors/__pycache__/regressors.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Regressors/__pycache__/regressors.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Regressors/regressors.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import pandas as pd 3 | import numpy as np 4 | import re 5 | 6 | # file management functions 7 | import os 8 | import glob 9 | import holidays as h 10 | 11 | # time management functions 12 | import datetime as dt 13 | 14 | # custom functions 15 | from Configuration.config import cfg_path 16 | from Code.Utils.utils import Utils 17 | 18 | class Regressors: 19 | def create_interactions(df, var1, var2): 20 | """ 21 | Adds interaction terms between two variables as var1*var2 to dataframe 22 | :params: dataframe, var1 and var 2 as string 23 | :return: a Pandas dataframe 24 | """ 25 | variables = df[[var1, var2]] 26 | for i in range(0, variables.columns.size): 27 | for j in range(0, variables.columns.size): 28 | col1 = variables.columns[i] 29 | col2 = variables.columns[j] 30 | if i <= j: 31 | name = col1 + "*" + col2 32 | df.loc[:, name] = variables[col1] * variables[col2] 33 | 34 | df.drop(columns = [var1 + "*" + var1], inplace=True) 35 | df.drop(columns = [var2 + "*" + var2], inplace=True) 36 | return df 37 | 38 | def create_non_linear_terms(df, var, n): 39 | """ 40 | Adds non linear terms as var^2 to dataframe 41 | :params: dataframe, var as string and n as int 42 | :return: a Pandas dataframe 43 | """ 44 | name = var + "^" + str(n) 45 | df.loc[:, name] = df.loc[:, var]**n 46 | return df 47 | 48 | def add_holidays_by_country(df, date_var, country): 49 | """ 50 | Adds holidays a dummy variable (0/1) to dataframe 51 | :params: dataframe, date_var as string, country as string 52 | :return: a Pandas dataframe 53 | """ 54 | if 'holidays' in list(df.columns): 55 | print('add_holidays_by_country: holidays column already present') 56 | else: 57 | holidays = eval("h." 
+ country.capitalize() + "()") 58 | date_holidays = df.loc[:, date_var].apply(lambda x: int(1) if x in holidays else int(0)) 59 | date_holidays = pd.DataFrame(date_holidays) 60 | date_holidays.columns = pd.Index(['holidays']) 61 | df = pd.concat([df, date_holidays], axis=1) 62 | return df 63 | 64 | def add_weekdays(df, date_var): 65 | """ 66 | Adds weekdays a dummy variables (0/1) for each weekday to dataframe 67 | :params: dataframe, date_var as string 68 | :return: a Pandas dataframe 69 | """ 70 | df.loc[:,'wd_mon'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 0 else int(0)) 71 | df.loc[:,'wd_tue'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 1 else int(0)) 72 | df.loc[:,'wd_wed'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 2 else int(0)) 73 | df.loc[:,'wd_thu'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 3 else int(0)) 74 | df.loc[:,'wd_fri'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 4 else int(0)) 75 | df.loc[:,'wd_sat'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 5 else int(0)) 76 | df.loc[:,'wd_sun'] = df.loc[:, date_var].apply(lambda x: int(1) if x.weekday() == 6 else int(0)) 77 | return df 78 | 79 | def add_months(df, date_var): 80 | """ 81 | Adds months a dummy variables (0/1) for each month to dataframe 82 | :params: dataframe, date_var as string 83 | :return: a Pandas dataframe 84 | """ 85 | for i in range(1, 13): 86 | if i < 10: 87 | varname = 'month_0' + str(i) 88 | else: 89 | varname = 'month_' + str(i) 90 | 91 | df.loc[:, varname] = df.loc[:, date_var].apply(lambda x: int(1) if x.month == i else int(0)) 92 | return df 93 | 94 | def calculate_degree_days(df, base_temperature, temperature): 95 | """ 96 | Calculate the Degree Days Heating and Cooling values 97 | :params: dataframe, base temperature to start and actual temperature as string 98 | :return: a pandas dataframe 99 | """ 100 | df['DDC_temperature'] = (df[temperature] - df[base_temperature]).clip(lower=0) 101 | df['DDH_temperature'] = (df[base_temperature] - df[temperature]).clip(lower=0) 102 | 103 | return df 104 | 105 | def merge_holidays_by_date(df, df_holidays, id): 106 | """ 107 | Merge Holiday df with the train df 108 | :params: df as dataframe, df_holidays as df containing info on holidays, id as string 109 | :return: a pandas dataframe 110 | """ 111 | date_var = Utils.find_date(df) 112 | date_var_holidays = Utils.find_date(df_holidays) 113 | 114 | cols_to_keep = list(df.columns) 115 | 116 | df['date_key'] = df[date_var].dt.year.astype(str) + df[date_var].dt.month.astype(str) + df[date_var].dt.day.astype(str) 117 | df_holidays['date_key'] = df_holidays[date_var_holidays].dt.year.astype(str) + df_holidays[date_var_holidays].dt.month.astype(str) + df_holidays[date_var_holidays].dt.day.astype(str) 118 | 119 | df.loc[:, 'holidays'] = int(0) 120 | df_merge = pd.merge(df, df_holidays, how="left", on=["date_key", id], indicator=True) 121 | df_merge.loc[df_merge._merge=='both', 'holidays'] = int(1) 122 | 123 | cols_to_keep = cols_to_keep + ['holidays'] 124 | df = df_merge[cols_to_keep].copy() 125 | 126 | return df 127 | 128 | def merge_additional_days_off(df, df_metadata, id, dict_days_off): 129 | """ 130 | Merge Site Weekend data with train df 131 | :params: df as dataframe, df_metadata as df containing additional info, id as string, dict_days_off as dictionary 132 | :return: a pandas dataframe 133 | """ 134 | date_var = Utils.find_date(df) 135 | 136 | # Sites only had weekly leaves on Friday, Saturday and 
Sunday 137 | list_days_off = list(dict_days_off.keys()) 138 | df.loc[:, 'day_off'] = int(0) 139 | for d in list_days_off: 140 | leave = (df[date_var].dt.dayofweek == dict_days_off[d]) & (df[id].isin(df_metadata[df_metadata[d]][id])) 141 | df.loc[leave==True, 'day_off'] = int(1) 142 | 143 | df['day_off'] = df['day_off'].astype("int8") 144 | 145 | return df 146 | 147 | def merge_weather(df, weather, date_var, id): 148 | """ 149 | Merge weather data into the train df 150 | :params: df as dataframe, weather as dataframe with weather info, date_var as string, id as string 151 | :return: a pandas dataframe 152 | 153 | """ 154 | 155 | date_var = Utils.find_date(df) 156 | date_var_weather = Utils.find_date(weather) 157 | 158 | # drop duplicate values in weather and pick the closest weather station 159 | weather_cleaned = weather.sort_values([date_var, id, "distance"]).groupby([date_var, id]).first().reset_index() 160 | assert weather_cleaned.groupby([date_var, id]).count().max().max() == 1 161 | 162 | df = pd.merge(df.sort_values([date_var, id]), weather_cleaned.sort_values([date_var_weather]), left_on=[date_var, id], right_on= [date_var_weather, id], how='left', validate="m:1") 163 | 164 | return df -------------------------------------------------------------------------------- /Code/Regressors/similar_day.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import numpy as np 3 | import pandas as pd 4 | import holidays as h 5 | from functools import reduce 6 | 7 | # datetime functions 8 | import dateutil 9 | import datetime 10 | from dateutil.relativedelta import relativedelta 11 | 12 | # custom functions 13 | from Code.Regressors.regressors import Regressors 14 | from Code.Utils.utils import AlphabeticalCombinations, Utils 15 | 16 | class SimilarDay: 17 | def get_similar_days_in_previous_year(dates, country): 18 | """ 19 | Retrieves the similar day for a given date. 20 | :param dates: a list-like object of dates, country as string 21 | :return: a Pandas series of similar days 22 | """ 23 | d = pd.to_datetime(pd.Series(dates)) 24 | holidays = eval("h." + country.capitalize() + "()") 25 | return d.apply(lambda x: SimilarDay.get_similar_day_in_previous_year(x, holidays)) 26 | 27 | def get_similar_days_in_previous_week(dates, country): 28 | """ 29 | Retrieves the similar day for a given date. 30 | :param dates: a list-like object of dates, country as string 31 | :return: a Pandas series of similar days 32 | """ 33 | d = pd.to_datetime(pd.Series(dates)) 34 | holidays = eval("h." + country.capitalize() + "()") 35 | return d.apply(lambda x: SimilarDay.get_similar_day_in_previous_week(x, holidays)) 36 | 37 | 38 | def get_similar_day_in_previous_year(d, holiday_calendar): 39 | """ 40 | Retrieves the similar day for a given date. If the given date is not an holiday, the similar day is the 41 | closest day of the previous year in terms of calendar position which shares the weekday. If such a date is an holiday, 42 | the same weekday of the week before is considered. 43 | If the given date is an holiday, its similar day is the closest holiday to the given date in the previous year. 
44 | :param d: a date 45 | :param holiday_calendar: a calendar from holidays package 46 | :return: the similar day 47 | """ 48 | if not d or pd.isna(d): 49 | return None 50 | 51 | new_date = d - relativedelta(years=1) 52 | holiday = holiday_calendar.get(d) 53 | diff = d.weekday() - new_date.weekday() if d.weekday() >= new_date.weekday() \ 54 | else d.weekday() - new_date.weekday() + 7 55 | 56 | if not holiday: 57 | new_date = new_date + datetime.timedelta(days=diff) 58 | while holiday_calendar.get(new_date): 59 | new_date = new_date - datetime.timedelta(days=7) 60 | # elif holiday == 'Pasqua di Resurrezione': 61 | # new_date = dateutil.easter.easter(new_date.year) 62 | # elif holiday == "Lunedì dell'Angelo": 63 | # new_date = dateutil.easter.easter(new_date.year) + datetime.timedelta(days=1) 64 | 65 | return new_date 66 | 67 | def get_similar_day_in_previous_week(d, holiday_calendar): 68 | """ 69 | Retrieves the similar day for a given date. If the given date is not an holiday, the similar day is the 70 | closest day of the previous year in terms of calendar position which shares the weekday. If such a date is an holiday, 71 | the same weekday of the week before is considered. 72 | If the given date is an holiday, its similar day is the closest holiday to the given date in the previous year. 73 | :param d: a date 74 | :param holiday_calendar: a calendar from holidays package 75 | :return: the similar day 76 | """ 77 | if not d or pd.isna(d): 78 | return None 79 | 80 | new_date = d - relativedelta(weeks=1) 81 | holiday = holiday_calendar.get(d) 82 | diff = d.weekday() - new_date.weekday() if d.weekday() >= new_date.weekday() \ 83 | else d.weekday() - new_date.weekday() + 7 84 | 85 | if not holiday: 86 | new_date = new_date + datetime.timedelta(days=diff) 87 | while holiday_calendar.get(new_date): 88 | new_date = new_date - datetime.timedelta(days=7) 89 | # elif holiday == 'Pasqua di Resurrezione': 90 | # new_date = dateutil.easter.easter(new_date.year) 91 | # elif holiday == "Lunedì dell'Angelo": 92 | # new_date = dateutil.easter.easter(new_date.year) + datetime.timedelta(days=1) 93 | 94 | return new_date 95 | 96 | class StandardConsumption: 97 | def get_standard_consumption_as_mean(df, id, date_var, var, country): 98 | """ 99 | Retrieves the standard consumption for a given date as hourly monthly mean differentiated by holiday, weekend, weekdays. 
100 | :params: dataframe and date_var as string, var as string, country as string 101 | :return: the similar day 102 | """ 103 | 104 | df = Regressors.add_holidays_by_country(df, date_var, country) 105 | df = Regressors.add_weekdays(df, date_var) 106 | df.loc[:, 'day'] = df.loc[:, date_var].dt.day 107 | df.loc[:, 'hour'] = df.loc[:, date_var].dt.hour 108 | df.loc[:, 'month'] = df.loc[:, date_var].dt.month 109 | 110 | timedelta = Utils.delta_format(abs(np.diff(df[date_var])).mean()) 111 | freq = Utils.find_freq(timedelta) 112 | 113 | if freq == 'D': 114 | freq_var='day' 115 | else: 116 | freq_var='hour' 117 | 118 | # Compute standard consumption as means 119 | mask = (~df[var].isnull()) & ((df.wd_mon==1) | (df.wd_tue==1) | (df.wd_wed==1) | (df.wd_thu==1) | (df.wd_fri==1)) & (df.holidays==0) 120 | df_mean_weekdays = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.mean).reset_index() 121 | new_var = var + '_std_weekdays' 122 | df_mean_weekdays.rename(columns={var: new_var}, inplace=True) 123 | df_mean_weekdays.loc[df_mean_weekdays[new_var]<0, new_var] = 0 124 | 125 | mask = (~df[var].isnull()) & ((df.wd_sat==1) | (df.wd_sun==1)) & (df.holidays==0) 126 | df_mean_weekend = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.mean).reset_index() 127 | new_var = var + '_std_weekend' 128 | df_mean_weekend.rename(columns={var: new_var}, inplace=True) 129 | df_mean_weekend.loc[df_mean_weekend[new_var]<0, new_var] = 0 130 | 131 | mask = (~df[var].isnull()) & (df.holidays==1) 132 | df_mean_holidays = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.mean).reset_index() 133 | new_var = var + '_std_holidays' 134 | df_mean_holidays.rename(columns={var: new_var}, inplace=True) 135 | df_mean_holidays.loc[df_mean_holidays[new_var]<0, new_var] = 0 136 | 137 | # Merging 138 | dfs = [df_mean_holidays, df_mean_weekdays, df_mean_weekend] 139 | df_mean = reduce(lambda left,right: pd.merge(left,right,how='outer', on=[id, 'month', freq_var], validate='1:1'), dfs) 140 | df = pd.merge(df, df_mean, how='left', on=[id, 'month', freq_var], validate='m:1') 141 | 142 | return df 143 | 144 | 145 | def get_minimum_consumption(df, date_var, var, country): 146 | """ 147 | Retrieves the minimum consumption for a given date as hourly monthly minimum value differentiated by holiday, weekend, night. 
148 | :params: dataframe and date_var as string, var as string, country as string 149 | :return: the similar day 150 | """ 151 | 152 | df = Regressors.add_holidays_by_country(df, date_var, country) 153 | df = Regressors.add_weekdays(df, date_var) 154 | df.loc[:, 'day'] = df.loc[:, date_var].dt.day 155 | df.loc[:, 'hour'] = df.loc[:, date_var].dt.hour 156 | df.loc[:, 'month'] = df.loc[:, date_var].dt.month 157 | 158 | timedelta = Utils.delta_format(abs(np.diff(df[date_var])).mean()) 159 | freq = Utils.find_freq(timedelta) 160 | 161 | if freq == 'D': 162 | freq_var='day' 163 | else: 164 | freq_var='hour' 165 | 166 | # Compute min consumption 167 | mask = (~df[var].isnull()) & (df.holidays==0) & ((df.wd_mon==1) | (df.wd_tue==1) | (df.wd_wed==1) | (df.wd_thu==1) | (df.wd_fri==1)) 168 | df_min_weekdays = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.min).reset_index() 169 | new_var = var + '_min_weekdays' 170 | df_min_weekdays.rename(columns={var: new_var}, inplace=True) 171 | df_min_weekdays.loc[df_min_weekdays[new_var]<0, new_var] = 0 172 | 173 | mask = (~df[var].isnull()) & ((df.wd_sat==1) | (df.wd_sun==1)) & (df.holidays==0) 174 | df_min_weekend = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.min).reset_index() 175 | new_var = var + '_min_weekend' 176 | df_min_weekend.rename(columns={var: new_var}, inplace=True) 177 | df_min_weekend.loc[df_min_weekend[new_var]<0, new_var] = 0 178 | 179 | mask = (~df[var].isnull()) & (df.holidays==1) 180 | df_min_holidays = pd.pivot_table(df.loc[mask==True, ], index=[id, 'month', freq_var], values=var, aggfunc=np.min).reset_index() 181 | new_var = var + '_min_holidays' 182 | df_min_holidays.rename(columns={var: new_var}, inplace=True) 183 | df_min_holidays.loc[df_min_holidays[new_var]<0, new_var] = 0 184 | 185 | # Merging 186 | dfs = [df_min_holidays, df_min_weekdays, df_min_weekend] 187 | df_min = reduce(lambda left,right: pd.merge(left,right,how='outer', on=[id, 'month', freq_var], validate='1:1'), dfs) 188 | df = pd.merge(df, df_min, how='left', on=[id, 'month', freq_var], validate='m:1') 189 | 190 | return df 191 | 192 | 193 | -------------------------------------------------------------------------------- /Code/Regressors/temperatures.py: -------------------------------------------------------------------------------- 1 | # selenium for web driving 2 | from logging import raiseExceptions 3 | from selenium import webdriver 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.webdriver import ActionChains 8 | from selenium.webdriver.common.keys import Keys 9 | from selenium.webdriver.chrome.options import Options 10 | 11 | # time for pausing between navigation 12 | import time 13 | import glob 14 | import shutil 15 | 16 | # datetime functions 17 | import datetime as dt 18 | 19 | # file management functions 20 | import os 21 | import configparser 22 | import ctypes 23 | 24 | # data elaboration functions 25 | import pandas as pd 26 | import numpy as np 27 | from openpyxl import load_workbook 28 | from functools import reduce 29 | 30 | # custom functions 31 | from Code.Utils.utils import Utils, AlphabeticalCombinations 32 | 33 | class Temperatures: 34 | 35 | def ten_year(df, id, date_var, freq, temperature_list, start_date, end_date): 36 | """ 37 | Computes ten year temperatures and asis temperatures 38 | :params: dataframe 39 | 
:return: a Pandas dataframe, a .pkl file and a .xlsx file 40 | """ 41 | ten_year_list = [] 42 | ten_year_overall_list = [] 43 | for t in temperature_list: 44 | ten_year_list = ten_year_list + [t + '_ten_year'] 45 | ten_year_overall_list = ten_year_overall_list + [t + '_ten_year_overall'] 46 | 47 | df_seq = Utils.add_seq(df, date_var = date_var, serie=id, freq = freq, start_date=start_date, end_date=end_date) 48 | df_seq.loc[:, 'months_days'] = df_seq.loc[:, date_var].dt.strftime('%m/%d') 49 | 50 | # Defining averages by id 51 | df_to_merge = pd.pivot_table(df_seq, values=temperature_list, index=[id, 'months_days'], aggfunc=np.mean).reset_index() 52 | col_list = [id, 'months_days'] + ten_year_list 53 | df_to_merge.columns = col_list 54 | 55 | # Defining overall averages 56 | df_to_merge_overall = pd.pivot_table(df_seq, values=temperature_list, index=['months_days'], aggfunc=np.mean).reset_index() 57 | col_list_overall = ['months_days'] + ten_year_overall_list 58 | df_to_merge_overall.columns = col_list_overall 59 | 60 | # Merging 61 | df_merge = pd.merge(df_seq, df_to_merge, on=[id, 'months_days'], how='left', validate='m:1') 62 | df_merge_overall = pd.merge(df_merge, df_to_merge_overall, on=['months_days'], how='left', validate='m:1') 63 | 64 | ### Creating As-Is temperatures: where available use actual temp, if not use ten year 65 | for t in temperature_list: 66 | asis_name = t + '_asis' 67 | ten_year_name = t + '_ten_year' 68 | ten_year_overall_name = t + '_ten_year_overall' 69 | df_merge_overall.loc[:, asis_name] = df_merge_overall.loc[:, t] 70 | df_merge_overall.loc[df_merge_overall[asis_name].isnull(), asis_name] = df_merge_overall.loc[:, ten_year_name] 71 | df_merge_overall.loc[df_merge_overall[asis_name].isnull(), asis_name] = df_merge_overall.loc[:, ten_year_overall_name] 72 | 73 | if (any(df_merge_overall[asis_name].isnull())): 74 | print('ten_year: asis temperatures still CONTAIN nan value: removing') 75 | df_merge_overall = df_merge_overall.loc[df_merge_overall[asis_name].isnull()==False, ] 76 | else: 77 | print('ten_year: asis temperatures do NOT contain any nan value') 78 | 79 | df_ten_year = df_merge_overall.loc[:, ['site_id', 'timestamp', 'temperature', 'distance', 'months_days', 80 | 'temperature_ten_year', 'temperature_asis']] 81 | 82 | return df_ten_year 83 | 84 | 85 | -------------------------------------------------------------------------------- /Code/Scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__init__.py -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/forecast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/forecast.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/kpi.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/kpi.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/scoring.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/scoring.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/train.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/train.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/__pycache__/train_test.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Scoring/__pycache__/train_test.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Scoring/forecast.py: -------------------------------------------------------------------------------- 1 | # file management functions 2 | import os 3 | import glob 4 | from pyexpat.errors import XML_ERROR_UNEXPECTED_STATE 5 | 6 | # data elaboration functions 7 | import numpy as np 8 | import pandas as pd 9 | from openpyxl import load_workbook 10 | import re 11 | import pickle 12 | 13 | # datetime functions 14 | import datetime as dt 15 | 16 | # AI functions 17 | import xgboost as xgb 18 | from sklearn.linear_model import LinearRegression 19 | from sklearn.metrics import mean_squared_error 20 | from sklearn.model_selection import train_test_split 21 | 22 | # custom functions 23 | from Code.Utils.utils import Utils 24 | 25 | class Forecasting: 26 | def forecast(dict_test, trained_model): 27 | """ 28 | Generate forecast 29 | :params: dict_test as dictionary, trained_model as dictionary from training 30 | :return: a dictionary 31 | """ 32 | X_test = dict_test['X_test'] 33 | date_array_test = dict_test["date_array"] 34 | list_id = dict_test['list_id'] 35 | date = Utils.find_date(dict_test['y_tilda']) 36 | 37 | # Regressors list 38 | regressors_list = sorted(list(set(list(X_test.columns)) - set(list_id))) 39 | 40 | # Forecasting 41 | print('Forecasting') 42 | 43 | y_test = X_test.loc[:, regressors_list].copy() 44 | y_hat = trained_model.predict(y_test) 45 | 46 | ### Adjusting negative values 47 | y_hat_series_pos = y_hat.copy() 48 | y_hat_series_pos[y_hat_series_pos < 0] = 0 49 | 50 | forecasted_model = {'df_fcst': pd.DataFrame({date: date_array_test, 'fcst': y_hat_series_pos})} 51 | 52 | print('Forecasting completed') 53 | return forecasted_model 54 | 55 | -------------------------------------------------------------------------------- /Code/Scoring/kpi.py: -------------------------------------------------------------------------------- 1 | 2 | # data elaboration functions 3 | from attr import validate 4 | import pandas as pd 5 | from six.moves import collections_abc 6 | import string 7 | import numpy as np 8 | import math 9 | 10 | # datetime functions 11 | import datetime as dt 12 | 13 | # file 
management functions 14 | import os 15 | import sys 16 | import opendatasets as od 17 | import pickle 18 | from pathlib import Path 19 | 20 | # data science functions 21 | from sklearn.metrics import mean_absolute_error 22 | 23 | # custom functions 24 | from Code.Utils.utils import Utils 25 | from Code.Scoring.train import Training 26 | from Code.Scoring.forecast import Forecasting 27 | 28 | class Kpi: 29 | def find_mae(y, dict_train, dict_test, dict_models): 30 | """ 31 | Compute mean absolute error 32 | :params: y as string, dict_train as dictionary, dict_test as dictionary, dict_models as dictionary 33 | :return: a dictionary 34 | """ 35 | 36 | dict_test_no_nan = dict_test.copy() 37 | dict_test_no_nan['X_test'] = dict_test['X_test'].dropna() 38 | dict_test_no_nan['y_tilda'] = dict_test['y_tilda'].dropna() 39 | 40 | date_var_y_tilda = Utils.find_date(dict_test_no_nan['y_tilda']) 41 | dict_test_no_nan['date_array'] = dict_test_no_nan['y_tilda'].loc[:, date_var_y_tilda] 42 | 43 | # Training and forecasting 44 | dict_kpi = {} 45 | for m in list(dict_models.keys()): 46 | print('kpi for model', m) 47 | try: 48 | model = dict_models[m] 49 | trained_model = Training.train(dict_train, model) 50 | forecasted_model = Forecasting.forecast(dict_test, trained_model = trained_model) 51 | y_tilda = dict_test['y_tilda'].copy() 52 | y_tilda_date = Utils.find_date(y_tilda) 53 | y_hat = forecasted_model['df_fcst'].copy() 54 | y_hat_date = Utils.find_date(y_hat) 55 | 56 | df_merge = pd.merge(y_tilda, y_hat, left_on=y_tilda_date, right_on=y_hat_date, how='inner', validate='1:1').dropna() 57 | mae = mean_absolute_error(df_merge[y], df_merge['fcst']) 58 | dict_kpi[m] = mae 59 | except: 60 | print('kpi for model', m, 'could not be computed') 61 | 62 | return dict_kpi 63 | 64 | def compute_error(df, fcst, y): 65 | """ 66 | Compute error as forecast-actual 67 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 68 | :return: a dataframe 69 | """ 70 | if 'error' in df.columns: 71 | df = df.drop(columns='error') 72 | 73 | df.loc[:, 'error'] = (df[fcst] - df[y]) 74 | return df 75 | 76 | def compute_absolute_error(df, fcst, y): 77 | """ 78 | Compute absolute error as abs(forecast-actual) 79 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 80 | :return: a dataframe 81 | """ 82 | if 'absolute_error' in df.columns: 83 | df = df.drop(columns='absolute_error') 84 | 85 | df.loc[:, 'absolute_error'] = abs(df[fcst] - df[y]) 86 | return df 87 | 88 | def compute_absolute_percentage_error(df, fcst, y): 89 | """ 90 | Compute absolute % error 91 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 92 | :return: a dataframe 93 | """ 94 | if 'absolute_error' in df.columns: 95 | df = df.drop(columns='absolute_error') 96 | 97 | if 'absolute_percentage_error' in df.columns: 98 | df = df.drop(columns='absolute_percentage_error') 99 | 100 | df = Kpi.compute_absolute_error(df, fcst, y) 101 | df.loc[:, 'absolute_percentage_error'] = df.loc[:, 'absolute_error']/df.loc[:, y] 102 | return df 103 | 104 | def compute_mean_error(df, fcst, y): 105 | """ 106 | Compute mean error 107 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 108 | :return: a scalar 109 | """ 110 | df = Kpi.compute_error(df, fcst, y) 111 | 
mean_error = df.loc[:, 'error'].mean() 112 | return mean_error 113 | 114 | def compute_mae(df, fcst, y): 115 | """ 116 | Compute mean absolute error 117 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 118 | :return: a scalar 119 | """ 120 | df = Kpi.compute_absolute_error(df, fcst, y) 121 | var = 'absolute_error' 122 | mask = (df[var].isnull()==False) & (np.isneginf(df[var])==False) & (np.isposinf(df[var])==False) 123 | mae = df.loc[mask==True, var].mean() 124 | return mae 125 | 126 | def compute_mape(df, fcst, y): 127 | """ 128 | Compute mean absolute % error 129 | :params: df as pandas dataframe, fcst as string as the name of the forecast columns, y as string as the name of the actual columns, 130 | :return: a scalar 131 | """ 132 | df = Kpi.compute_absolute_percentage_error(df, fcst, y) 133 | var = 'absolute_percentage_error' 134 | mask = (df[var].isnull()==False) & (np.isneginf(df[var])==False) & (np.isposinf(df[var])==False) 135 | mape = df.loc[mask==True, var].mean() 136 | return mape 137 | -------------------------------------------------------------------------------- /Code/Scoring/scoring.py: -------------------------------------------------------------------------------- 1 | 2 | # data elaboration functions 3 | import pandas as pd 4 | from six.moves import collections_abc 5 | import string 6 | import numpy as np 7 | 8 | # datetime functions 9 | import datetime as dt 10 | 11 | # file management functions 12 | import os 13 | import sys 14 | import opendatasets as od 15 | import pickle 16 | from pathlib import Path 17 | 18 | # data science functions 19 | # custom functions 20 | from Code.Utils.utils import Utils 21 | from Code.Scoring.kpi import Kpi 22 | 23 | class Scoring: 24 | def find_best_algorithm(y, dict_train, dict_test, dict_algorithms, out_of_sample): 25 | """ 26 | Finds the best performing algorithm in terms of min mean absolute error 27 | :params: y as string, dict_train as dictionary, dict_test as dictionary, dict_algorithm as dictionary, out_of_sample as string 28 | :return: a string 29 | """ 30 | try: 31 | dict_kpi = Kpi.find_mae(y, dict_train, dict_test, dict_algorithms) 32 | # Best model 33 | df_best_model = pd.DataFrame.from_dict(dict_kpi, orient='index').reset_index() 34 | df_best_model.rename(columns={'index': 'model', 0: 'mae'}, inplace=True) 35 | best_model = df_best_model.loc[df_best_model.mae==df_best_model.mae.min(), 'model'].reset_index(drop=True)[0] 36 | except: 37 | print('best model could not be computed, no KPI available, using out of sample algorithm. 
Check to have an overlap between training and test sets dates!') 38 | best_model = out_of_sample 39 | return best_model 40 | 41 | def stats_per_site(df, id, date_var): 42 | """ 43 | Helper function to identify amount of data per site 44 | :params: df as pandas dataframe, id as string, date_var as string 45 | :return: a pandas dataframe 46 | """ 47 | return pd.DataFrame( 48 | [{ 49 | id: site, 50 | "Years": df.loc[(df[id] == site), date_var].dt.year.unique(), 51 | "Max Timestamp": df.loc[(df[id] == site), date_var].max(), 52 | "Min Timestamp": df.loc[(df[id] == site), date_var].min(), 53 | "Samples": df[(df[id] == site)].count().sum() 54 | } for site in df[id].unique()] 55 | ).sort_values("Samples", ascending=False) 56 | 57 | def resample_train_data(df, date_var, id, predict_col, sampling="D"): 58 | """ 59 | Resample the data to a particular frequency 60 | :params: df as pandas dataframe, date_var as string, id as string, sampling as string of frequency 61 | """ 62 | try: 63 | df_resampled = df.groupby(id) \ 64 | .apply(lambda group: group.set_index(date_var).resample(sampling).interpolate(method="time")) \ 65 | .reset_index(level=1) \ 66 | .reset_index(drop=True) \ 67 | .dropna(subset=[predict_col]) 68 | except: 69 | print('resample_train_data: data are already at', sampling, 'frequency') 70 | df_resampled = df.copy() 71 | 72 | return df_resampled 73 | 74 | 75 | -------------------------------------------------------------------------------- /Code/Scoring/train.py: -------------------------------------------------------------------------------- 1 | # file management functions 2 | import os 3 | import glob 4 | 5 | # data elaboration functions 6 | import numpy as np 7 | import pandas as pd 8 | from openpyxl import load_workbook 9 | import re 10 | import pickle 11 | 12 | # datetime functions 13 | import datetime as dt 14 | 15 | # AI functions 16 | import xgboost as xgb 17 | from sklearn.linear_model import LinearRegression 18 | from sklearn.metrics import mean_absolute_error 19 | 20 | # custom functions 21 | from Code.Utils.utils import Utils 22 | 23 | class Training: 24 | def train(dict_model_to_train, model): 25 | """ 26 | Generate train 27 | :params: dict_model_to_train as dictionary, model as string 28 | :return: a pandas dictionary 29 | """ 30 | y = dict_model_to_train['y'] 31 | X_train = dict_model_to_train['X_train'] 32 | Y_train = dict_model_to_train['Y_train'] 33 | list_id = dict_model_to_train['list_id'] 34 | regressors_list = sorted(list(set(list(X_train.columns)) - set(list_id))) 35 | 36 | # Training 37 | print('Training') 38 | 39 | X = X_train.loc[:, sorted(regressors_list)].copy().reset_index(drop=True) 40 | Y = Y_train.loc[:, y].copy().reset_index(drop=True) 41 | 42 | trained_model = model.fit(X,Y) 43 | 44 | print('Training completed') 45 | return trained_model 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /Code/Scoring/train_test.py: -------------------------------------------------------------------------------- 1 | # file management functions 2 | import os 3 | import glob 4 | 5 | # data elaboration functions 6 | import numpy as np 7 | import pandas as pd 8 | from openpyxl import load_workbook 9 | 10 | # datetime functions 11 | import datetime as dt 12 | 13 | # custom functions 14 | from Code.Utils.utils import Utils 15 | 16 | 17 | class TrainTest: 18 | def define_train_test_set_dates(df, y, train_start_date, train_end_date, test_start_date, test_end_date, test_size=0.33): 19 | """ 20 | Defines train and test dates 
if left blank 21 | :params: df as pandas dataframe, y as string, train_start_date as string in format '%Y-%m-%d', train_end_date as string in format '%Y-%m-%d', test_start_date as string in format '%Y-%m-%d', test_end_date as string in format '%Y-%m-%d', test_size as percentage 22 | :return: a dictionary 23 | """ 24 | date_var = Utils.find_date(df) 25 | min_train_start_date = df.loc[(df[y].isnull()==False), date_var].min() 26 | max_train_end_date = df.loc[(df[y].isnull()==False), date_var].max() 27 | min_test_start_date = df.loc[(df[y].isnull()==True), date_var].min() 28 | max_test_end_date = df.loc[(df[y].isnull()==True), date_var].max() 29 | range = pd.date_range(start=min_train_start_date,end=max_train_end_date) 30 | 31 | # Test set: identify latest date and set test set as latest date - test size offset 32 | if test_end_date=='': 33 | test_end_date = max_test_end_date 34 | else: 35 | test_end_date = pd.to_datetime(test_end_date, format='%Y-%m-%d') 36 | 37 | if test_start_date=='': 38 | offset_date = pd.to_datetime(max_train_end_date, format='%Y-%m-%d') - pd.DateOffset(n = round(len(range)*test_size, 0) ) 39 | test_start_date = offset_date 40 | else: 41 | test_start_date = pd.to_datetime(test_start_date, format='%Y-%m-%d') 42 | 43 | # Train set: set train set from test start date -1 to test to minimum date available 44 | if train_start_date=='': 45 | train_start_date = min_train_start_date 46 | else: 47 | train_start_date = pd.to_datetime(train_start_date, format='%Y-%m-%d') 48 | 49 | if train_end_date=='': 50 | train_end_date = test_start_date - pd.DateOffset(n = 1) 51 | else: 52 | train_end_date = pd.to_datetime(train_end_date, format='%Y-%m-%d') 53 | 54 | dict_train_test_set = {'train_start_date': train_start_date, 'train_end_date': train_end_date, 'test_start_date':test_start_date, 'test_end_date': test_end_date} 55 | return dict_train_test_set 56 | 57 | def def_train(df, y, list_id, train_start_date='', train_end_date=''): 58 | """ 59 | Define train dataset 60 | :params: dataset as dataframe, y as string, list_id as list, train_start_date as string, train_end_date as string 61 | :return: a Pandas dataframe 62 | """ 63 | date_var = Utils.find_date(df) 64 | df.loc[:, date_var] = df.loc[:, date_var].apply(lambda x: pd.to_datetime(dt.datetime.strftime(x, '%Y-%m-%d'), dayfirst=True)) 65 | 66 | if train_start_date == '': 67 | train_start_date = min(df.loc[df[y].notnull(), date_var]) 68 | elif (train_start_date != '') & (isinstance(train_start_date, str)): 69 | train_start_date = pd.to_datetime(train_start_date, dayfirst=True) 70 | else: 71 | print('Train start date is already a date') 72 | 73 | print('Train start date is', train_start_date) 74 | 75 | if train_end_date == '': 76 | train_end_date = max(df.loc[df[y].notnull(), date_var]) 77 | elif (train_end_date != '') & (isinstance(train_end_date, str)): 78 | train_end_date = pd.to_datetime(train_end_date, dayfirst=True) 79 | else: 80 | print('Train end date is already a date') 81 | 82 | print('Train end date is', train_end_date) 83 | 84 | ### Slicing by observation 85 | df_sliced = df.loc[(~df.loc[:, y].isnull()) & (df.loc[:, date_var]>=train_start_date) & (df.loc[:, date_var]<=train_end_date), ].reset_index(drop=True) 86 | print('Train shape before removing nan is', df_sliced.shape[0]) 87 | 88 | # Removing additional nan 89 | train = df_sliced[df_sliced.isnull()==False].sort_values(by=date_var).reset_index(drop=True) 90 | train_start_date = min(df_sliced.loc[:, date_var]) 91 | print('Min date AFTER removing nan is', train_start_date) 92 
| train_end_date = max(df_sliced.loc[:, date_var]) 93 | print('Max date AFTER removing nan is', train_end_date) 94 | print('Shape AFTER removing nan is', df_sliced.shape[0]) 95 | 96 | ### Slicing by feature 97 | # Features set 98 | train_features = sorted(list(set(list(train.columns)) - set(list_id + [y]))) 99 | y_plus_train_features = [y] + train_features 100 | 101 | # X_train and Y_train 102 | X_train = train.loc[:, train_features].reset_index(drop=True) 103 | Y_train = train.loc[:, y_plus_train_features].reset_index(drop=True) 104 | 105 | # Date array 106 | date_array = train.loc[:, date_var].reset_index(drop=True) 107 | 108 | # Historical data 109 | historical_data = df.loc[df[date_var]>=min(df.loc[df[y].notnull(), date_var]), [date_var, y]].reset_index(drop=True) 110 | 111 | ### Create final dict 112 | dict_train = {'X_train': X_train, 'Y_train': Y_train, 'date_array': date_array, 'y': y, 'list_id': list_id, 'train_start_date': train_start_date, 'train_end_date': train_end_date, 'historical_data': historical_data} 113 | 114 | return dict_train 115 | 116 | def def_test(df, y, list_id, test_start_date='', test_end_date=''): 117 | """ 118 | Define test dataset 119 | :params: dataset as dataframe, y as string, list_id as list, test_start_date as string, test_end_date as string 120 | :return: a Pandas dictionary 121 | """ 122 | date_var = Utils.find_date(df) 123 | df.loc[:, date_var] = df.loc[:, date_var].apply(lambda x: pd.to_datetime(dt.datetime.strftime(x, '%Y-%m-%d'), dayfirst=True)) 124 | if test_start_date == '': 125 | test_start_date = min(df.loc[df[y].isnull()==False, date_var]) + dt.timedelta(1) 126 | else: 127 | test_start_date = pd.to_datetime(test_start_date, dayfirst=True) 128 | print('Test start date is', test_start_date) 129 | 130 | if test_end_date == '': 131 | test_end_date = df.loc[(df[y].isnull()==True), date_var].max() 132 | else: 133 | test_end_date = pd.to_datetime(test_end_date, dayfirst=True) 134 | print('Test end date is', test_end_date) 135 | 136 | ### Slicing by observation 137 | df_sliced = df.loc[(df[date_var]>= test_start_date) & (df[date_var] <= test_end_date), ].reset_index(drop=True) 138 | test = df_sliced.sort_values(by=date_var) 139 | test_start_date = min(df_sliced.loc[:, date_var]) 140 | test_end_date = max(df_sliced.loc[:, date_var]) 141 | 142 | ### Slicing by feature 143 | # Features set 144 | test_features = sorted(list(set(list(test.columns)) - set(list_id + [y]))) 145 | y_plus_date = [date_var] + [y] 146 | 147 | # X_train, y_tilda 148 | X_test = test.loc[:, test_features].copy().reset_index(drop=True) 149 | y_tilda = test.loc[:, y_plus_date].copy().reset_index(drop=True) 150 | 151 | # Date array 152 | date_array = test.loc[:, date_var].copy().reset_index(drop=True) 153 | 154 | # Historical data 155 | historical_data = df.loc[:, [date_var, y]].reset_index(drop=True) 156 | 157 | dict_test = {'X_test': X_test, 'y_tilda' : y_tilda, 'date_array': date_array, 'y': y, 'list_id': list_id, 'test_start_date': test_start_date, 'test_end_date': test_end_date, 'historical_data': historical_data} 158 | 159 | return dict_test 160 | 161 | -------------------------------------------------------------------------------- /Code/Utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Utils/__init__.py -------------------------------------------------------------------------------- 
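The scoring modules shown above (train_test.py, train.py, scoring.py) are designed to be chained together. The snippet below is a minimal usage sketch based only on the signatures visible in those files; the dataframe, the column names ('timestamp', 'site_id', 'temp', 'value'), the pickle path and the choice of candidate estimators are illustrative assumptions taken from the example notebooks, not a prescribed workflow, and any sklearn-style estimator exposing fit() should be accepted by Training.train.

# Hypothetical end-to-end sketch: assumes df has a datetime column 'timestamp',
# an id column 'site_id', a numeric regressor 'temp' and a target 'value' that
# is NaN over the horizon to be forecast.
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LinearRegression

from Code.Scoring.train_test import TrainTest
from Code.Scoring.train import Training
from Code.Scoring.scoring import Scoring

df = pd.read_pickle('df_final.pkl')  # illustrative path; any panel dataframe with the columns above
y = 'value'
list_id = ['site_id', 'timestamp']   # mirrors the notebooks; these columns are dropped from the regressors

# Build the train and test dictionaries; empty date strings let the helpers
# infer the boundaries from where the target is (non-)null.
dict_train = TrainTest.def_train(df, y, list_id, train_start_date='', train_end_date='')
dict_test = TrainTest.def_test(df, y, list_id, test_start_date='', test_end_date='')

# Candidate models keyed by name (assumed convention: name -> unfitted estimator).
dict_algorithms = {'xgboost': xgb.XGBRegressor(), 'linear': LinearRegression()}

# Pick the model with the lowest MAE; fall back to 'xgboost' if no KPI can be computed.
best_model = Scoring.find_best_algorithm(y, dict_train, dict_test, dict_algorithms, out_of_sample='xgboost')

# Fit the chosen estimator on the training slice.
trained_model = Training.train(dict_train, dict_algorithms[best_model])

Note that Training.train fits on whatever columns remain in X_train after removing list_id, which is why the date column is included in list_id in this sketch: it keeps non-numeric columns out of model.fit.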
/Code/Utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Utils/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/Utils/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /Code/Utils/utils.py: -------------------------------------------------------------------------------- 1 | # data elaboration functions 2 | import pandas as pd 3 | import string 4 | import numpy as np 5 | import re 6 | from functools import reduce 7 | from pandasql import sqldf 8 | 9 | # datetime functions 10 | import datetime as dt 11 | 12 | # file management functions 13 | import os 14 | import sys 15 | import opendatasets as od 16 | import pickle 17 | from pathlib import Path 18 | 19 | from sklearn.utils import column_or_1d 20 | 21 | class Utils: 22 | def camel_to_snake(name): 23 | """ 24 | Changes string from camel case to snake case 25 | :params: a string 26 | :return: a string 27 | """ 28 | list_words = re.findall('([A-Z][a-z]*)', name) 29 | 30 | if len(list_words)>1: 31 | new_name = list_words[0].lower() 32 | for w in range(1, len(list_words)): 33 | new_name = new_name + '_' + list_words[w].lower() 34 | else: 35 | new_name = name.lower() 36 | return new_name 37 | 38 | def columns_camel_to_snake(df): 39 | """ 40 | Changes dataframe columns from camel case to snake case 41 | :params: df as dataframe 42 | :return: a pandas dataframe 43 | """ 44 | list_cols = list(df.columns) 45 | for name in list_cols: 46 | new_name = Utils.camel_to_snake(name) 47 | df.rename(columns = {name: new_name}, inplace=True) 48 | return df 49 | 50 | def find_date(df): 51 | """ 52 | Finds date columns in a dataframe 53 | :params: df as dataframe 54 | :return: a string 55 | """ 56 | dates = list(df.select_dtypes(include=['datetime','datetime64[ns, UTC]']).drop_duplicates().columns) 57 | 58 | if len(dates)==1: 59 | print('find_date, date_col found:', dates) 60 | date_col = dates[0] 61 | elif len(dates)==0: 62 | dates = list(df.select_dtypes(include=['period[M]']).drop_duplicates().columns) 63 | print('find_date, date_col found:', dates) 64 | date_col = dates[0] 65 | else: 66 | date_col = dates.copy() 67 | 68 | if (len(date_col)==0): 69 | raise Exception('find_date, no date_col found') 70 | 71 | return date_col 72 | 73 | def find_match_in_list(list_to_match, match_to_find): 74 | """ 75 | Finds a match in a list given a list of possible words to match 76 | :params: list to match as a list, match_to_find as a list of words to match 77 | :return: a list 78 | """ 79 | 80 | list_to_match = list(dict.fromkeys(list_to_match)) 81 | match_list = list() 82 | for m in match_to_find: 83 | match_list.extend([el for el in list_to_match if isinstance(el, collections_abc.Iterable) and (m in el)]) 84 | 85 | match_list = list(dict.fromkeys(match_list)) 86 | return match_list 87 | 88 | def delta_format(delta: np.timedelta64) -> str: 89 | """ 90 | Identifies frequency in numpy timedelta 91 | :params: numpy timedelta 92 | :return: a string 93 | """ 94 | try: 95 | days = delta.astype("timedelta64[D]") / np.timedelta64(1, 'D') 
96 | hours = int(delta.astype("timedelta64[h]") / np.timedelta64(1, 'h') % 24) 97 | except: 98 | days = delta / np.timedelta64(1, 'D') 99 | hours = int(delta / np.timedelta64(1, 'h') % 24) 100 | 101 | if days > 0 and hours > 0: 102 | return f"{days:.0f} d, {hours:.0f} h" 103 | elif days > 0: 104 | return f"{days:.0f} d" 105 | else: 106 | return f"{hours:.0f} h" 107 | 108 | def find_freq(timedelta): 109 | """ 110 | Finds frequency in numpy timedelta 111 | :params: numpy timedelta 112 | :return: a string 113 | """ 114 | if ('d' in timedelta): 115 | return 'D' 116 | elif ('h' in timedelta) & ('d' not in timedelta): 117 | return 'H' 118 | else: 119 | print('find_freq: could not infer frequency') 120 | 121 | def find_freq_in_dataframe(df, date_var): 122 | """ 123 | Finds frequency in pandas dataframe 124 | :params: df as pandas dataframe, date_var as string 125 | :return: a string 126 | """ 127 | freq = pd.Series(df[date_var].unique()).dt.freq 128 | return freq 129 | 130 | def get_project_root(Path): 131 | """ 132 | Finds the parent folder of the parent folder 133 | :params: Path 134 | :return: Path 135 | """ 136 | return Path(__file__).parent.parent 137 | 138 | def create_folder_tree(folder_name): 139 | try: 140 | os.makedirs(os.path.join(folder_name)) 141 | except OSError: 142 | print("Creation of the directory failed or already present", folder_name) 143 | else: 144 | print("Successfully created the directory", folder_name) 145 | return 146 | 147 | def add_daily_date(df): 148 | """ 149 | Adds a date variable at daily frequency to dataframe 150 | :params: pandas dataframe 151 | :return: pandas dataframe 152 | """ 153 | 154 | date_var = Utils.find_date(df) 155 | delta = abs(np.diff(df[date_var])).mean() 156 | timedelta = Utils.delta_format(delta) 157 | freq = Utils.find_freq(timedelta) 158 | 159 | # Creating date_daily 160 | 161 | if (freq == 'H'): 162 | if isinstance(date_var,list)==False: 163 | new_var_hour_str = date_var + '_hour_str' 164 | new_var = date_var + '_daily' 165 | df.loc[:, new_var_hour_str] = df.loc[:, date_var].dt.strftime('%Y-%m-%d %H:%M:%S') 166 | df.loc[:, new_var] = pd.to_datetime(df.date_hour_str.apply(lambda x: x.split(' ')[0]), format = '%Y-%m-%d') 167 | df.drop(columns=new_var_hour_str, inplace=True) 168 | else: 169 | for d in date_var: 170 | new_var_hour_str = d + '_hour_str' 171 | new_var = d + '_daily' 172 | df.loc[:, new_var_hour_str] = df.loc[:, d].dt.strftime('%Y-%m-%d %H:%M:%S') 173 | df.loc[:, new_var] = pd.to_datetime(df.date_hour_str.apply(lambda x: x.split(' ')[0]), format = '%Y-%m-%d') 174 | df.drop(columns=new_var_hour_str, inplace=True) 175 | elif (freq == 'D'): 176 | if (isinstance(date_var,list)==False): 177 | new_var = date_var + '_daily' 178 | if (new_var not in list(df.columns)): 179 | df.rename(columns = {date_var: date_var + '_daily'}, inplace=True) 180 | else: 181 | print('add_daily_date: data are in daily format') 182 | else: 183 | for d in date_var: 184 | new_var = d + '_daily' 185 | if (new_var not in list(df.columns)): 186 | df.rename(columns = {date_var: date_var + '_daily'}, inplace=True) 187 | else: 188 | print('add_daily_date: data are in daily format') 189 | return df 190 | 191 | def find_categorical_variables(df): 192 | """ 193 | Finds categorical variables in pandas dataframe 194 | :params: pandas dataframe 195 | :return: pandas dataframe 196 | """ 197 | 198 | categorical_dtypes = ['category', 'bool'] 199 | date_dtypes = ["datetime64[ns, UTC]"] 200 | list_categorical = [] 201 | for col in list(df.columns): 202 | try: 203 | df[col] = 
df[col].apply(lambda x: int(x)) 204 | if (df[col].dtype.name in categorical_dtypes) & (df[col].dtype.name not in date_dtypes): 205 | list_categorical = list_categorical + [col] 206 | elif all(df[col].isin([0, 1])) & (df[col].dtype.name not in date_dtypes): 207 | list_categorical = list_categorical + [col] 208 | elif (df[col].dtype.name not in date_dtypes): 209 | list_categorical = list_categorical.copy() 210 | except: 211 | list_categorical = list_categorical.copy() 212 | 213 | return list_categorical 214 | 215 | def resample_data(df, id, date_var, sampling, dict_grouping): 216 | """ 217 | Resample the data to a particular frequency 218 | :params: df as pandas dataframe, id as string, date_var as string, 219 | sampling as string of frequency and dict_grouping as dictionary as {variable_to_resample: 'function_to_apply'} 220 | :return: a Pandas dataframe 221 | """ 222 | 223 | wanted_keys = list(set(dict_grouping.keys()) - set([id, date_var])) 224 | dictfilt = lambda x, y: dict([ (i,x[i]) for i in x if i in wanted_keys]) 225 | list_variables = list(dictfilt(dict_grouping, wanted_keys).keys()) 226 | 227 | # df setup for merge 228 | id_list = list(df[id].unique()) 229 | df_resampled = df.loc[df[id] == id_list[0], [date_var, id, list_variables[0]]].drop_duplicates([date_var]).resample(sampling, on=date_var).agg({list_variables[0]: dict_grouping[list_variables[0]]}).reset_index() 230 | df_resampled.loc[:, id] = id_list[0] 231 | print('resample_data: variable', list_variables[0]) 232 | for i in range(1, len(id_list)): 233 | m = df.loc[df[id] == id_list[i], [date_var, id, list_variables[0]]].drop_duplicates([date_var]).resample(sampling, on=date_var).agg({list_variables[0]: dict_grouping[list_variables[0]]}).reset_index() 234 | m.loc[:, id] = id_list[i] 235 | df_resampled = pd.merge(df_resampled, m, on=[id, date_var, list_variables[0]], how='outer', validate = '1:1') 236 | print('resample_data: variable', list_variables[0], 'completed' ) 237 | 238 | # df loop for merge 239 | for k in range(1, len(list_variables)): 240 | df_m = df.loc[df[id] == id_list[0], [date_var, id, list_variables[k]]].drop_duplicates([date_var]).resample(sampling, on=date_var).agg({list_variables[k]: dict_grouping[list_variables[k]]}).reset_index() 241 | df_m.loc[:, id] = id_list[0] 242 | print('resample_data: variable', list_variables[k]) 243 | for i in range(1, len(id_list)): 244 | m = df.loc[df[id] == id_list[i], [date_var, id, list_variables[k]]].drop_duplicates([date_var]).resample(sampling, on=date_var).agg({list_variables[k]: dict_grouping[list_variables[k]]}).reset_index() 245 | m.loc[:, id] = id_list[i] 246 | df_m = pd.merge(df_m, m, on=[id, date_var, list_variables[k]], how='outer', validate = '1:1') 247 | 248 | df_resampled = pd.merge(df_resampled, df_m, on=[id, date_var], how='outer', validate = '1:1') 249 | print('resample_data: variable', list_variables[k], 'completed' ) 250 | print(df_resampled) 251 | return df_resampled 252 | 253 | def resample_data_pandassql(df_name, id_column, date_column, freq, aggregation_per_col): 254 | """ 255 | Resample the data to a particular frequency 256 | :params: df_name as string name of a pandas dataframe, id as string, date_var as string, 257 | the sampling as string freq (e.g. 
3-m, 5-h, 1-D) and aggregation_per_col as dictionary as {variable_to_resample: 'function_to_apply'} 258 | :return: a Pandas dataframe 259 | """ 260 | # TO-DO: check for interval of original series 261 | pysqldf = lambda q: sqldf(q, globals()) 262 | 263 | num = freq.split('-')[0] 264 | window = freq.split('-')[1] 265 | 266 | 267 | for i in set(aggregation_per_col.values()): 268 | if i.upper() not in ['MAX','MIN','LAST', 'AVG', 'SUM' ]: 269 | print('''Aggregation not supported: Use one of these: 270 | 'MAX','MIN','LAST', 'AVG', 'SUM''') 271 | return 272 | 273 | if window == 'm': 274 | helper = f'''WITH helper AS( 275 | SELECT *, Substr(date({date_column}), 1,Instr(date({date_column}),'-')-1) AS year, 276 | Substr(date({date_column}), -5,Instr(date({date_column}),'-')-3) AS month, 277 | Substr(date({date_column}), -2,Instr(date({date_column}),'-')-1) AS day, 278 | Substr(time({date_column}), 1,Instr(time({date_column}),':')-1) AS hour, 279 | CAST(Substr(time({date_column}), -5,Instr(time({date_column}),':')-1)/{num} AS modu) AS mod 280 | FROM {df_name} 281 | )\n''' 282 | groupby = 'year, month, day, hour, mod, '+str(id_column) 283 | 284 | if window == 'h': 285 | helper = f'''WITH helper AS( 286 | SELECT *, Substr(date({date_column}), 1,Instr(date({date_column}),'-')-1) AS year, 287 | Substr(date({date_column}), -5,Instr(date({date_column}),'-')-3) AS month, 288 | Substr(date({date_column}), -2,Instr(date({date_column}),'-')-1) AS day, 289 | CAST(Substr(time({date_column}), 1,Instr(time({date_column}),':')-1)/{num} AS modu) as mod 290 | FROM {df_name} 291 | )\n''' 292 | groupby = 'year, month, day, mod, '+str(id_column) 293 | 294 | if window == 'D': 295 | helper = f'''WITH helper AS( 296 | SELECT*, Substr(date({date_column}), 1,Instr(date({date_column}),'-')-1) AS year, 297 | Substr(date({date_column}), -5,Instr(date({date_column}),'-')-3) AS month, 298 | CAST(Substr(date({date_column}), -2,Instr(date({date_column}),'-')-1)/{num} AS modu) as mod 299 | FROM {df_name} 300 | )\n''' 301 | groupby = 'year, month, mod, '+str(id_column) 302 | 303 | list_select = [] 304 | for i in aggregation_per_col: 305 | aggElement = aggregation_per_col[i].upper()+'('+i+')' +' AS '+i 306 | list_select.append(aggElement) 307 | string_select = ',\n'.join(list_select) 308 | 309 | agg = 'SELECT '+ date_column+ ','+string_select + '\n FROM helper\n GROUP BY '+ groupby 310 | query = helper + agg 311 | 312 | return pysqldf(query) 313 | 314 | 315 | 316 | def add_seq(df, date_var, serie, freq, end_date='', start_date=''): 317 | """ 318 | Creates a sequence of completes date/hours to a dataframe 319 | :params: dataframe in long format to add date/hour observations, date_var as string, 320 | serie or id as string or list, freq as datetime.timedelta end and start date in format "%dd/%mm/%YYYY" 321 | :return: a Pandas dataframe 322 | """ 323 | 324 | df.loc[:, date_var] = df[date_var].apply(lambda x: x.tz_localize(None)) 325 | 326 | if isinstance(serie, list)==False: 327 | seq = pd.DataFrame() 328 | serie_list = list(df.loc[:, serie].unique()) 329 | for i in serie_list: 330 | if start_date == '': 331 | start_date = min(df.loc[df[serie]==i, date_var]).tz_localize(None) 332 | else: 333 | start_date = pd.to_datetime(start_date, dayfirst=True).tz_localize(None) 334 | 335 | if end_date == '': 336 | end_date = max(df.loc[df[serie]==i, date_var]).tz_localize(None) 337 | else: 338 | end_date = pd.to_datetime(end_date, dayfirst=True).tz_localize(None) 339 | 340 | # Sequence 341 | time_range = pd.Series(pd.date_range( 342 | 
start=start_date, end=end_date, freq=freq)) 343 | 344 | print('Adding sequence to serie', i, 'as', 345 | serie_list.index(i) + 1, 'of', len(serie_list)) 346 | temp = pd.DataFrame.from_dict({serie: [i] * len(time_range), 'date': time_range}) 347 | temp.rename(columns={'date': date_var}, inplace=True) 348 | seq = pd.concat([seq, temp], axis=0, ignore_index=True) 349 | 350 | serie = [serie, date_var] 351 | else: 352 | seq = pd.DataFrame() 353 | serie_list = df.loc[:, serie].drop_duplicates().reset_index(drop=True) 354 | 355 | row_list = serie_list.shape[0] 356 | col_list = serie_list.shape[1] 357 | for i in range(0, row_list, 1): 358 | print('Adding sequence to serie', i + 1, 'of', row_list) 359 | dict = {} 360 | for c in range(0, col_list, 1): 361 | col_name = serie_list.columns[c] 362 | id_col = serie_list.loc[i,col_name] 363 | if start_date == '': 364 | start_date = min(df.loc[(df[col_name]==id_col), date_var]).tz_localize(None) 365 | else: 366 | start_date = pd.to_datetime(start_date, dayfirst=True).tz_localize(None) 367 | 368 | if end_date == '': 369 | end_date = max(df.loc[(df[col_name]==id_col), date_var]).tz_localize(None) 370 | else: 371 | end_date = pd.to_datetime(end_date, dayfirst=True).tz_localize(None) 372 | 373 | # Sequence 374 | time_range = pd.Series(pd.date_range( 375 | start=start_date, end=end_date, freq=freq)) 376 | 377 | temp_col = {col_name: [serie_list.loc[i,col_name]]* len(time_range)} 378 | dict.update(temp_col) 379 | 380 | temp = pd.DataFrame.from_dict(dict) 381 | temp.loc[:, date_var] = time_range 382 | seq = pd.concat([seq, temp], axis=0, ignore_index=True) 383 | serie.extend([date_var]) 384 | 385 | duplicates = seq.loc[:, serie].duplicated().any() 386 | if duplicates==True: 387 | raise Exception(print("add_seq: there are duplicates in sequence")) 388 | else: 389 | print("add_seq: there are NO duplicates in sequence") 390 | df_seq = pd.merge(seq, df, on=serie, how='left', validate='1:1') 391 | 392 | duplicates_in_df_seq = df_seq.loc[:, serie].duplicated().any() 393 | if duplicates_in_df_seq==True: 394 | raise Exception(print("add_seq: there are duplicates when adding sequence")) 395 | else: 396 | print("add_seq: there are NO duplicates when adding sequence") 397 | 398 | print('Total serie to forecast:', len(df_seq.loc[:, serie].drop_duplicates())) 399 | 400 | return df_seq 401 | 402 | def check_length_time_serie(df, date_var, index): 403 | """ 404 | Checks the length that a time sequence of completes date/hours should have, so that it can be compared 405 | with actual observation 406 | :params: df as pandas dataframe, date_var as string, index as list as groupby variable 407 | :return: a Pandas dataframe 408 | """ 409 | freq = pd.Series(df[date_var].unique()).dt.freq 410 | pivot = pd.pivot_table(df, index=index, values=date_var, aggfunc=['count', 'min', 'max']).reset_index() 411 | pivot.columns = pivot.columns.get_level_values(0) 412 | pivot.loc[:, 'td'] = pivot.loc[:, 'max'].max() - pivot.loc[:, 'min'].min() 413 | pivot.loc[:, 'count'] = pivot.loc[:, 'count'].astype(float) 414 | 415 | if freq=='H': 416 | pivot.loc[:, 'freq'] = 'H' 417 | pivot.loc[:, 'expected_obs'] = pivot.loc[:, 'td'].apply(lambda x: x.days*24) + pivot.loc[:, 'td'].apply(lambda x: x.seconds/3600) + 1 418 | pivot.loc[:, 'mismatch'] = 0 419 | pivot.loc[pivot['count']!=pivot['expected_obs'], 'mismatch'] = 1 420 | if sum(pivot.mismatch)>0: 421 | print('Expected length of sequence is NOT OK \n', pivot[[index, 'count', 'expected_obs']].drop_duplicates()) 422 | else: 423 | print('Expected length of 
sequence is OK \n', pivot[[index, 'count', 'expected_obs']].drop_duplicates()) 424 | 425 | elif freq=='D': 426 | pivot.loc[:, 'freq'] = 'D' 427 | pivot.loc[:, 'expected_obs'] = pivot.loc[:, 'td'].apply(lambda x: x.days) + pivot.loc[:, 'td'].apply(lambda x: x.seconds/3600*24) + 1 428 | pivot.loc[:, 'mismatch'] = 0 429 | pivot.loc[pivot['count']!=pivot['expected_obs'], 'mismatch'] = 1 430 | if sum(pivot.mismatch)>0: 431 | print('Expected length of sequence is NOT OK \n', pivot[[index, 'count', 'expected_obs']].drop_duplicates()) 432 | else: 433 | print('Expected length of sequence is OK \n', pivot[[index, 'count', 'expected_obs']].drop_duplicates()) 434 | 435 | else: 436 | pivot.loc[:, 'freq'] = np.nan 437 | pivot.loc[:, 'expected_obs'] = np.nan 438 | print('check_length_time_serie: could not infer frequency') 439 | 440 | 441 | return pivot 442 | 443 | def check_regressors_availability(df, date_var, regressors_list, forecast_end_date): 444 | """ 445 | Checks the availability of regressors based on forecast end date 446 | :params: df as pandas dataframe, date_var as string, regressors_list as list and forecast_end_date as string in format "2022-12-31" 447 | :return: None 448 | """ 449 | forecast_end_date = pd.to_datetime(forecast_end_date, dayfirst = False) 450 | 451 | for r in regressors_list: 452 | if any(df.loc[df[date_var]<=forecast_end_date, r].isnull()): 453 | print('Latest filled available date for regressor', r, 'is', df.loc[df[r].isnull()==False, date_var].max(), '\n expected is', forecast_end_date) 454 | raise Exception('Regressor', r, 'shows null values <= forecast_end_date. \n Please, fill them before going on') 455 | else: 456 | print('Regressor', r, 'has all needed values') 457 | return None 458 | 459 | def remove_regressors_with_nan(df, date_var, regressors_list, forecast_end_date): 460 | """ 461 | Remove regressors with nan based on forecast end date 462 | :params: df as pandas dataframe, date_var as string, regressors_list as list and forecast_end_date as string in format "2022-12-31" 463 | :return: pandas dataframe 464 | """ 465 | forecast_end_date = pd.to_datetime(forecast_end_date, dayfirst = False) 466 | 467 | for r in regressors_list: 468 | if any(df.loc[df[date_var]<=forecast_end_date, r].isnull()): 469 | print('Latest filled available date for regressor', r, 'is', df.loc[df[r].isnull()==False, date_var].max(), '\n expected is', forecast_end_date) 470 | print('Regressor', r, 'shows null values <= forecast_end_date. 
\n Regressor REMOVED') 471 | df.drop(columns = r, inplace=True) 472 | else: 473 | print('Regressor', r, 'has all needed values') 474 | return df 475 | 476 | def match_to_find(serie_to_find): 477 | """ 478 | Finds a match in a list of possible words to match 479 | :params: serie_to_find as a list of words to match 480 | :return: a list 481 | """ 482 | match_to_find = [] 483 | match_to_find = match_to_find + [serie_to_find] 484 | match_to_find = match_to_find + [serie_to_find.lower()] 485 | match_to_find = match_to_find + [serie_to_find.upper()] 486 | match_to_find = match_to_find + [serie_to_find.capitalize()] 487 | match_to_find = match_to_find + [re.sub('[^a-zA-Z0-9 \n\.]', '_', serie_to_find)] 488 | match_to_find = match_to_find + [re.sub('[^a-zA-Z0-9 \n\.]', '_', serie_to_find.lower())] 489 | match_to_find = match_to_find + [re.sub('[^a-zA-Z0-9 \n\.]', '_', serie_to_find.upper())] 490 | match_to_find = match_to_find + [re.sub('[^a-zA-Z0-9 \n\.]', '_', serie_to_find.capitalize())] 491 | return match_to_find 492 | 493 | def find_match(df, serie_name, match_to_find): 494 | """ 495 | Finds a match in a dataframe serie given a list of possible words to match 496 | :params: dataframe, serie_name as string, match_to_find as a list of words to match 497 | :return: a list 498 | """ 499 | 500 | list_to_match = list(df.loc[:, serie_name].unique()) 501 | match_list = list() 502 | for m in match_to_find: 503 | match_list.extend([el for el in list_to_match if isinstance(el, collections_abc.Iterable) and (m in el)]) 504 | 505 | match_list = list(dict.fromkeys(match_list)) 506 | return match_list 507 | 508 | def find_match_in_list(list_to_match, match_to_find): 509 | """ 510 | Finds a match in a list given a list of possible words to match 511 | :params: list to match as a list, match_to_find as a list of words to match 512 | :return: a list 513 | """ 514 | 515 | list_to_match = list(dict.fromkeys(list_to_match)) 516 | match_list = list() 517 | for m in match_to_find: 518 | match_list.extend([el for el in list_to_match if isinstance(el, collections_abc.Iterable) and (m in el)]) 519 | 520 | match_list = list(dict.fromkeys(match_list)) 521 | return match_list 522 | 523 | def id_outliers_IQR(df, q1, q3, date_var, id, var, freq_var): 524 | """ 525 | Identifies outliers creating a dummy variable (0/1) called outlier using IQR method, where quantile value can be set 526 | :params: df as pandas dataframe, q1 and q3 values as numeric between 0 and 1, date_var as string, id as string or list, var as string, freq_var as string 527 | :return: a dictionary 528 | """ 529 | 530 | # Keeping only positive values of var 531 | df = df.loc[df[var]>0, ].copy() 532 | 533 | if isinstance(id, list): 534 | list_id = id + [var, freq_var] 535 | else: 536 | list_id = [id, var, freq_var] 537 | 538 | # Freq var 539 | df.loc[:, freq_var] = df.loc[:, date_var].apply(lambda x: x.month) 540 | 541 | ### ID outliers 542 | grouped = df.loc[:, list_id].groupby(list_id) 543 | df_q1 = grouped.quantile(q1).reset_index() 544 | df_q1.rename(columns={var: 'q1'}, inplace=True) 545 | df_q3 = grouped.quantile(q3).reset_index() 546 | df_q3.rename(columns={var: 'q3'}, inplace=True) 547 | 548 | # Merge 549 | dfs = [df, df_q1, df_q3] 550 | df_outliers = reduce(lambda left,right: pd.merge(left,right,how='left', on=list_id, validate='m:1'), dfs) 551 | 552 | df_outliers.loc[:, 'IQR'] = df_outliers.q3 - df_outliers.q1 553 | df_outliers.loc[:, 'outlier'] = 0 554 | df_outliers.loc[((df_outliers[var]<(df_outliers.q1-1.5*df_outliers.IQR)) | (df_outliers[var]>(df_outliers.q3+1.5*df_outliers.IQR))), 'outlier']= 1 555 | var_cleaned = var + '_cleaned' 556 | df_outliers.loc[:, var_cleaned] = df_outliers.loc[:, var] 557 | df_outliers.loc[df_outliers.outlier==1, var_cleaned] 
= np.nan 558 | 559 | # Summarizing outliers in a pivot table 560 | pivot_sum = pd.pivot_table(df_outliers, values='outlier', index=list_id, aggfunc=sum).reset_index() 561 | pivot_len = pd.pivot_table(df_outliers, values='outlier', index=list_id, aggfunc=len).reset_index() 562 | pivot_len.rename(columns={'outlier': 'obs'}, inplace=True) 563 | pivot = pd.merge(pivot_sum, pivot_len, on=list_id, how='inner', validate='1:1') 564 | pivot.loc[:, 'outliers_perc'] = round(pivot.outlier / pivot.obs,2) 565 | 566 | dict_outliers = {'df_outliers': df_outliers, 'pivot_outliers': pivot} 567 | return dict_outliers 568 | 569 | 570 | class AlphabeticalCombinations: 571 | def write_neat_csv(saving_file, df_fcst): 572 | """ 573 | Writes neat csv 574 | :params: saving_file as string, df_fcst as dataframe to write 575 | :return: None 576 | """ 577 | df_fcst.to_csv(saving_file, sep=';', date_format="%Y-%m-%d %H:%M:%S", header=True, index=False, compression='infer', quoting=None, quotechar='"', doublequote=False, decimal='.') 578 | 579 | return(print('*** write_neat_csv: completed', saving_file)) 580 | 581 | def convert(string): 582 | """ 583 | Convert string to list 584 | :params: string 585 | :return: a list 586 | """ 587 | list1=[] 588 | list1[:0]=string 589 | return list1 590 | 591 | def excel_columns(): 592 | """ 593 | Counts excel columns 594 | :params: none 595 | :return: a list 596 | """ 597 | alphabet_string = string.ascii_uppercase 598 | li = AlphabeticalCombinations.convert(alphabet_string) 599 | excel_columns = [letter for letter in alphabet_string] 600 | for L in li: 601 | aces = [L + li for li in li] 602 | excel_columns.extend(aces) 603 | 604 | return excel_columns 605 | 606 | def write_beautiful_excel(saving_file, dict_df_to_write): 607 | """ 608 | Writes beautiful excel 609 | :params: saving_file as string, dict_df_to_write as dictionary with dict key as sheet name and dict value as data 610 | :return: None 611 | """ 612 | ### Writing to Excel 613 | writer = pd.ExcelWriter(saving_file, engine='xlsxwriter', datetime_format='dd/mm/yyyy hh:mm:ss', date_format='dd/mm/yyyy') 614 | 615 | # FCST 616 | for d in list(dict_df_to_write.keys()): 617 | df = dict_df_to_write[d] 618 | df.to_excel(writer, sheet_name=d, index=False) 619 | 620 | # Make handles for workbook/sheet 621 | workbook = writer.book 622 | worksheet = writer.sheets[d] 623 | 624 | # Create positive/negative cell format 625 | format_simone = workbook.add_format({'num_format': '#,##0;- #,##0'}) 626 | format_percentage = workbook.add_format({'num_format': '0.00%'}) 627 | 628 | # Identify percentage columns 629 | cols_percentage = [] 630 | for c in list(df.columns): 631 | try: 632 | if any(df[c]>=1) and any(df[c]>=0) and any(df[c].between(0, 1, inclusive=False)): 633 | cols_percentage.extend([c]) 634 | except: 635 | pass 636 | 637 | # Define the worksheet range to apply number format 638 | cols = AlphabeticalCombinations.excel_columns() 639 | row = len(df) 640 | format_range = '{}{}:{}{}'.format(cols[0], row, cols[len(df.columns)-1], row) 641 | 642 | # Apply number formats to specified range 643 | worksheet.set_column(format_range, None, format_simone) 644 | 645 | if len(cols_percentage)>0: 646 | for f in cols_percentage: 647 | n = list(df.columns).index(f) 648 | row = len(df) 649 | format_range = '{}{}:{}{}'.format(cols[n], row, cols[n], row) 650 | worksheet.set_column(format_range, None, format_percentage) 651 | 652 | #Iterate through each column and set the width == the max length in that column. A padding length of 2 is also added. 
653 | for i, col in enumerate(df.columns): 654 | # find length of column i 655 | column_len = df[col].astype(str).str.len().max() 656 | # Setting the length if the column header is larger 657 | # than the max column value length 658 | column_len = max(column_len, len(col)) + 4 659 | # set the column length 660 | worksheet.set_column(i, i, column_len) 661 | 662 | ## Close the Pandas Excel writer and output the Excel file 663 | writer.save() 664 | return(print('*** write_beatiful_excel: completed', saving_file)) 665 | 666 | def write_beautiful_excel_table(saving_file, dict_df_to_write): 667 | """ 668 | Writes beautiful excel tables 669 | :params: saving_file as string, dict_df_to_write as dictionary with dict key as sheet name and dict value as data 670 | :return: None 671 | """ 672 | ### Writing to Excel 673 | writer = pd.ExcelWriter(saving_file, engine='xlsxwriter', datetime_format='dd/mm/yyyy hh:mm:ss', date_format='dd/mm/yyyy') 674 | 675 | # FCST 676 | for d in list(dict_df_to_write.keys()): 677 | df = dict_df_to_write[d] 678 | df.to_excel(writer, sheet_name=d, index=False) 679 | 680 | # Make handles for workbook/sheet 681 | workbook = writer.book 682 | worksheet = writer.sheets[d] 683 | 684 | # Create positive/negative cell format 685 | format_simone = workbook.add_format({'num_format': '#,##0;- #,##0'}) 686 | format_percentage = workbook.add_format({'num_format': '0.00%'}) 687 | 688 | # Identify percentage columns 689 | cols_percentage = [] 690 | for c in list(df.columns): 691 | try: 692 | if any(df[c]>=1) and any(df[c]>=0) and any(df[c].between(0, 1, inclusive=False)): 693 | cols_percentage.extend([c]) 694 | except: 695 | pass 696 | 697 | # Define the worksheet range to apply number format 698 | cols = AlphabeticalCombinations.excel_columns() 699 | row = len(df) 700 | format_range = '{}{}:{}{}'.format(cols[0], row, cols[len(df.columns)-1], row) 701 | 702 | # Apply number formats to specified range 703 | worksheet.set_column(format_range, None, format_simone) 704 | 705 | if len(cols_percentage)>0: 706 | for f in cols_percentage: 707 | n = list(df.columns).index(f) 708 | row = len(df) 709 | format_range = '{}{}:{}{}'.format(cols[n], row, cols[n], row) 710 | worksheet.set_column(format_range, None, format_percentage) 711 | 712 | #Iterate through each column and set the width == the max length in that column. A padding length of 2 is also added. 713 | for i, col in enumerate(df.columns): 714 | # find length of column i 715 | column_len = df[col].astype(str).str.len().max() 716 | # Setting the length if the column header is larger 717 | # than the max column value length 718 | column_len = max(column_len, len(col)) + 4 719 | # set the column length 720 | worksheet.set_column(i, i, column_len) 721 | 722 | # Create a list of column headers, to use in add_table(). 723 | column_settings = [] 724 | for header in df.columns: 725 | column_settings.append({'header': header}) 726 | 727 | # Add the table. 
728 | worksheet.add_table(0, 0, df.shape[0], df.shape[1] - 1, {'columns': column_settings}) 729 | 730 | ## Close the Pandas Excel writer and output the Excel file 731 | writer.save() 732 | return(print('*** write_beatiful_excel: completed', saving_file)) 733 | 734 | -------------------------------------------------------------------------------- /Code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/__init__.py -------------------------------------------------------------------------------- /Code/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Code/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Configuration/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Configuration/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Configuration/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Configuration/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Configuration/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /Configuration/config.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from box import Box 3 | import sys 4 | import os 5 | 6 | from pathlib import Path 7 | 8 | def get_project_root() -> Path: 9 | return Path(__file__).parent.parent 10 | 11 | root = get_project_root() 12 | with open(os.path.join(root, "Configuration/config.yaml"), "r") as ymlfile: 13 | cfg_path = Box(yaml.safe_load(ymlfile)) 14 | 15 | -------------------------------------------------------------------------------- /Configuration/config.yaml: -------------------------------------------------------------------------------- 1 | data_dir: 2 | input_path: "Data/Input" 3 | output_path: "Data/Output" 4 | plot_path: "Data/Plots" 5 | -------------------------------------------------------------------------------- /Dashboards/EnergyDashboard.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Dashboards/EnergyDashboard.pbix -------------------------------------------------------------------------------- /Docs/Images/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/banner.jpg -------------------------------------------------------------------------------- /Docs/Images/calendar.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/calendar.png -------------------------------------------------------------------------------- /Docs/Images/elbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/elbow.png -------------------------------------------------------------------------------- /Docs/Images/intermittent_TS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/intermittent_TS.png -------------------------------------------------------------------------------- /Docs/Images/panel_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/panel_data.png -------------------------------------------------------------------------------- /Docs/Images/sliding_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/sliding_plot.png -------------------------------------------------------------------------------- /Docs/Images/thermal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Images/thermal.png -------------------------------------------------------------------------------- /Docs/Slides/ds_toolkit_forecasting_2.0_memo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dstoolkit-forecasting/9ff73f0cc4354546bc175d380b122022123cb313/Docs/Slides/ds_toolkit_forecasting_2.0_memo.pdf -------------------------------------------------------------------------------- /Environment/forecasting_energy.yml: -------------------------------------------------------------------------------- 1 | name: forecasting_energy 2 | channels: 3 | - anaconda 4 | - defaults 5 | dependencies: 6 | - ca-certificates=2020.10.14=0 7 | - certifi=2020.6.20=py37_0 8 | - openssl=1.1.1h=he774522_0 9 | - pip=20.2.4=py37_0 10 | - python=3.7.7=h81c818b_4 11 | - setuptools=50.3.0=py37h9490d1a_1 12 | - sqlite=3.33.0=h2a8f88b_0 13 | - vc=14.1=h0510ff6_4 14 | - vs2015_runtime=14.16.27012=hf0eaf9b_3 15 | - wheel=0.35.1=py_0 16 | - wincertstore=0.2=py37_0 17 | - zlib=1.2.11=vc14h1cdd9ab_1 18 | - pip: 19 | - absl-py==0.11.0 20 | - adal==1.2.5 21 | - adjustText==0.7.3 22 | - altair==4.1.0 23 | - antlr4-python3-runtime==4.8 24 | - applicationinsights==0.11.9 25 | - argcomplete==1.12.3 26 | - argon2-cffi==21.1.0 27 | - argparse==1.4.0 28 | - astor==0.8.1 29 | - astunparse==1.6.3 30 | - async-generator==1.10 31 | - attrs==21.2.0 32 | - autopep8==1.5.7 33 | - azure-cognitiveservices-vision-customvision==3.0.0 34 | - azure-common==1.1.26 35 | - azure-core==1.23.0 36 | - azure-graphrbac==0.61.1 37 | - azure-identity==1.4.1 38 | - azure-keyvault-secrets==4.4.0 39 | - azure-mgmt-authorization==0.61.0 40 | - azure-mgmt-containerregistry==2.8.0 41 | - azure-mgmt-keyvault==2.2.0 42 | - azure-mgmt-resource==10.3.0 43 | - 
azure-mgmt-storage==11.2.0 44 | - azure-storage-blob==12.10.0 45 | - azureml-automl-core==1.18.0.post1 46 | - azureml-core==1.17.0 47 | - azureml-dataprep==2.4.4 48 | - azureml-dataprep-native==24.0.0 49 | - azureml-dataprep-rslex==1.2.3 50 | - azureml-dataset-runtime==1.18.0 51 | - azureml-defaults==1.18.0 52 | - azureml-model-management-sdk==1.0.1b6.post1 53 | - azureml-pipeline==1.18.0 54 | - azureml-pipeline-core==1.18.0 55 | - azureml-pipeline-steps==1.18.0 56 | - azureml-sdk==1.18.0 57 | - azureml-telemetry==1.18.0 58 | - azureml-train==1.18.0 59 | - azureml-train-automl-client==1.18.0 60 | - azureml-train-core==1.18.0.post1 61 | - azureml-train-restclients-hyperdrive==1.18.0 62 | - backcall==0.2.0 63 | - backports-tempfile==1.0 64 | - backports-weakref==1.0.post1 65 | - backports-zoneinfo==0.2.1 66 | - base58==2.1.1 67 | - bleach==4.1.0 68 | - blinker==1.4 69 | - cached-property==1.5.2 70 | - cachetools==4.1.1 71 | - cffi==1.14.3 72 | - charset-normalizer==2.0.6 73 | - click==7.1.2 74 | - cloudpickle==1.6.0 75 | - cmdstanpy==0.9.5 76 | - colorama==0.4.4 77 | - configparser==3.7.4 78 | - contextlib2==0.6.0.post1 79 | - convertdate==2.3.2 80 | - cryptography==3.2.1 81 | - cycler==0.10.0 82 | - cython==0.29.26 83 | - databricks-cli==0.16.2 84 | - databricks-connect==7.3.30 85 | - dateinfer==0.2.0 86 | - debugpy==1.5.0 87 | - decorator==5.1.0 88 | - defusedxml==0.7.1 89 | - dill==0.3.3 90 | - distro==1.5.0 91 | - docker==4.3.1 92 | - dotnetcore2==2.1.19 93 | - entrypoints==0.3 94 | - ephem==4.1.3 95 | - et-xmlfile==1.1.0 96 | - flask==1.0.3 97 | - fusepy==3.0.1 98 | - gast==0.3.3 99 | - gitdb==4.0.9 100 | - gitpython==3.1.24 101 | - google-auth==1.23.0 102 | - google-auth-oauthlib==0.4.2 103 | - google-pasta==0.2.0 104 | - grpcio==1.33.2 105 | - gunicorn==19.9.0 106 | - h11==0.12.0 107 | - h5py==3.1.0 108 | - hijri-converter==2.2.2 109 | - holidays==0.11.3.1 110 | - idna==3.2 111 | - importlib-metadata==2.0.0 112 | - imutils==0.5.3 113 | - ipykernel==6.4.1 114 | - ipython==7.28.0 115 | - ipython-genutils==0.2.0 116 | - ipywidgets==7.6.5 117 | - isodate==0.6.0 118 | - itsdangerous==1.1.0 119 | - jedi==0.18.0 120 | - jeepney==0.6.0 121 | - jinja2==2.11.2 122 | - jmespath==0.10.0 123 | - joblib==0.17.0 124 | - json-logging-py==0.2 125 | - json5==0.8.5 126 | - jsonpickle==1.4.1 127 | - jsonschema==4.0.1 128 | - jupyter-client==7.0.6 129 | - jupyter-core==4.8.1 130 | - jupyterlab-pygments==0.1.2 131 | - jupyterlab-widgets==1.0.2 132 | - kaggle==1.5.12 133 | - keras-applications==1.0.8 134 | - keras-preprocessing==1.1.0 135 | - kiwisolver==1.3.2 136 | - kneed==0.7.0 137 | - korean-lunar-calendar==0.2.1 138 | - liac-arff==2.5.0 139 | - lunarcalendar==0.0.9 140 | - markdown==3.3.3 141 | - markupsafe==1.1.1 142 | - matplotlib==3.4.3 143 | - matplotlib-inline==0.1.3 144 | - mistune==0.8.4 145 | - msal==1.6.0 146 | - msal-extensions==0.2.2 147 | - msrest==0.6.21 148 | - msrestazure==0.6.2 149 | - nbclient==0.5.4 150 | - nbconvert==6.2.0 151 | - nbformat==5.1.3 152 | - nbimporter==0.3.4 153 | - ndg-httpsclient==0.5.1 154 | - nest-asyncio==1.5.1 155 | - notebook==6.4.5 156 | - numpy==1.19.0 157 | - oauthlib==3.1.0 158 | - omegaconf==2.1.2 159 | - opencv-python==4.3.0.36 160 | - opencv-python-headless==4.3.0.36 161 | - opendatasets==0.1.20 162 | - openpyxl==3.0.9 163 | - opt-einsum==3.3.0 164 | - outcome==1.1.0 165 | - packaging==21.2 166 | - pandas==1.3.5 167 | - pandocfilters==1.5.0 168 | - parso==0.8.2 169 | - pathspec==0.8.1 170 | - pep8==1.7.1 171 | - pickleshare==0.7.5 172 | - pillow==8.3.2 173 
| - plotly==5.3.1 174 | - portalocker==1.7.1 175 | - prometheus-client==0.12.0 176 | - prompt-toolkit==3.0.20 177 | - protobuf==3.14.0 178 | - py4j==0.10.9 179 | - pyarrow==1.0.1 180 | - pyasn1==0.4.8 181 | - pyasn1-modules==0.2.8 182 | - pycodestyle==2.7.0 183 | - pycparser==2.20 184 | - pydeck==0.7.1 185 | - pygments==2.10.0 186 | - pyjwt==1.7.1 187 | - pymeeus==0.5.11 188 | - pyodbc==4.0.32 189 | - pyopenssl==19.1.0 190 | - pyparsing==2.4.7 191 | - pyrsistent==0.18.0 192 | - pystan==2.19.1.1 193 | - python-box==5.4.1 194 | - python-dateutil==2.8.1 195 | - python-slugify==6.1.1 196 | - pytz==2020.4 197 | - pytz-deprecation-shim==0.1.0.post0 198 | - pywin32==227 199 | - pywinpty==1.1.5 200 | - pyyaml==6.0 201 | - pyzmq==22.3.0 202 | - repackage==0.7.3 203 | - requests==2.26.0 204 | - requests-oauthlib==1.3.0 205 | - rsa==4.6 206 | - ruamel-yaml==0.16.12 207 | - ruamel-yaml-clib==0.2.2 208 | - scikit-learn==0.22.2.post1 209 | - scipy==1.4.1 210 | - seaborn==0.11.2 211 | - secretstorage==3.2.0 212 | - selenium==4.0.0 213 | - send2trash==1.8.0 214 | - setuptools-git==1.2 215 | - shapely==1.7.0 216 | - six==1.15.0 217 | - sklearn==0.0 218 | - smmap==5.0.0 219 | - sniffio==1.2.0 220 | - sortedcontainers==2.4.0 221 | - streamlit==1.1.0 222 | - tabulate==0.8.9 223 | - tenacity==8.0.1 224 | - tensorboard==2.2.2 225 | - tensorboard-plugin-wit==1.7.0 226 | - tensorflow==2.2.0 227 | - tensorflow-estimator==2.2.0 228 | - tensorflow-gpu==2.2.0 229 | - tensorflow-gpu-estimator==2.2.0 230 | - termcolor==1.1.0 231 | - terminado==0.12.1 232 | - testpath==0.5.0 233 | - text-unidecode==1.3 234 | - toml==0.10.2 235 | - toolz==0.11.1 236 | - tornado==6.1 237 | - tqdm==4.62.3 238 | - traitlets==5.1.0 239 | - trio==0.19.0 240 | - trio-websocket==0.9.2 241 | - typing-extensions==4.1.1 242 | - tzdata==2021.5 243 | - tzlocal==4.1 244 | - ujson==5.1.0 245 | - urllib3==1.26.7 246 | - validators==0.18.2 247 | - watchdog==2.1.6 248 | - wcwidth==0.2.5 249 | - webencodings==0.5.1 250 | - websocket-client==0.57.0 251 | - werkzeug==1.0.1 252 | - widgetsnbextension==3.5.2 253 | - wrapt==1.13.1 254 | - wsproto==1.0.0 255 | - xgboost==1.4.2 256 | - xlrd==2.0.1 257 | - xlsxwriter==3.0.1 258 | - zipp==3.4.0 259 | prefix: C:\Users\mabellani\.conda\envs\forecasting_energy 260 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /Notebooks/EnergyClusteringRegular.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "dc3e4402", 6 | "metadata": {}, 7 | "source": [ 8 | "# Implementation" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "sWbXCGozBRNW", 14 | "metadata": { 15 | "id": "sWbXCGozBRNW" 16 | }, 17 | "source": [ 18 | "## Packages" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "kmxpysFu7zjH", 25 | "metadata": { 26 | "colab": { 27 | "base_uri": "https://localhost:8080/" 28 | }, 29 | "id": "kmxpysFu7zjH", 30 | "outputId": "db2717d5-22be-4fa8-99fb-3f9ea90e7e1b" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# data elaboration functions\n", 35 | "import pandas as pd\n", 36 | "from six.moves import collections_abc\n", 37 | "import string\n", 38 | "import numpy as np\n", 39 | "\n", 40 | "# datetime functions\n", 41 | "import datetime as dt\n", 42 | "\n", 43 | "# file management functions\n", 44 | "import os\n", 45 | "import sys\n", 46 | "import opendatasets as od\n", 47 | "import pickle\n", 48 | "from pathlib import Path\n", 49 | "\n", 50 | "# plot functions\n", 51 | "import matplotlib.pyplot as plt\n", 52 | "%matplotlib inline\n", 53 | "\n", 54 | "# data science functions\n", 55 | "import matplotlib.pyplot as plt\n", 56 | "from kneed import KneeLocator\n", 57 | "from sklearn.datasets import make_blobs\n", 58 | "from sklearn.cluster import KMeans\n", 59 | "from sklearn.metrics import silhouette_score\n", 60 | "from sklearn.preprocessing import StandardScaler, scale\n", 61 | "from sklearn.metrics import mean_absolute_error\n", 62 | "import joblib\n", 63 | "from sklearn.linear_model import LinearRegression\n", 64 | "from sklearn.ensemble import RandomForestRegressor\n", 65 | "from sklearn.model_selection import train_test_split\n", 66 | "import xgboost as xgb\n", 67 | "\n", 68 | "# statistical functions\n", 69 | "from scipy.stats.mstats import winsorize\n", 70 | "\n", 71 | "# configuration file\n", 72 | "module_path = os.path.abspath(os.path.join('..'))\n", 73 | "if module_path not in sys.path:\n", 74 | " sys.path.append(module_path)\n", 75 | "\n", 76 | "# custom functions\n", 77 | "from Code.Profiling.Intermittent.intermittent import Intermittent\n", 78 | "from Code.Utils.utils import Utils\n", 79 | "from Code.Scoring.kpi import Kpi\n", 80 | "from Code.Scoring.forecast import Forecasting\n", 81 | "from Code.Scoring.train import Training\n", 82 | "from Code.Scoring.train_test import TrainTest\n", 83 | "from Code.Scoring.scoring import Scoring\n", 84 | "from Code.Regressors.regressors import Regressors\n", 85 | "from Code.Plotting.plots import Plots\n", 86 | "from Configuration.config import cfg_path" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "8dc26b7b", 92 | "metadata": {}, 93 | "source": [ 94 | "## Setup" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "458162d0", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# od.download(\"https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download\")\n", 105 | 
"root = Path(os.getcwd()).parent\n", 106 | "dataset_path = os.path.join(root, cfg_path.data_dir.input_path)\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "86bb0e13", 112 | "metadata": {}, 113 | "source": [ 114 | "## Load Data" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "09358d6d", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "dict_profiling = pd.read_pickle(os.path.join(root, cfg_path.data_dir.output_path, 'dict_profiling.pkl'))\n", 125 | "df_final = pd.read_pickle(os.path.join(\n", 126 | " root, cfg_path.data_dir.output_path, 'df_final.pkl'))\n", 127 | "df_final.head()\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "f23ed7fb", 133 | "metadata": {}, 134 | "source": [ 135 | "## Parameter setup" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "id = 'site_id'\n", 145 | "list_unique_id = ['site_id', 'timestamp']\n", 146 | "list_temp = ['temp']\n", 147 | "y = 'value'\n", 148 | "date_var = Utils.find_date(df_final)\n", 149 | "\n", 150 | "# Winsorizing parameters\n", 151 | "highest = 0.05\n", 152 | "lowest = 0.05" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "# Clustering regular time series" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# Define regular ids list\n", 169 | "list_id_clustering = list(dict_profiling['regular'])\n", 170 | "mask = df_final[id].isin(list(dict_profiling['regular']))\n", 171 | "df = df_final.loc[mask, [date_var, id, y]]\n", 172 | "\n", 173 | "# Set seed\n", 174 | "sample_seed_kmeans = 789\n", 175 | "# Standardizing data\n", 176 | "df_win_sum = df.loc[:, [id, y]].groupby(id).apply(\n", 177 | " lambda x: np.sum(winsorize(x, (highest, lowest)))).reset_index()\n", 178 | "df_win_sum.columns = [id, \"sum_\" + y]\n", 179 | "\n", 180 | "# Checking if some ids have 0 values after winsorizing\n", 181 | "if len(set(list_id_clustering) - set(list(df_win_sum[id].unique()))) > 0:\n", 182 | " list_id_clustering = list(set(list_id_clustering) - set(list(df_win_sum[id].unique())))\n", 183 | " print(id, list_id_clustering, \"has/have 0\", y, \"after winsorizing\")\n", 184 | " mask = (df[y]!=np.nan) & (~df[id].isin(list_id_clustering))\n", 185 | " df_std = df.loc[mask, ].pivot(index=date_var, columns=id, values=y).reset_index()\n", 186 | " charvec = df_std[date_var].dt.strftime('%Y-%m-%d')\n", 187 | " df_std.set_index(date_var, inplace=True)\n", 188 | "else:\n", 189 | " mask = (df[y]!=np.nan)\n", 190 | " df_std = df.loc[mask, ].pivot(index=date_var, columns=id, values=y).reset_index()\n", 191 | " charvec = df_std[date_var].dt.strftime('%Y-%m-%d')\n", 192 | " df_std.set_index(date_var, inplace=True)\n", 193 | " print(\"NO\", id, \"has/have 0\", y, \"after winsorizing\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Defining a set of ids to cluster with NO nan" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### In order to perform cluster analysis, one need to have a matrix with no nan value and set the index of the dataframe with date_var" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | 
"df_std_no_nan = df_std.dropna()\n", 217 | "if len(df_std_no_nan)==0:\n", 218 | " list_id_cluster = [16, 21,22,25,26, 27, 29, 33, 40, 49]\n", 219 | " df_cluster = df_std.loc[:, list_id_cluster].dropna()\n", 220 | "else:\n", 221 | " list_id_cluster = list(set(list(df_std.columns)) - set(list(date_var)))\n", 222 | " df_cluster = df_std.loc[:, list_id_cluster].dropna()\n", 223 | "print('Clustering regular profiles on ids', list_id_cluster)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Set the number of cluster you want to try" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Total sum of squares\n", 240 | "tot_ss = pd.DataFrame(df_cluster.apply(scale, axis=1)**2).sum(axis=0, skipna=True)\n", 241 | "\n", 242 | "# Setting up charvec\n", 243 | "start_date = min(df_cluster.index)\n", 244 | "end_date = max(df_cluster.index)\n", 245 | "\n", 246 | "# Define the number of clusters\n", 247 | "try_clusters = 11\n", 248 | "\n", 249 | "# K-means setup\n", 250 | "kmeans_kwargs = { \n", 251 | " \"init\": \"random\",\n", 252 | " \"n_init\": 10,\n", 253 | " \"max_iter\": 300,\n", 254 | " \"random_state\": 42,\n", 255 | "}" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### Choosing the Appropriate Number of Clusters\n", 263 | "In this section, you’ll look at two methods that are commonly used to evaluate the appropriate number of clusters:\n", 264 | "\n", 265 | "- The elbow method\n", 266 | "- The silhouette coefficient\n", 267 | "\n", 268 | "These are often used as complementary evaluation techniques" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "#### The elbow method" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "#X = np.array(df_cluster.transpose())\n", 285 | "X = np.array(df_cluster)\n", 286 | "\n", 287 | "# A list holds the SSE values for each k\n", 288 | "\n", 289 | "sse = []\n", 290 | "for k in range(1, try_clusters):\n", 291 | " kmeans = KMeans(n_clusters = k, **kmeans_kwargs)\n", 292 | " kmeans.fit(X)\n", 293 | " sse.append(kmeans.inertia_)\n", 294 | "\n", 295 | "plt.style.use(\"fivethirtyeight\")\n", 296 | "plt.plot(range(1, try_clusters), sse)\n", 297 | "plt.xticks(range(1, try_clusters))\n", 298 | "plt.xlabel(\"Number of Clusters\")\n", 299 | "plt.ylabel(\"SSE\")\n", 300 | "plt.show()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "kl = KneeLocator(range(1, 11), sse, curve=\"convex\", direction=\"decreasing\")\n", 310 | "print(\"Elbow method: optimal number of clusters is\", kl.elbow)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "#### The silhouette coefficient\n", 318 | "The silhouette coefficient is a measure of cluster cohesion and separation. It quantifies how well a data point fits into its assigned cluster based on two factors:\n", 319 | "\n", 320 | "- How close the data point is to other points in the cluster\n", 321 | "- How far away the data point is from points in other clusters\n", 322 | "\n", 323 | "Silhouette coefficient values range between -1 and 1. 
Larger numbers indicate that samples are closer to their clusters than they are to other clusters." 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# A list holds the silhouette coefficients for each k\n", 333 | "silhouette_coefficients = []\n", 334 | "\n", 335 | "# Notice you start at 2 clusters for silhouette coefficient\n", 336 | "for k in range(2, try_clusters):\n", 337 | " kmeans = KMeans(n_clusters=k, **kmeans_kwargs)\n", 338 | " kmeans.fit(X)\n", 339 | " score = silhouette_score(X, kmeans.labels_)\n", 340 | " silhouette_coefficients.append(score)\n", 341 | " \n", 342 | "pd.DataFrame(silhouette_coefficients)\n", 343 | " \n", 344 | "plt.style.use(\"fivethirtyeight\")\n", 345 | "plt.plot(range(2, try_clusters), silhouette_coefficients)\n", 346 | "plt.xticks(range(2, try_clusters))\n", 347 | "plt.xlabel(\"Number of Clusters\")\n", 348 | "plt.ylabel(\"Silhouette Coefficient\")\n", 349 | "plt.show()\n", 350 | "\n", 351 | "df_sil_coeff = pd.DataFrame(silhouette_coefficients).reset_index()\n", 352 | "optimal_silhouette_coefficients = df_sil_coeff.loc[df_sil_coeff[0]==max(silhouette_coefficients), 'index'][0] + 2\n", 353 | "print(\"Silhouette coefficients: optimal number of clusters is\", optimal_silhouette_coefficients)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Clustering using the optimal number of clusters chosen" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "chosen_clusters = 4" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "kmeans = KMeans(n_clusters=chosen_clusters, **kmeans_kwargs)\n", 379 | "identified_clusters = kmeans.fit_predict(X)\n", 380 | "\n", 381 | "df_cluster.loc[:, 'cluster'] = identified_clusters \n", 382 | "\n", 383 | "# Updating profiling dictionary\n", 384 | "dict_profiling['regular']['cluster'] = {}\n", 385 | "for c in range(0, len(dict_profiling['regular'])):\n", 386 | " dict_profiling['cluster'] = {dict_profiling['regular'][c]: df_cluster.loc[df_cluster.index==dict_profiling['regular'][c], 'cluster'].unique()[0]}\n", 387 | " print(id, c, dict_profiling[c])" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "### Plotting clustered regular series" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "df_to_plot = pd.melt(df_cluster.reset_index(), id_vars=[date_var, 'cluster'])\n", 404 | "for cluster in list(df_cluster['cluster'].unique()):\n", 405 | " count = 1\n", 406 | " for i in list(df_to_plot[id].unique()):\n", 407 | " print('Plotting id:', i, 'as', count, 'of',\n", 408 | " len(list(df_to_plot[id].unique())))\n", 409 | " chart_title = id + ' ' + str(i) + \" - Profile regular cluster \" + str(cluster)\n", 410 | " plot = Plots.sliding_line_plot(df_to_plot, y, id, i, chart_title)\n", 411 | " plot.write_html(os.path.join(root, cfg_path.data_dir.plot_path,\n", 412 | " id + '_' + str(i) + '_profile_regular_cluster_' + str(cluster) + \".html\"))\n", 413 | " count = count + 1\n" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "# Saving" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | 
"execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# create a binary pickle file \n", 430 | "f = open(os.path.join(root, cfg_path.data_dir.output_path, 'dict_profiling.pkl'),\"wb\")\n", 431 | "# write the python object (dict) to pickle file\n", 432 | "pickle.dump(dict_profiling,f)\n", 433 | "# close file\n", 434 | "f.close()\n" 435 | ] 436 | } 437 | ], 438 | "metadata": { 439 | "interpreter": { 440 | "hash": "bde6963c5f9d136d1b0963ec6638d0588f83e0d56652a4cd4ef0ca62bda372aa" 441 | }, 442 | "kernelspec": { 443 | "display_name": "Python 3.7.7 ('forecasting_energy')", 444 | "language": "python", 445 | "name": "python3" 446 | }, 447 | "language_info": { 448 | "codemirror_mode": { 449 | "name": "ipython", 450 | "version": 3 451 | }, 452 | "file_extension": ".py", 453 | "mimetype": "text/x-python", 454 | "name": "python", 455 | "nbconvert_exporter": "python", 456 | "pygments_lexer": "ipython3", 457 | "version": "3.7.7" 458 | }, 459 | "orig_nbformat": 4 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 2 463 | } 464 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![banner](Docs/Images/banner.jpg) 2 | 3 | # Forecasting 2.0 Accelerator 4 | [Forecasting 2.0 accelerator presentation](Docs/Slides/ds_toolkit_forecasting_2.0_memo.pdf) 5 | 6 | - [Forecasting 2.0 Accelerator](#forecasting-20-accelerator) 7 | - [Overview](#overview) 8 | - [I am a data scientist new to demand forecasting. How can this accelerator help me? What should I do to use it?](#i-am-a-data-scientist-new-to-demand-forecasting-how-can-this-accelerator-help-me-what-should-i-do-to-use-it) 9 | - [What do I need in terms of time series data to use this accelerator?](#what-do-i-need-in-terms-of-time-series-data-to-use-this-accelerator) 10 | - [Why this accelerator might be useful for you](#why-this-accelerator-might-be-useful-for-you) 11 | - [How to use this accelerator as guideline](#how-to-use-this-accelerator-as-guideline) 12 | - [Notebooks](#notebooks) 13 | - [1. EnergyDataExploration](#1-energydataexploration) 14 | - [2. EnergyPredictionDataPreparation](#2-energypredictiondatapreparation) 15 | - [3. EnergyProfilingIntermittent](#3-energyprofilingintermittent) 16 | - [4. EnergyClusteringRegular](#4-energyclusteringregular) 17 | - [5. 
EnergyPredictionScoring](#5-energypredictionscoring) 18 | - [How should I validate a model?](#how-should-i-validate-a-model) 19 | - [Interpreting errors](#interpreting-errors) 20 | - [Profiling (clustering) Time Series:](#profiling-clustering-time-series) 21 | - [Identifying intermittent time series:](#identifying-intermittent-time-series) 22 | - [How to identify intermittent time series:](#how-to-identify-intermittent-time-series) 23 | - [Intermittent indicators parameters](#intermittent-indicators-parameters) 24 | - [What if I am working with data that are not related to energy consumption?](#what-if-i-am-working-with-data-that-are-not-related-to-energy-consumption) 25 | - [References on intermittent time series: ](#references-on-intermittent-time-series-) 26 | - [Methods to forecast intermittent time series (not yet implemented in this accelerator):](#methods-to-forecast-intermittent-time-series-not-yet-implemented-in-this-accelerator) 27 | - [Constant](#constant) 28 | - [Constant at zero](#constant-at-zero) 29 | - [Unforecastable time and unforecastable quantity](#unforecastable-time-and-unforecastable-quantity) 30 | - [Spikes, lumpy, erratic](#spikes-lumpy-erratic) 31 | - [Clustering profiles](#clustering-profiles) 32 | - [Methods to forecast regular time series](#methods-to-forecast-regular-time-series) 33 | - [Getting Started](#getting-started) 34 | - [config.yaml file example](#configyaml-file-example) 35 | - [Default Directory Structure](#default-directory-structure) 36 | - [Build and Test](#build-and-test) 37 | - [Functions](#functions) 38 | - [Plotting](#plotting) 39 | - [Profiling](#profiling) 40 | - [Regressors](#regressors) 41 | - [Scoring](#scoring) 42 | - [Kpi](#kpi) 43 | - [Utils](#utils) 44 | - [Contributing](#contributing) 45 | - [As data scientist, how can I contribute?](#as-data-scientist-how-can-i-contribute) 46 | - [How to contribute to profiling?](#how-to-contribute-to-profiling) 47 | - [Insurance Claims data](#insurance-claims-data) 48 | - [How to contribute to data preparation and scoring?](#how-to-contribute-to-data-preparation-and-scoring) 49 | - [Trademarks](#trademarks) 50 | # Overview 51 | This accelerator provides code and guidance to produce time series forecasts and time series profiling. The aim of this accelerator is to help data scientists forecast multiple time series by building models based on time-series profiling, by performing accurate data preparation, and by training and forecasting the series with models created ad hoc for each profile. 52 | 53 | Time series modelling is defined as the combination of: 54 | 1. Choice of explanatory variables or regressors - which variables help me explain the target variable I want to forecast? 55 | 2. Choice of forecasting algorithm - which algorithm do I use to produce my forecast? ARIMA, linear regression, a boosting model? 56 | 3. Choice of train set - how many observations do I use to train my model and produce my forecast? 57 | 58 | Each model is optimized to better fit the training dataset and forecast the target variable: from energy consumption to spare parts demand. Classifying or clustering the profiles of time series data helps in defining the best-fitting model in terms of choice of regressors (calendar variables or temperatures), forecasting algorithm (ARIMA vs exponential smoothing) and train set (one year or just a few days of data). 59 | 60 | # I am a data scientist new to demand forecasting. How can this accelerator help me? What should I do to use it? 
61 | ## What do I need in terms of time series data to use this accelerator? 62 | This accelerator deals with so-called **panel data**. In statistics and econometrics, panel data or longitudinal data is a collection of data that contains observations about different cross sections (groups or ids), assembled over intervals in time and ordered chronologically. Examples of groups that may make up panel data series include countries, firms, individuals, or demographic groups. 63 | 64 | ![Alt text](Docs/Images/panel_data.png?raw=true "Panel data") 65 | 66 | Specifically: 67 | 68 | | Group or Id | Time period | Notation | 69 | | :--- | :--- | :--- | 70 | | 1 | 1 | $Y_{11}$ | 71 | | 1 | 2 | $Y_{12}$ | 72 | | 1 | T | $Y_{1T}$ | 73 | | $\vdots$ | $\vdots$ | $\vdots$ | 74 | | N | 1 | $Y_{N1}$ | 75 | | N | 2 | $Y_{N2}$ | 76 | | N | T | $Y_{NT}$ | 77 | 78 | Example datasets: 79 | 80 | | Field | Topics | Example dataset | 81 | | :--- | :--- | :--- | 82 | | Microeconomics | GDP across multiple countries, unemployment across different states, income dynamics studies, international current account balances | [Panel Study of Income Dynamics (PSID)](https://psidonline.isr.umich.edu/) | 83 | | Macroeconomics | International trade tables, world socioeconomic tables, currency exchange rate tables | [Penn World Tables](https://www.rug.nl/ggdc/productivity/pwt/) | 84 | | Epidemiology and Health Statistics | Public health insurance data, disease survival rate data, child development and well-being data | [Medical Expenditure Panel Survey](https://www.meps.ahrq.gov/mepsweb/) | 85 | | Finance | Stock prices by firm, market volatilities by country or firm | [Global Market Indices](https://finance.yahoo.com/world-indices/) | 86 | 87 | If you have a **single time series**, it can be thought of as a special case of panel data with one dimension only (one panel member or individual), so you can still take advantage of the accelerator, although it is not useful to run the profiler, since you will have just one profile by default. 88 | 89 | ## Why this accelerator might be useful for you 90 | 1. It provides you with guidelines in the form of notebooks that help you take into account all the steps necessary to perform good data preparation, which is crucial in forecasting 91 | 2. It provides you with a library of functions you might need when dealing with demand forecasting, such as: 92 | - Sliding plots like the one below: 93 | ![Alt text](Docs/Images/sliding_plot.png?raw=true "Sliding plot") 94 | - Adding holidays by country or other regressors such as months, weekdays and interaction terms 95 | - Creating normal temperature future scenarios to generate years-ahead forecasts 96 | - Filling missing data using similar-day or similar-week values 97 | - Computing errors like mean absolute error and mean absolute percentage error (also in case of a zero denominator...) 98 | - Wrapping up results in Excel or csv files 99 | 3. If you have several time series to forecast, the **Profiling** module allows you to quickly understand how "difficult" to forecast the time series you are dealing with are, by classifying them as intermittent or regular. Be aware that if profiling classifies a series as intermittent, you might not get consistent accuracy. This is crucial to set the right customer expectations on forecast accuracy. 
Profiling also helps you accelerate the production of forecasts when dealing with a large number of time series (more than 10 and fewer than 100): by grouping time series, for example into 2 intermittent + 4 regular consumption profiles, you can develop 6 models that are applied by category, thus reducing workload and increasing accuracy 100 | 4. It helps you quickly run backtesting with multiple models and choose the best model in terms of mean absolute error 101 | 102 | ## How to use this accelerator as guideline 103 | This accelerator provides you with 5 Notebooks that drive you through the essential steps you need to obtain a good forecast. 104 | 105 | ### Notebooks 106 | Notebooks are available in the Notebooks folder and provide guidance to use the Forecast 2.0 functions. 107 | #### 1. EnergyDataExploration 108 | [A notebook](./Notebooks/EnergyDataExploration.ipynb) that provides an exploratory data analysis in order to understand the type of time series you are dealing with 109 | #### 2. EnergyPredictionDataPreparation 110 | [A notebook](./Notebooks/EnergyPredictionDataPreparation.ipynb) that helps with Time Series Data Preparation, in particular how to deal with NAs, how to aggregate time series and how to create useful regressors (e.g. calendar variables) 111 | #### 3. EnergyProfilingIntermittent 112 | [A notebook](./Notebooks/EnergyProfilingIntermittent.ipynb) that profiles time series as regular, intermittent, lumpy, erratic, unforecastable in terms of time, unforecastable in terms of quantity, constant and constant at zero 113 | #### 4. EnergyClusteringRegular 114 | [A notebook](./Notebooks/EnergyClusteringRegular.ipynb) that performs a k-means flat cluster analysis on those time series that were classified as regular 115 | #### 5. EnergyPredictionScoring 116 | [A notebook](./Notebooks/EnergyPredictionScoring.ipynb) that helps you produce a forecast, plot the results and compute KPIs on a panel dataframe, where you have multiple time series identified by a given group or id (e.g. multiple sensor time series, multiple plants or site-id energy consumption, etc.) 117 | 118 | ## How should I validate a model? 119 | You can validate your model using the following KPIs (implemented; please refer to the EnergyPredictionScoring Notebook and to the Functions section below): 120 | 1. `Mean Error`: average of all errors (forecast - actual) 121 | 2. `Mean Absolute Error`: average of all absolute errors |forecast - actual| 122 | 3. `Mean Absolute Percentage Error`: average of all absolute errors divided by the actuals 123 | 124 | ### Interpreting errors 125 | As you can infer, the above KPI values depend on: 126 | - **Seasonality** 127 | This means that when you have, for example, yearly seasonality, you might have periods of the year where the model performs better and periods where it performs worse. Make sure you know which periods matter most for your use case. 128 | - **Low demand values** 129 | This means that when you have, for example, a lot of low-demand actual values and your forecast is in the neighbourhood of those values, your Absolute Percentage Error will easily end up very close to 1, significantly worsening your MAPE. Make sure to interpret your error results accordingly. 130 | 131 | Other important factors that can affect your error: 132 | - **Auto-regressive components** 133 | If you have data that allows you to employ auto-regressive components, i.e. the lagged values of the variable you want to forecast, this will improve your accuracy significantly. 
134 | - **Length of forecast horizon** 135 | If you need to forecast over a long horizon (i.e. you start from daily data granularity and you need to forecast years ahead), your accuracy will decrease 136 | - **Measurement error** 137 | If your data has a lot of outliers, missing data or measurement errors (e.g. sensor data), this will reduce your accuracy 138 | - **Collinearity** 139 | Multicollinearity is a statistical concept where several independent variables in a model are correlated. Two variables are considered to be perfectly collinear if their correlation coefficient is +/- 1.0. Multicollinearity among independent variables will result in less reliable statistical inferences. You might consider using techniques such as Principal Component Analysis in order to deal with the issue. 140 | 141 | # Profiling (clustering) Time Series: 142 | The **goal** is to identify consumption patterns that are similar to each other in order to assign the optimal model in terms of minimum MAE or MSE. 143 | 144 | The **first step** is to separate the series classified as "intermittent" from the "regular" ones, and **then** proceed to perform a k-means cluster analysis only on the latter. 145 | 146 | The **expected output** is a label for each time series as either intermittent or regular. 147 | 148 | ### Identifying intermittent time series: 149 | Definition of intermittent time series: intermittent time series or demand comes about when a product or a time series experiences several periods of zero demand. Often in these situations, when demand occurs it is small and sometimes highly variable in size. 150 | 151 | #### How to identify intermittent time series: 152 | Compute the following indicators: 153 | 1. Average Inter-demand Interval (ADI): this parameter is period based and is calculated as the average interval time between two demand occurrences 154 | 2. Coefficient of Variation Squared (CV2): this statistical parameter is calculated as the standard deviation of demand divided by the average demand for non-zero demand periods, squared. The squared coefficient of variation represents the variability of demand size. 155 | 3. Standard Deviation of Inter-demand Interval (SDDI) 156 | 157 | Based on their values, it is possible to classify time series as: 158 | - spikes 159 | - lumpy 160 | - erratic 161 | - unforecastable in terms of time volatility 162 | - unforecastable in terms of quantity volatility 163 | - constant 164 | - constant at zero 165 | - regular time series 166 | 167 | ![Alt text](Docs/Images/intermittent_TS.png?raw=true "Intermittent time series") 168 | 169 | #### Intermittent indicators parameters 170 | Intermittent indicator parameters vary depending on the type of time series (i.e. the data generation process of the time series), such as energy consumption in kWh or insurance claims in USD. Therefore, the intermittent indicators must be set each time depending on the type of time series, and they are validated by looking at the time series charts produced by the profiling Notebook. A minimal sketch of how the three indicators can be computed is shown below. 
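The sketch below is an illustration only, assuming a pandas Series of demand values at a fixed frequency; it is not the accelerator's `Intermittent` class (which works on grouped dataframes and also applies winsorization), but it shows one way the three indicators can be computed for a single series:

```python
# Illustrative only: computing ADI, CV2 and SDDI for one demand series.
# Assumes `demand` is a pandas Series at a fixed frequency.
import numpy as np
import pandas as pd

def intermittent_indicators(demand: pd.Series):
    positions = np.flatnonzero(demand.values > 0)   # periods with non-zero demand
    intervals = np.diff(positions)                  # inter-demand intervals (in periods)
    nonzero = demand[demand > 0]
    adi = intervals.mean() if intervals.size else np.nan        # Average Inter-demand Interval
    cv2 = (nonzero.std() / nonzero.mean()) ** 2 if len(nonzero) > 1 else np.nan  # squared CV of demand size
    sddi = intervals.std() if intervals.size else np.nan        # Std. Dev. of Inter-demand Intervals
    return adi, cv2, sddi

demand = pd.Series([0, 0, 5, 0, 0, 0, 7, 0, 6, 0, 0, 8],
                   index=pd.date_range("2022-01-01", periods=12, freq="D"))
print(intermittent_indicators(demand))  # high ADI/CV2 values point towards an intermittent profile
```

These values are then compared against the thresholds listed next (thres_adi, thres_cv2, thres_sddi, ...) to assign a profile.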
171 | 172 | Intermittent indicators are the following: 173 | - **thres_cv2_constant** defines the threshold value used to identify constant time series with respect to constant-at-zero time series 174 | - **thres_cv2** defines the threshold value between low CV2 and high CV2 175 | - **thres_adi** defines the threshold value between low ADI and high ADI 176 | - **thres_sddi** defines the threshold value between low SDDI and high SDDI 177 | - **min_time_cons** defines the minimum time between two demand entries (demand on versus demand off) 178 | 179 | Parameters for electricity consumption in kWh, daily data. 180 | - thres_cv2_constant = 0.06 181 | - thres_cv2 = 2 182 | - thres_adi = 3 183 | - thres_sddi = 6.2 184 | - min_time_cons = 2 185 | 186 | Parameters for insurance claims data in USD, daily data. Claims from work accidents in the mining industry. 187 | - thres_cv2_constant = 0.01 188 | - thres_cv2 = 0.2 189 | - thres_adi = 1.2 190 | - thres_sddi = 6.0 191 | - min_time_cons = 25 192 | 193 | ##### What if I am working with data that are not related to energy consumption? 194 | You can still use the accelerator and the profiler, but you need to set up new intermittent indicator parameters. To do so, create a copy of the DataPreparation and ProfilingIntermittent Notebooks, run the DataPreparation first and save your data. Load the data into the ProfilingIntermittent and, keeping in mind the [Intermittent Classification Chart](Docs/Images/intermittent_TS.png?raw=true "Intermittent time series"), set new values for thres_cv2_constant, thres_cv2, thres_adi, thres_sddi and min_time_cons, then check whether the resulting classification makes sense. 195 | 196 | ### References on intermittent time series: 197 | 198 | - [Lancaster Centre For Marketing Analytics and Forecasting](https://www.lancaster.ac.uk/lums/research/areas-of-expertise/centre-for-marketing-analytics-and-forecasting/) 199 | 200 | - [Methods for Intermittent Demand Forecasting](https://www.lancaster.ac.uk/pg/waller/pdfs/Intermittent_Demand_Forecasting.pdf) 201 | 202 | #### Methods to forecast intermittent time series (not yet implemented in this accelerator): 203 | ##### Constant 204 | - Moving average 205 | ##### Constant at zero 206 | - Moving average or actual zero value 207 | ##### Unforecastable time and unforecastable quantity 208 | - Do not use a statistical model; it is better to develop a deterministic model (i.e. 
based on if/then rules) 209 | ##### Spikes, lumpy, erratic 210 | - Croston’s method: Implementation in [sktime](https://www.sktime.org/en/v0.8.0/api_reference/auto_generated/sktime.forecasting.croston.Croston.html) 211 | - Adjusted Croston methods 212 | - Model-based forecasting methods 213 | - ARMA models 214 | - DARMA models -> Discrete ARMA 215 | - INARMA models -> Integer-valued ARMA (INARMA) 216 | 217 | ### Clustering profiles 218 | - Clustering regular time series using K-Means flat 219 | - Choose the optimal number of clusters 220 | - As a method to choose the optimal number of clusters, use the maximum explained variance at the minimum number of clusters -> Elbow Method 221 | ![Alt text](Docs/Images/elbow.png?raw=true "Elbow method") 222 | - Check whether identified profiles have a business meaning 223 | - Define and assign a best model: 224 | - use temperatures if heating or cooling is present in an energy consumption use case 225 | ![Alt text](Docs/Images/thermal.png?raw=true "Thermal time series") 226 | - use calendar variable correlations when temperature is not present 227 | ![Alt text](Docs/Images/calendar.png?raw=true "Calendar time series") 228 | 229 | #### Methods to forecast regular time series 230 | |# | Model | Library | Status | Notes | 231 | | :--- | :----: | ---: | ---: |---: | 232 | | 1 | Linear regression | [statsmodel](https://www.statsmodels.org/stable/api.html#univariate-time-series-analysis) |Implemented | | 233 | | 2 | Gradient boosting | [xgboost](https://xgboost.readthedocs.io/en/stable/) |Implemented | | 234 | | 3 | Random forest | [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) |Implemented | | 235 | | 4 | Kats |[Kats](https://facebookresearch.github.io/Kats/api/) |Not yet implemented | | 236 | | 5 | Prophet | [Prophet](https://facebook.github.io/prophet/docs/quick_start.html#python-api)|Not yet implemented |Decompose into trend + season + holiday, etc | 237 | | 6 |Neural networks|[Neural prophet](https://neuralprophet.com/html/index.html) |Not yet implemented | | 238 | | 7 |Probabilistic model|[PyFlux](https://github.com/RJT1990/pyflux) |Not yet implemented | | 239 | | 8|Scikit-learn wrapper|[Sktime](https://www.sktime.org/en/stable/) |Not yet implemented | | 240 | | 9|Automatic time series|[AutoTimeSeries](https://github.com/AutoViML/Auto_TS) |Not yet implemented | | 241 | | 10 |Create synthetic time series for model testing|[TimeSynth](https://github.com/TimeSynth/TimeSynth) |Not yet implemented | | 242 | | 11 |Computes series characteristics|[Tsfresh](https://github.com/blue-yonder/tsfresh) |Not yet implemented | | 243 | | 12 |ARIMA and deep NN|[Darts](https://github.com/unit8co/darts) |Not yet implemented | | 244 | | 13 |Uber forecasting package|[Orbit](https://github.com/uber/orbit) |Not yet implemented | pystan backend | 245 | | 14 |Converting dates|[Arrow](https://github.com/arrow-py/arrow) |Not yet implemented | | 246 | | 15 |Hydro(geo)logical time series analysis|[Pastas](https://github.com/pastas/pastas) |Not yet implemented | | 247 | | 16|Deep learning|[Flow forecast](https://github.com/AIStream-Peelout/flow-forecast) |Not yet implemented | | 248 | | 17 |Automating iterative tasks of machine learning model development|[AutoML in Azure ML](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#supported-models) |Not yet implemented | | 249 | | 18 |Netflix forecasting package | [Metaflow](https://docs.metaflow.org/introduction/what-is-metaflow) |Not yet implemented | | 250 | 
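To illustrate how one of the implemented model families can be applied to a regular series, here is a minimal end-to-end sketch. It is not the accelerator's Training/Forecasting classes (those are documented in the Functions section below); the toy dataframe, column names and hyperparameters are made up for the example, which fits gradient boosting via xgboost on calendar regressors:

```python
# Illustrative sketch only (not the accelerator's Training/Forecasting classes):
# fit gradient boosting on calendar regressors and score a hold-out period with MAE.
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Toy daily series: higher consumption on weekdays plus a slow trend
df = pd.DataFrame({"timestamp": pd.date_range("2021-01-01", periods=400, freq="D")})
df["value"] = 100 + 10 * (df["timestamp"].dt.dayofweek < 5) + 0.05 * pd.Series(range(len(df)))

# Calendar regressors: weekday and month dummies
X = pd.get_dummies(df["timestamp"].dt.dayofweek, prefix="wd", dtype=int).join(
    pd.get_dummies(df["timestamp"].dt.month, prefix="month", dtype=int))

train_mask = df["timestamp"] < "2022-01-01"   # train on 2021, test on the rest
model = xgb.XGBRegressor(n_estimators=200, max_depth=3, learning_rate=0.1)
model.fit(X[train_mask], df.loc[train_mask, "value"])

forecast = model.predict(X[~train_mask])
print("MAE:", mean_absolute_error(df.loc[~train_mask, "value"], forecast))
```

In the accelerator itself, the equivalent steps are handled by the TrainTest, Training, Forecasting and Kpi classes described in the Functions section below.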
251 | # Getting Started 252 | 1. Create a new conda environment named forecasting_energy using the `forecasting_energy.yml` file in the `Environment` folder of the repository. To install a new environment using conda, you can open Anaconda Navigator, click on Import, name the new environment forecasting_energy, select Python 3.8 and use the path to forecasting_energy.yml to install the new environment. Or you can use the following command: 253 | ```sh 254 | conda env create -f ./Environment/forecasting_energy.yml 255 | ``` 256 | 257 | 2. To get an idea of the software dependencies, read `requirements.txt` 258 | 3. Create a `config.yaml` in the `Configuration` folder in order to run the code on your local machine/virtual machine. This is an example of the file (`config.yaml`): 259 | ```sh 260 | data_dir: 261 | input_path: "Data/Input" 262 | output_path: "Data/Output" 263 | plot_path: "Data/Plots" 264 | ``` 265 | 266 | 4. Create your input, output and plot paths 267 | 5. Load the [test dataset from Kaggle](https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download), unzip it and save it in your input folder 268 | 269 | ## Default Directory Structure 270 | 271 | ```bash 272 | ├───Code # Folder containing all the custom functions created for this accelerator 273 | │ ├───Plotting # Plotting functions 274 | │ └───Profiling # Profiling time series functions 275 | │ ├───Intermittent # Identification and classification of intermittent time series functions 276 | │ └───Regressors # Create useful time series regressors, such as calendar variables or temperature transformations 277 | │ └───Scoring # Create train and test sets, training, forecasting and computing KPIs functions 278 | │ └───Utils # Several utils functions called in the notebooks 279 | ├── Configuration # config.py that loads config.yaml with the configuration 280 | ├───Docs # Additional documents 281 | ├───Notebooks # Notebooks to do Profiling, Data Preparation, Scoring and Forecasting 282 | ├───Tests # Test Notebooks to do Profiling, Data Preparation, Scoring and Forecasting on various use cases 283 | ├── .gitignore 284 | ├── CODE_OF_CONDUCT.md 285 | ├── LICENSE.md 286 | ├── README.md 287 | ├── requirements.txt 288 | ├── SECURITY.md 289 | └── SUPPORT.md 290 | ``` 291 | 292 | ## Build and Test 293 | 1. Create a config.yaml as described above and fill it in as follows: 294 | - In data_dir set your folder tree for the input, output and plot folders 295 | - In saving choose your saving preferences 296 | 297 | # Functions 298 | Functions are available in the Code folder. 299 | 300 | ## Plotting 301 | - Class Plots 302 | 303 | ```sh 304 | sliding_line_plot(df, serie_to_plot, id, i, chart_title="") 305 | ``` 306 | 307 | - Creates a sliding time series chart 308 | 309 | ```sh 310 | sliding_fcst_plot(df, predict_col, expected_values, chart_title="", kpi=True) 311 | ``` 312 | 313 | Creates a forecast vs actual sliding time series chart, with KPI option 314 | 315 | 316 | ## Profiling 317 | - Class Intermittent 318 | ```sh 319 | cv2_by_group(df, y, grouping_var, highest=0.05, lowest=0.05): 320 | ``` 321 | - Computes cv2 by group 322 | ```sh 323 | cv2(array, highest=0.05, lowest=0.05): 324 | ``` 325 | - Computes cv2 (the highest and lowest arguments control winsorization). Winsorization is the process of replacing the extreme values of statistical data in order to limit 326 | the effect of the outliers on the calculations or the results obtained by using that data. 327 | The mean value calculated after such replacement of the extreme values is called winsorized mean. 
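To make the winsorized CV2 idea concrete, here is a minimal, illustrative sketch (assumed behaviour, not the exact implementation of the accelerator's `cv2()` function) using `scipy.stats.mstats.winsorize`, which the accelerator already imports:

```python
# Illustrative sketch of a winsorized CV2 (assumed behaviour, not the exact
# implementation in Code/Profiling/Intermittent/intermittent.py).
import numpy as np
from scipy.stats.mstats import winsorize

def cv2_winsorized(array, highest=0.05, lowest=0.05):
    # trim the bottom `lowest` and top `highest` fractions of values
    w = np.asarray(winsorize(np.asarray(array, dtype=float), limits=(lowest, highest)))
    nonzero = w[w > 0]                      # CV2 is computed on non-zero demand periods
    return (nonzero.std() / nonzero.mean()) ** 2

print(cv2_winsorized([0, 3, 4, 5, 120, 4, 0, 6], highest=0.2, lowest=0.2))  # 120 is clipped to the next-largest value
```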
328 | ```sh 329 | adi(array, highest=0.05, lowest=0.05): 330 | ``` 331 | - Computes ADI (Average Inter-demand Interval) 332 | ```sh 333 | sddi(array, highest=0.05, lowest=0.05): 334 | ``` 335 | - Computes SDDI (Standard Deviation of Inter-demand Interval) 336 | ```sh 337 | compute_indicator_values(vect, threshold, perc, quant, highest, lowest): 338 | ``` 339 | - Computes indicator values 340 | ```sh 341 | enh_compute_indicator_values(vect, threshold, perc, quant, highest, lowest): 342 | ``` 343 | Computes indicator values (enhanced) 344 | 345 | ## Regressors 346 | - Class Regressors 347 | ```sh 348 | create_interactions(df, var1, var2) 349 | ``` 350 | Adds interaction terms between two variables as var1*var2 to the dataframe 351 | 352 | ```sh 353 | create_non_linear_terms(df, var, n) 354 | ``` 355 | Adds non-linear terms such as var^2 to the dataframe 356 | 357 | ```sh 358 | add_holidays_by_country(df, date_var, country) 359 | ``` 360 | Adds holidays as a dummy variable (0/1) to the dataframe 361 | 362 | ```sh 363 | add_weekdays(df, date_var) 364 | ``` 365 | Adds a dummy variable (0/1) for each weekday to the dataframe 366 | 367 | ```sh 368 | add_months(df, date_var) 369 | ``` 370 | Adds a dummy variable (0/1) for each month to the dataframe 371 | 372 | ```bash 373 | calculate_degree_days(df, base_temperature, temperature) 374 | ``` 375 | Calculates the Heating and Cooling Degree Days values 376 | 377 | ```bash 378 | merge_holidays_by_date(df, df_holidays, id) 379 | ``` 380 | Merges the holiday df with the train df 381 | 382 | ```bash 383 | merge_additional_days_off(df, df_metadata, id, dict_days_off) 384 | ``` 385 | Merges site weekend data with the train df 386 | 387 | ```bash 388 | merge_weather(df, weather, date_var, id) 389 | ``` 390 | Merges weather data into the train df 391 | 392 | - Class SimilarDay: 393 | ```bash 394 | get_similar_days_in_previous_year(dates, country) 395 | ``` 396 | Retrieves the similar day in the previous year for each given date 397 | 398 | ```bash 399 | get_similar_days_in_previous_week(dates, country) 400 | ``` 401 | Retrieves the similar day in the previous week for each given date 402 | 403 | ```bash 404 | get_similar_day_in_previous_year(d, holiday_calendar) 405 | ``` 406 | Retrieves the similar day for a given date. If the given date is not a holiday, the similar day is the closest day of the previous year in terms of calendar position which shares the weekday. If such a date is a holiday, the same weekday of the week before is considered. 407 | 408 | ```bash 409 | get_similar_day_in_previous_week(d, holiday_calendar) 410 | ``` 411 | Retrieves the similar day for a given date. If the given date is not a holiday, the similar day is the closest day of the previous year in terms of calendar position which shares the weekday. If such a date is a holiday, the same weekday of the week before is considered. If the given date is a holiday, its similar day is the closest holiday to the given date in the previous year. 
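The similar-day rule described above can be sketched as follows. This is an assumed, simplified interpretation that covers only the non-holiday branch; it is not the accelerator's `SimilarDay` implementation, and it uses the `holidays` Python package (the same package the holiday regressors are based on):

```python
# Simplified, assumed interpretation of the similar-day rule above
# (not the accelerator's SimilarDay code); handles the non-holiday case only.
import datetime as dt
import holidays

def similar_day_previous_year(d: dt.date, holiday_calendar) -> dt.date:
    candidate = d - dt.timedelta(days=364)   # 52 weeks back: same weekday, close calendar position
    while candidate in holiday_calendar:     # if that day is a holiday, fall back one more week
        candidate -= dt.timedelta(days=7)
    return candidate

us_holidays = holidays.UnitedStates()
print(similar_day_previous_year(dt.date(2022, 7, 6), us_holidays))   # -> a Wednesday in July 2021
```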
412 | 413 | - Class StandardConsumption: 414 | ```bash 415 | get_standard_consumption_as_mean(df, id, date_var, var, country) 416 | ``` 417 | Retrieves the standard consumption for a given date as hourly monthly mean differentiated by holiday, weekend, weekdays 418 | 419 | - Class Temperatures: 420 | ```bash 421 | ten_year(df, id, date_var = 'date_daily', start_date ='', end_date='31/12/2050') 422 | ``` 423 | Computes ten year averages temperatures and As-Is temperatures: where available use actual temp, if not use ten year averages 424 | 425 | ```bash 426 | get_minimum_consumption(df, date_var, var, country) 427 | ``` 428 | Retrieves the minimum consumption for a given date as hourly monthly minimum value differentiated by holiday, weekend, night 429 | 430 | ## Scoring 431 | - Class Training 432 | ```bash 433 | train(dict_model_to_train, model) 434 | ``` 435 | Generate train 436 | 437 | - Class Forecasting 438 | ```bash 439 | forecast(dict_test, trained_model) 440 | ``` 441 | Generate forecast 442 | 443 | - Class Scoring 444 | ```bash 445 | find_best_algorithm(y, dict_train, dict_test, dict_algorithms, out_of_sample) 446 | ``` 447 | Finds the best performing algorithm in terms of min mean absolute error 448 | 449 | ```bash 450 | stats_per_site(df, id, date_var) 451 | ``` 452 | Helper function to identify amount of data per site 453 | 454 | ```bash 455 | resample_train_data(df, date_var, id, predict_col, sampling="D") 456 | ``` 457 | Resample the data to a particular frequency 458 | 459 | - Class TrainTest 460 | ```bash 461 | define_train_test_set_dates(df, y, train_start_date, train_end_date, test_start_date, test_end_date, test_size=0.33) 462 | ``` 463 | Defines train and test dates if left blank 464 | 465 | ```bash 466 | def_train(df, y, list_id, train_start_date='', train_end_date='') 467 | ``` 468 | Define train dataset 469 | 470 | ```bash 471 | def_test(df, y, list_id, test_start_date='', test_end_date='') 472 | ``` 473 | Define test dataset 474 | 475 | ## Kpi 476 | - Class Kpi 477 | ```bash 478 | find_mae(y, dict_train, dict_test, dict_models): 479 | ``` 480 | Compute mean absolute error 481 | ```bash 482 | compute_error(df, fcst, y): 483 | ``` 484 | Compute error as forecast-actual 485 | ```bash 486 | compute_absolute_error(df, fcst, y): 487 | ``` 488 | Compute absolute error as abs(forecast-actual) 489 | ```bash 490 | compute_absolute_percentage_error(df, fcst, y): 491 | ``` 492 | Compute absolute % error 493 | ```bash 494 | compute_mean_error(df, fcst, y): 495 | ``` 496 | Compute mean error 497 | ```bash 498 | compute_mae(df, fcst, y): 499 | ``` 500 | Compute mean absolute error 501 | ```bash 502 | compute_mape(df, fcst, y): 503 | ``` 504 | Compute mean absolute % error 505 | 506 | ## Utils 507 | - Class Utils 508 | ```bash 509 | def camel_to_snake(name) 510 | ``` 511 | Changes string from camel case to snake case 512 | ```bash 513 | columns_camel_to_snake(df) 514 | ``` 515 | Changes dataframe columns from camel case to snake case 516 | ```bash 517 | find_date(df) 518 | ``` 519 | Finds date columns in a dataframe 520 | ```bash 521 | find_match_in_list(list_to_match, match_to_find): 522 | ``` 523 | Finds a match in a list given a list of possible words to match 524 | ```bash 525 | delta_format(delta: np.timedelta64) -> str: 526 | ``` 527 | Identifies frequency in numpy timedelta 528 | ```bash 529 | find_freq(timedelta): 530 | ``` 531 | Finds frequency in numpy timedelta 532 | ```bash 533 | find_freq_in_dataframe(df, date_var) 534 | ``` 535 | Finds frequency in pandas dataframe 536 
| ```bash 537 | create_folder_tree(folder_name) 538 | ``` 539 | Creates a folder tree 540 | ```bash 541 | get_project_root(Path): 542 | ``` 543 | Finds the parent folder of the project 544 | ```bash 545 | add_daily_date(df): 546 | ``` 547 | Adds a date variable at daily frequency to the dataframe 548 | ```bash 549 | find_categorical_variables(df): 550 | ``` 551 | Finds categorical variables in a pandas dataframe 552 | ```bash 553 | resample_data(df, id, date_var, sampling, dict_grouping) 554 | ``` 555 | Resamples by aggregating the data to a particular frequency, applying the functions defined in dict_grouping as {variable_to_resample: 'function_to_apply'}, e.g. {value: 'sum'} 556 | ```bash 557 | resample_data(df, id, date_var, sampling, dict_grouping) 558 | ``` 559 | Resamples by aggregating the data to a particular frequency (x-m, x-h, x-D, e.g. 3-M), with the aggregation per column defined as {variable_to_resample: 'function_to_apply'}, e.g. {value: 'sum'} 560 | ```bash 561 | add_seq(df, date_var, serie, freq, end_date='', start_date='') 562 | ``` 563 | Adds a sequence of complete dates/hours to a dataframe 564 | ```bash 565 | check_length_time_serie(df, date_var, index) 566 | ``` 567 | Checks the length that a time series of complete dates/hours should have, so that it can be compared 568 | with the actual observations 569 | ```bash 570 | match_to_find(serie_to_find) 571 | ``` 572 | Finds a match in a list of possible words to match 573 | ```bash 574 | find_match(df, serie_name, match_to_find): 575 | ``` 576 | Finds a match in a dataframe series given a list of possible words to match 577 | ```bash 578 | find_match_in_list(list_to_match, match_to_find) 579 | ``` 580 | Finds a match in a list given a list of possible words to match 581 | ```bash 582 | id_outliers_IQR(df, q1, q3, date_var, id, var, freq_var) 583 | ``` 584 | Identifies outliers by creating a dummy variable (0/1) called outlier using the IQR method, where the quantile values can be set 585 | 586 | - Class AlphabeticalCombinations 587 | ```bash 588 | write_neat_csv(saving_file, df_fcst) 589 | ``` 590 | Writes a neat csv 591 | ```bash 592 | convert(string) 593 | ``` 594 | Converts a string to a list 595 | ```bash 596 | excel_columns() 597 | ``` 598 | Counts Excel columns 599 | ```bash 600 | write_beautiful_excel(saving_file, dict_df_to_write) 601 | ``` 602 | Writes a nicely formatted Excel file 603 | ```bash 604 | write_beautiful_excel_table(saving_file, dict_df_to_write) 605 | ``` 606 | Writes nicely formatted Excel tables 607 | 608 | # Contributing 609 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 610 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 611 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 612 | 613 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 614 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 615 | provided by the bot. You will only need to do this once across all repos using our CLA. 616 | 617 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 618 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 619 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 620 | 621 | ## As data scientist, how can I contribute? 
622 | You can contribute both in extending the **Profiling** tool and in the data preparation and scoring part of this accelerator. 623 | 624 | ### How to contribute to profiling? 625 | What needs to be done is to test and define intermittent indicators (thres_cv2_constant, thres_cv2, thres_adi, thres_sddi, min_time_cons) for other types of data than electricity consumption, as reported below. 626 | 627 | #### Insurance Claims data 628 | Insurance claims data in USD, daily data. Claims from work accidents in mining industry. 629 | 630 | - thres_cv2_constant = 0.01 631 | - thres_cv2 = 0.2 632 | - thres_adi = 1.2 633 | - thres_sddi = 6.0 634 | - min_time_cons = 25 635 | 636 | ### How to contribute to data preparation and scoring? 637 | What needs to be done is to improve the code to make it scalable and more efficient when working with big datasets (e.g. more than 100 id). 638 | 639 | # Trademarks 640 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 641 | trademarks or logos is subject to and must follow 642 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 643 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 644 | Any use of third-party trademarks or logos are subject to those third-party's policies. 645 | 646 | 647 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 
26 | -------------------------------------------------------------------------------- /Tests/InsuranceClaimsDataPreparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "dc3e4402", 6 | "metadata": {}, 7 | "source": [ 8 | "# Implementation" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "sWbXCGozBRNW", 14 | "metadata": { 15 | "id": "sWbXCGozBRNW" 16 | }, 17 | "source": [ 18 | "## Packages" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "kmxpysFu7zjH", 25 | "metadata": { 26 | "colab": { 27 | "base_uri": "https://localhost:8080/" 28 | }, 29 | "id": "kmxpysFu7zjH", 30 | "outputId": "db2717d5-22be-4fa8-99fb-3f9ea90e7e1b" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# data elaboration functions\n", 35 | "import pandas as pd\n", 36 | "import string\n", 37 | "import numpy as np\n", 38 | "\n", 39 | "# datetime functions\n", 40 | "import datetime as dt\n", 41 | "\n", 42 | "# file management functions\n", 43 | "import os\n", 44 | "import sys\n", 45 | "import opendatasets as od\n", 46 | "import pickle\n", 47 | "from pathlib import Path\n", 48 | "\n", 49 | "# plot functions\n", 50 | "import matplotlib.pyplot as plt\n", 51 | "%matplotlib inline\n", 52 | "\n", 53 | "# data science functions\n", 54 | "import xgboost as xgb\n", 55 | "from sklearn.model_selection import train_test_split\n", 56 | "from sklearn.ensemble import RandomForestRegressor\n", 57 | "from sklearn.linear_model import LinearRegression\n", 58 | "import joblib\n", 59 | "from sklearn.metrics import mean_absolute_error\n", 60 | "\n", 61 | "# configuration file\n", 62 | "module_path = os.path.abspath(os.path.join('..'))\n", 63 | "if module_path not in sys.path:\n", 64 | " sys.path.append(module_path)\n", 65 | "from Configuration.config import cfg_path\n", 66 | "\n", 67 | "# custom functions\n", 68 | "from Code.Plotting.plots import Plots\n", 69 | "from Code.Regressors.regressors import Regressors\n", 70 | "from Code.Regressors.temperatures import Temperatures\n", 71 | "from Code.Scoring.scoring import Scoring\n", 72 | "from Code.Scoring.train_test import TrainTest\n", 73 | "from Code.Scoring.train import Training\n", 74 | "from Code.Scoring.forecast import Forecasting\n", 75 | "from Code.Scoring.kpi import Kpi\n", 76 | "from Code.Scoring.scoring import Scoring\n", 77 | "from Code.Utils.utils import Utils\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "8dc26b7b", 83 | "metadata": {}, 84 | "source": [ 85 | "## Setup" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "458162d0", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "root = Path(os.getcwd()).parent\n", 96 | "dataset_path = os.path.join(root, cfg_path.data_dir.input_path)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "4Q-4BToWB7LC", 102 | "metadata": { 103 | "id": "4Q-4BToWB7LC" 104 | }, 105 | "source": [ 106 | "## Load Data\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "d7e24623", 113 | "metadata": { 114 | "colab": { 115 | "base_uri": "https://localhost:8080/" 116 | }, 117 | "id": "d7e24623", 118 | "outputId": "30507a03-42e3-4f9e-8b2b-bccb623a06c9" 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "df_train_data = pd.read_csv(os.path.join(\n", 123 | " root, cfg_path.data_dir.input_path, 'insurance-claims.csv'))\n", 124 | "df_train_data.head()" 125 | ] 126 | }, 127 | { 
128 | "cell_type": "markdown", 129 | "id": "1ShqG6YJGmBk", 130 | "metadata": { 131 | "id": "1ShqG6YJGmBk" 132 | }, 133 | "source": [ 134 | "# Data Preparation\n" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "f23ed7fb", 140 | "metadata": {}, 141 | "source": [ 142 | "## Parameter setup" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "0ddada30", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "id = 'ICD10Description'\n", 153 | "list_unique_id = ['ICD10Description', 'DateOfAccident']\n", 154 | "list_temp = []\n", 155 | "y = 'Sum of PaidDaysValue'" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "78309614", 161 | "metadata": {}, 162 | "source": [ 163 | "#### Setting date as datetime" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "51b01c28", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "df_train_data['DateOfAccident'] = pd.to_datetime(df_train_data['DateOfAccident'], format = '%d-%m-%y %H:%M:%S %p')" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "2799c9d5", 179 | "metadata": {}, 180 | "source": [ 181 | "#### Setting forecast end date" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "5f55942a", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "# Make sure to have all regressors available until forecast_end_date (temperatures, etc)\n", 192 | "forecast_end_date = '2022-12-31'" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "08af66c3", 198 | "metadata": {}, 199 | "source": [ 200 | "## Plotting y series" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "23685319", 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# Print available ids and choose which one to plot \n", 211 | "print(list(df_train_data[id].unique())[0:20])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "a1fabf6b", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Adjusting id names by removing special characters\n", 222 | "import re\n", 223 | "df_train_data.loc[:, id] = df_train_data.loc[:, id].apply(lambda x: re.sub('[^A-Za-z0-9]+', '_', x))\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "6e669264", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# Selecting 100 ids to plot\n", 234 | "list_ids_to_plot = list(df_train_data[id].unique()[0:100])" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "109aaf82", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "count = 1\n", 245 | "for i in list_ids_to_plot:\n", 246 | " print('Plotting id:', i, 'as', count, 'of', len(list_ids_to_plot))\n", 247 | " plot = Plots.sliding_line_plot(df_train_data, y, id, i, chart_title=\"\")\n", 248 | " plot.write_html(os.path.join(root, cfg_path.data_dir.plot_path, id + '_' + str(i) + \".html\"))\n", 249 | " count = count + 1 " 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "id": "c0be27d0", 255 | "metadata": {}, 256 | "source": [ 257 | "## Dealing with NAs and aggregating at a chosen frequency" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "0e88444e", 263 | "metadata": {}, 264 | "source": [ 265 | "Create a full time sequence on a chosen frequency and aggregate" 266 | ] 267 | }, 268 
| { 269 | "cell_type": "markdown", 270 | "id": "26de9cc1", 271 | "metadata": {}, 272 | "source": [ 273 | "#### Consumption data (y)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "77429654", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# Selecting 100 ids to elaborate\n", 284 | "df_train_data = df_train_data.loc[df_train_data[id].isin(list_ids_to_plot), ]\n", 285 | "date_var = Utils.find_date(df_train_data)\n", 286 | "print('List ids:', list_ids_to_plot)\n", 287 | "len(list_ids_to_plot)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "f711e287", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "# Resampling function aggregates data in a dataframe with a chosen function, that can vary depending on the variable\n", 298 | "# i.e. temperatures when aggregated should be averaged, consumption should be summed, dummy variables should be pick as 'first'\n", 299 | "\n", 300 | "df_train_data[date_var].apply(lambda x: x.tz_localize(None))\n", 301 | "sampling = dt.timedelta(days=1)\n", 302 | "dict_grouping = {'RmaRegionDesc': 'first', 'Product': 'first', 'Sum of PaidDaysValue': 'sum'}\n", 303 | "df_resampled = Utils.resample_data(df_train_data, id, date_var, sampling, dict_grouping)\n", 304 | "print('List ids after resampling:', list(df_resampled[id].unique()))" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "id": "fecd0d49", 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# Adding a full time sequence\n", 315 | "df_train_data = Utils.add_seq(df_resampled, date_var, serie = id, freq = sampling, end_date=forecast_end_date, start_date='')" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "650cf7b7", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "# This function count the number of obs you should have if you had a full time sequence\n", 326 | "Utils.check_length_time_serie(df_train_data, date_var, index = id).head()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "id": "7d18510c", 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "df_train_data.head()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "id": "42bc870d", 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "print('List ids after resampling and adding full time sequence:', list(df_train_data[id].unique()))" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "id": "8a56ffd2", 352 | "metadata": {}, 353 | "source": [ 354 | "## Creating working dataset" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "id": "6mGY36qeLgvf", 361 | "metadata": { 362 | "id": "6mGY36qeLgvf" 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "# Final df\n", 367 | "df_final = df_train_data.copy()\n", 368 | "\n", 369 | "# Date\n", 370 | "date_var = Utils.find_date(df_final)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "id": "53a5656c", 376 | "metadata": {}, 377 | "source": [ 378 | "#### Count NAs in y by id" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "59ba6bca", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "df_final.head()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "6a3889e4", 395 | "metadata": {}, 396 
| "outputs": [], 397 | "source": [ 398 | "pivotna = pd.pivot_table(df_final[df_final[y].isna()], index=id, values = y, aggfunc='count').reset_index()\n", 399 | "pivotna.rename(columns={y: y + '_count_NA'})\n", 400 | "pivotna" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "id": "6740bfb1", 406 | "metadata": {}, 407 | "source": [ 408 | "### Adding regressors to final dataframe" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "id": "e5c112c5", 414 | "metadata": {}, 415 | "source": [ 416 | "#### Holidays" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "id": "2830270a", 422 | "metadata": {}, 423 | "source": [ 424 | "If you don't have specific holiday dataset, you can use the following general function by country that uses the holiday python package and adds to your dataframe a columns with a holiday dummy variable (0/1):\n", 425 | "\n", 426 | " df_final = Regressors.add_holidays_by_country(df_final, date_var, country = 'France')" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "id": "805ebacf", 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "df_final = Regressors.add_holidays_by_country(df_final, date_var, country='United States')\n", 437 | "print('Min date:', df_final[date_var].min())\n", 438 | "print('Max date:', df_final[date_var].max())" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "id": "395e6780", 444 | "metadata": {}, 445 | "source": [ 446 | "#### Other calendar variables" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "id": "196089f6", 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "df_final = Regressors.add_weekdays(df_final, date_var)\n", 457 | "df_final = Regressors.add_months(df_final, date_var)\n", 458 | "print('Min date:', df_final[date_var].min())\n", 459 | "print('Max date:', df_final[date_var].max())" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "id": "6743f041", 465 | "metadata": {}, 466 | "source": [ 467 | "#### Remove duplicates" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "id": "fbcb2765", 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "df_final = df_final.drop_duplicates()\n", 478 | "print('List ids in df_final after removing duplicates:', list(df_final[id].unique()))\n", 479 | "assert df_final[df_final.duplicated()].count().sum() == 0, \"y should not contain duplicates\"\n", 480 | "print('Min date:', df_final[date_var].min())\n", 481 | "print('Max date:', df_final[date_var].max())" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "id": "a7809c54", 487 | "metadata": {}, 488 | "source": [ 489 | "#### Check regressor availability" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "id": "4ea99f83", 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "df_final.columns" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "id": "e7945831", 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "# Temperatures have been filled, only temperature asis that is the composition between the actual temperature and ten year averages\n", 510 | "regressors_list = [ 'holidays','RmaRegionDesc', 'Product',\n", 511 | " 'holidays', 'wd_mon', 'wd_tue', 'wd_wed',\n", 512 | " 'wd_thu', 'wd_fri', 'wd_sat', 'wd_sun', 'month_01', 'month_02',\n", 513 | " 'month_03', 'month_04', 'month_05', 'month_06', 'month_07', 
'month_08',\n", 514 | " 'month_09', 'month_10', 'month_11', 'month_12']\n", 515 | "\n", 516 | "try:\n", 517 | " Utils.check_regressors_availability(df_final, date_var, regressors_list, forecast_end_date)\n", 518 | "except:\n", 519 | " Utils.remove_regressors_with_nan(df_final, date_var, regressors_list, forecast_end_date)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "id": "f6dff377", 525 | "metadata": {}, 526 | "source": [ 527 | "# Saving" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "id": "4715ab4e", 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "df_final.to_pickle(os.path.join(root, cfg_path.data_dir.output_path, 'insurance_claims_final.pkl'))" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "id": "bd0951d8", 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "print('Min date:', df_final[date_var].min())\n", 548 | "print('Max date:', df_final[date_var].max())\n", 549 | "df_final.head()\n" 550 | ] 551 | } 552 | ], 553 | "metadata": { 554 | "colab": { 555 | "collapsed_sections": [ 556 | "AbKOiffyAql8", 557 | "6YxUycDC9p0h" 558 | ], 559 | "name": "Analysis (1).ipynb", 560 | "provenance": [] 561 | }, 562 | "interpreter": { 563 | "hash": "2b8f5b14411d0017ed363cef4929504a7281087d06f1b18c01da6e951b937e80" 564 | }, 565 | "kernelspec": { 566 | "display_name": "Python 3.7.7 ('forecasting_energy')", 567 | "language": "python", 568 | "name": "python3" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 3 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython3", 580 | "version": "3.7.7" 581 | } 582 | }, 583 | "nbformat": 4, 584 | "nbformat_minor": 5 585 | } 586 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | adal==1.2.5 3 | adjustText==0.7.3 4 | altair==4.1.0 5 | antlr4-python3-runtime==4.8 6 | applicationinsights==0.11.9 7 | argcomplete==1.12.3 8 | argon2-cffi==21.1.0 9 | astor==0.8.1 10 | astunparse==1.6.3 11 | async-generator==1.10 12 | attrs==21.2.0 13 | autopep8==1.5.7 14 | azure-cognitiveservices-vision-customvision==3.0.0 15 | azure-common==1.1.26 16 | azure-core==1.23.0 17 | azure-graphrbac==0.61.1 18 | azure-identity==1.4.1 19 | azure-keyvault-secrets==4.4.0 20 | azure-mgmt-authorization==0.61.0 21 | azure-mgmt-containerregistry==2.8.0 22 | azure-mgmt-keyvault==2.2.0 23 | azure-mgmt-resource==10.3.0 24 | azure-mgmt-storage==11.2.0 25 | azure-storage-blob==12.10.0 26 | azureml-automl-core==1.18.0.post1 27 | azureml-core==1.17.0 28 | azureml-dataprep==2.4.4 29 | azureml-dataprep-native==24.0.0 30 | azureml-dataprep-rslex==1.2.3 31 | azureml-dataset-runtime==1.18.0 32 | azureml-defaults==1.18.0 33 | azureml-model-management-sdk==1.0.1b6.post1 34 | azureml-pipeline==1.18.0 35 | azureml-pipeline-core==1.18.0 36 | azureml-pipeline-steps==1.18.0 37 | azureml-sdk==1.18.0 38 | azureml-telemetry==1.18.0 39 | azureml-train==1.18.0 40 | azureml-train-automl-client==1.18.0 41 | azureml-train-core==1.18.0.post1 42 | azureml-train-restclients-hyperdrive==1.18.0 43 | backcall==0.2.0 44 | backports.tempfile==1.0 45 | backports.weakref==1.0.post1 46 | backports.zoneinfo==0.2.1 47 | base58==2.1.1 48 | bleach==4.1.0 49 | blinker==1.4 50 | 
cached-property==1.5.2 51 | cachetools==4.1.1 52 | certifi==2020.6.20 53 | cffi==1.14.3 54 | charset-normalizer==2.0.6 55 | click==7.1.2 56 | cloudpickle==1.6.0 57 | cmdstanpy==0.9.5 58 | colorama==0.4.4 59 | configparser==3.7.4 60 | contextlib2==0.6.0.post1 61 | convertdate==2.3.2 62 | cryptography==3.3.2 63 | cycler==0.10.0 64 | Cython==0.29.26 65 | databricks-cli==0.16.2 66 | databricks-connect==7.3.30 67 | dateinfer==0.2.0 68 | debugpy==1.5.0 69 | decorator==5.1.0 70 | defusedxml==0.7.1 71 | dill==0.3.3 72 | distro==1.5.0 73 | docker==4.3.1 74 | dotnetcore2==2.1.19 75 | entrypoints==0.3 76 | ephem==4.1.3 77 | et-xmlfile==1.1.0 78 | Flask==1.0.3 79 | fusepy==3.0.1 80 | gast==0.3.3 81 | gitdb==4.0.9 82 | GitPython==3.1.24 83 | google-auth==1.23.0 84 | google-auth-oauthlib==0.4.2 85 | google-pasta==0.2.0 86 | grpcio==1.33.2 87 | gunicorn==19.9.0 88 | h11==0.12.0 89 | h5py==3.1.0 90 | hijri-converter==2.2.2 91 | holidays==0.11.3.1 92 | idna==3.2 93 | importlib-metadata==2.0.0 94 | imutils==0.5.3 95 | ipykernel==6.4.1 96 | ipython==7.31.1 97 | ipython-genutils==0.2.0 98 | ipywidgets==7.6.5 99 | isodate==0.6.0 100 | itsdangerous==1.1.0 101 | jedi==0.18.0 102 | jeepney==0.6.0 103 | Jinja2==2.11.3 104 | jmespath==0.10.0 105 | joblib==0.17.0 106 | json-logging-py==0.2 107 | json5==0.8.5 108 | jsonpickle==1.4.1 109 | jsonschema==4.0.1 110 | jupyter-client==7.0.6 111 | jupyter-core==4.8.1 112 | jupyterlab-pygments==0.1.2 113 | jupyterlab-widgets==1.0.2 114 | kaggle==1.5.12 115 | Keras-Applications==1.0.8 116 | Keras-Preprocessing==1.1.0 117 | kiwisolver==1.3.2 118 | kneed==0.7.0 119 | korean-lunar-calendar==0.2.1 120 | liac-arff==2.5.0 121 | LunarCalendar==0.0.9 122 | Markdown==3.3.3 123 | MarkupSafe==1.1.1 124 | matplotlib==3.4.3 125 | matplotlib-inline==0.1.3 126 | mistune==0.8.4 127 | msal==1.6.0 128 | msal-extensions==0.2.2 129 | msrest==0.6.21 130 | msrestazure==0.6.2 131 | nbclient==0.5.4 132 | nbconvert==6.2.0 133 | nbformat==5.1.3 134 | nbimporter==0.3.4 135 | ndg-httpsclient==0.5.1 136 | nest-asyncio==1.5.1 137 | notebook==6.4.10 138 | numpy==1.21.0 139 | oauthlib==3.1.0 140 | omegaconf==2.1.2 141 | opencv-python==4.3.0.36 142 | opencv-python-headless==4.3.0.36 143 | opendatasets==0.1.20 144 | openpyxl==3.0.9 145 | opt-einsum==3.3.0 146 | outcome==1.1.0 147 | packaging==21.2 148 | pandas==1.3.5 149 | pandasql==0.7.3 150 | pandocfilters==1.5.0 151 | parso==0.8.2 152 | pathspec==0.8.1 153 | patsy==0.5.2 154 | pep8==1.7.1 155 | pickleshare==0.7.5 156 | Pillow==9.0.1 157 | plotly==5.3.1 158 | portalocker==1.7.1 159 | prometheus-client==0.12.0 160 | prompt-toolkit==3.0.20 161 | protobuf==3.15.0 162 | py4j==0.10.9 163 | pyarrow==1.0.1 164 | pyasn1==0.4.8 165 | pyasn1-modules==0.2.8 166 | pycodestyle==2.7.0 167 | pycparser==2.20 168 | pydeck==0.7.1 169 | Pygments==2.10.0 170 | PyJWT==2.4.0 171 | PyMeeus==0.5.11 172 | pyodbc==4.0.32 173 | pyOpenSSL==19.1.0 174 | pyparsing==2.4.7 175 | pyrsistent==0.18.0 176 | pystan==2.19.1.1 177 | python-box==5.4.1 178 | python-dateutil==2.8.1 179 | python-slugify==6.1.1 180 | pytz==2020.4 181 | pytz-deprecation-shim==0.1.0.post0 182 | pywin32==301 183 | pywinpty==1.1.5 184 | PyYAML==6.0 185 | pyzmq==22.3.0 186 | repackage==0.7.3 187 | requests==2.26.0 188 | requests-oauthlib==1.3.0 189 | rsa==4.7 190 | ruamel.yaml==0.16.12 191 | ruamel.yaml.clib==0.2.2 192 | scikit-learn==0.22.2.post1 193 | scipy==1.4.1 194 | seaborn==0.11.2 195 | SecretStorage==3.2.0 196 | selenium==4.0.0 197 | Send2Trash==1.8.0 198 | setuptools-git==1.2 199 | Shapely==1.7.0 200 | six==1.15.0 
201 | sklearn==0.0 202 | smmap==5.0.0 203 | sniffio==1.2.0 204 | sortedcontainers==2.4.0 205 | statsmodels==0.13.1 206 | streamlit==1.1.0 207 | tabulate==0.8.9 208 | tenacity==8.0.1 209 | tensorboard==2.2.2 210 | tensorboard-plugin-wit==1.7.0 211 | tensorflow==2.7.2 212 | tensorflow-estimator==2.2.0 213 | tensorflow-gpu==2.7.2 214 | tensorflow-gpu-estimator==2.2.0 215 | termcolor==1.1.0 216 | terminado==0.12.1 217 | testpath==0.5.0 218 | text-unidecode==1.3 219 | toml==0.10.2 220 | toolz==0.11.1 221 | tornado==6.1 222 | tqdm==4.62.3 223 | traitlets==5.1.0 224 | trio==0.19.0 225 | trio-websocket==0.9.2 226 | typing-extensions==4.1.1 227 | tzdata==2021.5 228 | tzlocal==4.1 229 | ujson==5.2.0 230 | urllib3==1.26.7 231 | validators==0.18.2 232 | watchdog==2.1.6 233 | wcwidth==0.2.5 234 | webencodings==0.5.1 235 | websocket-client==0.57.0 236 | Werkzeug==1.0.1 237 | widgetsnbextension==3.5.2 238 | wincertstore==0.2 239 | wrapt==1.13.1 240 | wsproto==1.0.0 241 | xgboost==1.4.2 242 | xlrd==2.0.1 243 | XlsxWriter==3.0.1 244 | zipp==3.4.0 245 | --------------------------------------------------------------------------------
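
As a quick reference for the preparation steps carried out in the insurance-claims notebook above (daily resampling with per-column aggregation rules, a 0/1 holiday dummy, and weekday/month dummies), here is a minimal pandas-only sketch. The `ClaimId` id column and the toy values are made up for illustration, the country code `'US'` is an assumption, and the snippet only mimics the behaviour described for the toolkit's `Utils.resample_data`, `Regressors.add_holidays_by_country`, `Regressors.add_weekdays` and `Regressors.add_months` helpers; it is not their implementation.

    import holidays
    import pandas as pd

    # Toy frame shaped like the training data; column names other than ClaimId
    # come from the notebook, the values are invented for illustration.
    df = pd.DataFrame({
        'ClaimId': [1, 1, 1, 2],
        'Date': pd.to_datetime(['2021-01-01 08:00', '2021-01-01 17:00',
                                '2021-01-04 09:00', '2021-01-01 12:00']),
        'RmaRegionDesc': ['North', 'North', 'North', 'South'],
        'Product': ['A', 'A', 'A', 'B'],
        'Sum of PaidDaysValue': [1.0, 2.0, 3.0, 4.0],
    })

    # Per-column aggregation rules: categorical columns keep their first value,
    # the target is summed within each daily bucket.
    dict_grouping = {'RmaRegionDesc': 'first', 'Product': 'first', 'Sum of PaidDaysValue': 'sum'}

    # Daily resampling per id: group by the id column and a daily time bucket.
    df_resampled = (df.groupby(['ClaimId', pd.Grouper(key='Date', freq='D')])
                      .agg(dict_grouping)
                      .reset_index())

    # Holiday dummy variable (0/1) for the chosen country, using the holidays
    # package pinned in requirements.txt.
    us_holidays = holidays.CountryHoliday('US')
    df_resampled['holidays'] = df_resampled['Date'].dt.date.map(lambda d: int(d in us_holidays))

    # Calendar dummies named like the notebook's regressors
    # (wd_mon ... wd_sun, month_01 ... month_12).
    wd = pd.get_dummies(df_resampled['Date'].dt.day_name().str.lower().str[:3], prefix='wd')
    months = pd.get_dummies(df_resampled['Date'].dt.strftime('%m'), prefix='month')
    df_resampled = pd.concat([df_resampled, wd, months], axis=1)

    print(df_resampled.head())

Note that `pd.get_dummies` only creates columns for the categories actually present, so with the toy data above only a few `wd_*` and `month_*` columns appear; on a full year of data the complete set of weekday and month dummies is produced.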